diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 00000000..e69de29b diff --git a/cache.json b/cache.json new file mode 100644 index 00000000..df92a477 --- /dev/null +++ b/cache.json @@ -0,0 +1 @@ +{"2023-12-01T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2312.00784v1","updated":"2023-12-01T18:59:56Z","published":"2023-12-01T18:59:56Z","title":"Making Large Multimodal Models Understand Arbitrary Visual Prompts","summary":" While existing large vision-language multimodal models focus on whole image\nunderstanding, there is a prominent gap in achieving region-specific\ncomprehension. Current approaches that use textual coordinates or spatial\nencodings often fail to provide a user-friendly interface for visual prompting.\nTo address this challenge, we introduce a novel multimodal model capable of\ndecoding arbitrary visual prompts. This allows users to intuitively mark images\nand interact with the model using natural cues like a \"red bounding box\" or\n\"pointed arrow\". Our simple design directly overlays visual markers onto the\nRGB image, eliminating the need for complex region encodings, yet achieves\nstate-of-the-art performance on region-understanding tasks like Visual7W,\nPointQA, and Visual Commonsense Reasoning benchmark. Furthermore, we present\nViP-Bench, a comprehensive benchmark to assess the capability of models in\nunderstanding visual prompts across multiple dimensions, enabling future\nresearch in this domain. Code, data, and model are publicly available.\n","authors":["Mu Cai","Haotian Liu","Siva Karthik Mustikovela","Gregory P. Meyer","Yuning Chai","Dennis Park","Yong Jae Lee"],"pdf_url":"https://arxiv.org/pdf/2312.00784v1.pdf","comment":"Project page: https://vip-llava.github.io/"},{"id":"http://arxiv.org/abs/2312.00774v1","updated":"2023-12-01T18:53:51Z","published":"2023-12-01T18:53:51Z","title":"Context Retrieval via Normalized Contextual Latent Interaction for\n Conversational Agent","summary":" Conversational agents leveraging AI, particularly deep learning, are emerging\nin both academic research and real-world applications. However, these\napplications still face challenges, including disrespecting knowledge and\nfacts, not personalizing to user preferences, and enormous demand for\ncomputational resources during training and inference. Recent research efforts\nhave been focused on addressing these challenges from various aspects,\nincluding supplementing various types of auxiliary information to the\nconversational agents. However, existing methods are still not able to\neffectively and efficiently exploit relevant information from these auxiliary\nsupplements to further unleash the power of the conversational agents and the\nlanguage models they use. In this paper, we present a novel method, PK-NCLI,\nthat is able to accurately and efficiently identify relevant auxiliary\ninformation to improve the quality of conversational responses by learning the\nrelevance among persona, chat history, and knowledge background through\nlow-level normalized contextual latent interaction. Our experimental results\nindicate that PK-NCLI outperforms the state-of-the-art method, PK-FoCus, by\n47.80%/30.61%/24.14% in terms of perplexity, knowledge grounding, and training\nefficiency, respectively, and maintained the same level of persona grounding\nperformance. 
We also provide a detailed analysis of how different factors,\nincluding language model choices and trade-offs on training weights, would\naffect the performance of PK-NCLI.\n","authors":["Junfeng Liu","Zhuocheng Mei","Kewen Peng","Ranga Raju Vatsavai"],"pdf_url":"https://arxiv.org/pdf/2312.00774v1.pdf","comment":"2023 IEEE International Conference on Data Mining Workshops (ICDMW)"},{"id":"http://arxiv.org/abs/2309.16770v2","updated":"2023-12-01T18:45:12Z","published":"2023-09-28T18:07:01Z","title":"Persona-Coded Poly-Encoder: Persona-Guided Multi-Stream Conversational\n Sentence Scoring","summary":" Recent advances in machine learning and deep learning have led to the\nwidespread use of Conversational AI in many practical applications. However, it\nis still very challenging to leverage auxiliary information that can provide\nconversational context or personalized tuning to improve the quality of\nconversations. For example, there has only been limited research on using an\nindividuals persona information to improve conversation quality, and even\nstate-of-the-art conversational AI techniques are unable to effectively\nleverage signals from heterogeneous sources of auxiliary data, such as\nmulti-modal interaction data, demographics, SDOH data, etc. In this paper, we\npresent a novel Persona-Coded Poly-Encoder method that leverages persona\ninformation in a multi-stream encoding scheme to improve the quality of\nresponse generation for conversations. To show the efficacy of the proposed\nmethod, we evaluate our method on two different persona-based conversational\ndatasets, and compared against two state-of-the-art methods. Our experimental\nresults and analysis demonstrate that our method can improve conversation\nquality over the baseline method Poly-Encoder by 3.32% and 2.94% in terms of\nBLEU score and HR@1, respectively. More significantly, our method offers a path\nto better utilization of multi-modal data in conversational tasks. Lastly, our\nstudy outlines several challenges and future research directions for advancing\npersonalized conversational AI technology.\n","authors":["Junfeng Liu","Christopher Symons","Ranga Raju Vatsavai"],"pdf_url":"https://arxiv.org/pdf/2309.16770v2.pdf","comment":"The 35th IEEE International Conference on Tools with Artificial\n Intelligence (ICTAI)"},{"id":"http://arxiv.org/abs/2312.00763v1","updated":"2023-12-01T18:31:28Z","published":"2023-12-01T18:31:28Z","title":"Beyond ChatBots: ExploreLLM for Structured Thoughts and Personalized\n Model Responses","summary":" Large language model (LLM) powered chatbots are primarily text-based today,\nand impose a large interactional cognitive load, especially for exploratory or\nsensemaking tasks such as planning a trip or learning about a new city. Because\nthe interaction is textual, users have little scaffolding in the way of\nstructure, informational \"scent\", or ability to specify high-level preferences\nor goals. We introduce ExploreLLM that allows users to structure thoughts, help\nexplore different options, navigate through the choices and recommendations,\nand to more easily steer models to generate more personalized responses. We\nconduct a user study and show that users find it helpful to use ExploreLLM for\nexploratory or planning tasks, because it provides a useful schema-like\nstructure to the task, and guides users in planning. The study also suggests\nthat users can more easily personalize responses with high-level preferences\nwith ExploreLLM. 
Together, ExploreLLM points to a future where users interact\nwith LLMs beyond the form of chatbots, and instead designed to support complex\nuser tasks with a tighter integration between natural language and graphical\nuser interfaces.\n","authors":["Xiao Ma","Swaroop Mishra","Ariel Liu","Sophie Su","Jilin Chen","Chinmay Kulkarni","Heng-Tze Cheng","Quoc Le","Ed Chi"],"pdf_url":"https://arxiv.org/pdf/2312.00763v1.pdf","comment":"19 pages, 11 figures"},{"id":"http://arxiv.org/abs/2312.00751v1","updated":"2023-12-01T17:52:47Z","published":"2023-12-01T17:52:47Z","title":"Mitigating Over-smoothing in Transformers via Regularized Nonlocal\n Functionals","summary":" Transformers have achieved remarkable success in a wide range of natural\nlanguage processing and computer vision applications. However, the\nrepresentation capacity of a deep transformer model is degraded due to the\nover-smoothing issue in which the token representations become identical when\nthe model's depth grows. In this work, we show that self-attention layers in\ntransformers minimize a functional which promotes smoothness, thereby causing\ntoken uniformity. We then propose a novel regularizer that penalizes the norm\nof the difference between the smooth output tokens from self-attention and the\ninput tokens to preserve the fidelity of the tokens. Minimizing the resulting\nregularized energy functional, we derive the Neural Transformer with a\nRegularized Nonlocal Functional (NeuTRENO), a novel class of transformer models\nthat can mitigate the over-smoothing issue. We empirically demonstrate the\nadvantages of NeuTRENO over the baseline transformers and state-of-the-art\nmethods in reducing the over-smoothing of token representations on various\npractical tasks, including object classification, image segmentation, and\nlanguage modeling.\n","authors":["Tam Nguyen","Tan M. Nguyen","Richard G. Baraniuk"],"pdf_url":"https://arxiv.org/pdf/2312.00751v1.pdf","comment":"24 papes"},{"id":"http://arxiv.org/abs/2312.00738v1","updated":"2023-12-01T17:17:56Z","published":"2023-12-01T17:17:56Z","title":"SeaLLMs -- Large Language Models for Southeast Asia","summary":" Despite the remarkable achievements of large language models (LLMs) in\nvarious tasks, there remains a linguistic bias that favors high-resource\nlanguages, such as English, often at the expense of low-resource and regional\nlanguages. To address this imbalance, we introduce SeaLLMs, an innovative\nseries of language models that specifically focuses on Southeast Asian (SEA)\nlanguages. SeaLLMs are built upon the Llama-2 model and further advanced\nthrough continued pre-training with an extended vocabulary, specialized\ninstruction and alignment tuning to better capture the intricacies of regional\nlanguages. This allows them to respect and reflect local cultural norms,\ncustoms, stylistic preferences, and legal considerations. 
Our comprehensive\nevaluation demonstrates that SeaLLM-13b models exhibit superior performance\nacross a wide spectrum of linguistic tasks and assistant-style\ninstruction-following capabilities relative to comparable open-source models.\nMoreover, they outperform ChatGPT-3.5 in non-Latin languages, such as Thai,\nKhmer, Lao, and Burmese, by large margins while remaining lightweight and\ncost-effective to operate.\n","authors":["Xuan-Phi Nguyen","Wenxuan Zhang","Xin Li","Mahani Aljunied","Qingyu Tan","Liying Cheng","Guanzheng Chen","Yue Deng","Sen Yang","Chaoqun Liu","Hang Zhang","Lidong Bing"],"pdf_url":"https://arxiv.org/pdf/2312.00738v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2312.00688v1","updated":"2023-12-01T16:11:38Z","published":"2023-12-01T16:11:38Z","title":"Towards Transparency in Coreference Resolution: A Quantum-Inspired\n Approach","summary":" Guided by grammatical structure, words compose to form sentences, and guided\nby discourse structure, sentences compose to form dialogues and documents. The\ncompositional aspect of sentence and discourse units is often overlooked by\nmachine learning algorithms. A recent initiative called Quantum Natural\nLanguage Processing (QNLP) learns word meanings as points in a Hilbert space\nand acts on them via a translation of grammatical structure into Parametrised\nQuantum Circuits (PQCs). Previous work extended the QNLP translation to\ndiscourse structure using points in a closure of Hilbert spaces. In this paper,\nwe evaluate this translation on a Winograd-style pronoun resolution task. We\ntrain a Variational Quantum Classifier (VQC) for binary classification and\nimplement an end-to-end pronoun resolution system. The simulations executed on\nIBMQ software converged with an F1 score of 87.20%. The model outperformed two\nout of three classical coreference resolution systems and neared\nstate-of-the-art SpanBERT. A mixed quantum-classical model yet improved these\nresults with an F1 score increase of around 6%.\n","authors":["Hadi Wazni","Mehrnoosh Sadrzadeh"],"pdf_url":"https://arxiv.org/pdf/2312.00688v1.pdf","comment":"CRAC 2023, the Sixth Workshop on Computational Models of Reference,\n Anaphora and Coreference, EMNLP 2023"},{"id":"http://arxiv.org/abs/2312.00680v1","updated":"2023-12-01T16:04:00Z","published":"2023-12-01T16:04:00Z","title":"Contextualized word senses: from attention to compositionality","summary":" The neural architectures of language models are becoming increasingly\ncomplex, especially that of Transformers, based on the attention mechanism.\nAlthough their application to numerous natural language processing tasks has\nproven to be very fruitful, they continue to be models with little or no\ninterpretability and explainability. One of the tasks for which they are best\nsuited is the encoding of the contextual sense of words using contextualized\nembeddings. In this paper we propose a transparent, interpretable, and\nlinguistically motivated strategy for encoding the contextual sense of words by\nmodeling semantic compositionality. Particular attention is given to dependency\nrelations and semantic notions such as selection preferences and paradigmatic\nclasses. A partial implementation of the proposed model is carried out and\ncompared with Transformer-based architectures for a given semantic task, namely\nthe similarity calculation of word senses in context. 
The results obtained show\nthat it is possible to be competitive with linguistically motivated models\ninstead of using the black boxes underlying complex neural architectures.\n","authors":["Pablo Gamallo"],"pdf_url":"https://arxiv.org/pdf/2312.00680v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00678v1","updated":"2023-12-01T16:00:25Z","published":"2023-12-01T16:00:25Z","title":"The Efficiency Spectrum of Large Language Models: An Algorithmic Survey","summary":" The rapid growth of Large Language Models (LLMs) has been a driving force in\ntransforming various domains, reshaping the artificial general intelligence\nlandscape. However, the increasing computational and memory demands of these\nmodels present substantial challenges, hindering both academic research and\npractical applications. To address these issues, a wide array of methods,\nincluding both algorithmic and hardware solutions, have been developed to\nenhance the efficiency of LLMs. This survey delivers a comprehensive review of\nalgorithmic advancements aimed at improving LLM efficiency. Unlike other\nsurveys that typically focus on specific areas such as training or model\ncompression, this paper examines the multi-faceted dimensions of efficiency\nessential for the end-to-end algorithmic development of LLMs. Specifically, it\ncovers various topics related to efficiency, including scaling laws, data\nutilization, architectural innovations, training and tuning strategies, and\ninference techniques. This paper aims to serve as a valuable resource for\nresearchers and practitioners, laying the groundwork for future innovations in\nthis critical research area. Our repository of relevant references is\nmaintained at url{https://github.com/tding1/Efficient-LLM-Survey}.\n","authors":["Tianyu Ding","Tianyi Chen","Haidong Zhu","Jiachen Jiang","Yiqi Zhong","Jinxin Zhou","Guangzhi Wang","Zhihui Zhu","Ilya Zharkov","Luming Liang"],"pdf_url":"https://arxiv.org/pdf/2312.00678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00662v1","updated":"2023-12-01T15:40:30Z","published":"2023-12-01T15:40:30Z","title":"Nonparametric Variational Regularisation of Pretrained Transformers","summary":" The current paradigm of large-scale pre-training and fine-tuning Transformer\nlarge language models has lead to significant improvements across the board in\nnatural language processing. However, such large models are susceptible to\noverfitting to their training data, and as a result the models perform poorly\nwhen the domain changes. Also, due to the model's scale, the cost of\nfine-tuning the model to the new domain is large. Nonparametric Variational\nInformation Bottleneck (NVIB) has been proposed as a regulariser for training\ncross-attention in Transformers, potentially addressing the overfitting\nproblem. We extend the NVIB framework to replace all types of attention\nfunctions in Transformers, and show that existing pretrained Transformers can\nbe reinterpreted as Nonparametric Variational (NV) models using a proposed\nidentity initialisation. We then show that changing the initialisation\nintroduces a novel, information-theoretic post-training regularisation in the\nattention mechanism, which improves out-of-domain generalisation without any\ntraining. 
This success supports the hypothesis that pretrained Transformers are\nimplicitly NV Bayesian models.\n","authors":["Fabio Fehr","James Henderson"],"pdf_url":"https://arxiv.org/pdf/2312.00662v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18765v2","updated":"2023-12-01T15:38:31Z","published":"2023-11-30T18:05:52Z","title":"MLLMs-Augmented Visual-Language Representation Learning","summary":" Visual-language pre-training (VLP) has achieved remarkable success in\nmulti-modal tasks, largely attributed to the availability of large-scale\nimage-text datasets. In this work, we demonstrate that multi-modal large\nlanguage models (MLLMs) can enhance visual-language representation learning by\nimproving data quality. Our approach is simple, utilizing MLLMs to extend\nmultiple captions for each image. To prevent the bias introduced by MLLMs'\nhallucinations and intrinsic caption styles, we propose \"text shearing\" to\nmaintain the same length for extended captions as that of the original\ncaptions. In image-text retrieval, our method consistently obtains 5.6 ~ 35.0%\nand 16.8 ~ 46.1% improvement on R@1 under the fine-tuning and zero-shot\nsettings, respectively. Notably, we obtain zero-shot results that are\ncomparable to fine-tuning on target datasets, which encourages more exploration\nof the versatile use of MLLMs.\n","authors":["Yanqing Liu","Kai Wang","Wenqi Shao","Ping Luo","Yu Qiao","Mike Zheng Shou","Kaipeng Zhang","Yang You"],"pdf_url":"https://arxiv.org/pdf/2311.18765v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.03687v2","updated":"2023-12-01T15:37:07Z","published":"2023-11-07T03:25:56Z","title":"Dissecting the Runtime Performance of the Training, Fine-tuning, and\n Inference of Large Language Models","summary":" Large Language Models (LLMs) have seen great advance in both academia and\nindustry, and their popularity results in numerous open-source frameworks and\ntechniques in accelerating LLM pre-training, fine-tuning, and inference.\nTraining and deploying LLMs are expensive as it requires considerable computing\nresources and memory, hence many efficient approaches have been developed for\nimproving system pipelines as well as operators. However, the runtime\nperformance can vary significantly across hardware and software stacks, which\nmakes it difficult to choose the best configuration. In this work, we aim to\nbenchmark the performance from both macro and micro perspectives. First, we\nbenchmark the end-to-end performance of pre-training, fine-tuning, and serving\nLLMs in different sizes , i.e., 7, 13, and 70 billion parameters (7B, 13B, and\n70B) on three 8-GPU platforms with and without individual optimization\ntechniques, including ZeRO, quantization, recomputation, FlashAttention. Then,\nwe dive deeper to provide a detailed runtime analysis of the sub-modules,\nincluding computing and communication operators in LLMs. For end users, our\nbenchmark and findings help better understand different optimization\ntechniques, training and inference frameworks, together with hardware platforms\nin choosing configurations for deploying LLMs. 
For researchers, our in-depth\nmodule-wise analyses discover potential opportunities for future work to\nfurther optimize the runtime performance of LLMs.\n","authors":["Longteng Zhang","Xiang Liu","Zeyu Li","Xinglin Pan","Peijie Dong","Ruibo Fan","Rui Guo","Xin Wang","Qiong Luo","Shaohuai Shi","Xiaowen Chu"],"pdf_url":"https://arxiv.org/pdf/2311.03687v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.03977v2","updated":"2023-12-01T15:16:19Z","published":"2023-05-06T08:43:33Z","title":"An Adversarial Non-Autoregressive Model for Text Generation with\n Incomplete Information","summary":" Non-autoregressive models have been widely studied in the Complete\nInformation Scenario (CIS), in which the input has complete information of\ncorresponding output. However, their explorations in the Incomplete Information\nScenario (IIS) are extremely limited. Our analyses reveal that the IIS's\nincomplete input information will augment the inherent limitations of existing\nnon-autoregressive models trained under Maximum Likelihood Estimation. In this\npaper, we propose for the IIS an Adversarial Non-autoregressive Transformer\n(ANT) which has two features: 1) Position-Aware Self-Modulation to provide more\nreasonable hidden representations, and 2) Dependency Feed Forward Network to\nstrengthen its capacity in dependency modeling. We compare ANT with other\nmainstream models in the IIS and demonstrate that ANT can achieve comparable\nperformance with much fewer decoding iterations. Furthermore, we show its great\npotential in various applications like latent interpolation and semi-supervised\nlearning.\n","authors":["Da Ren","Yi Cai","Qing Li"],"pdf_url":"https://arxiv.org/pdf/2305.03977v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.06767v4","updated":"2023-12-01T14:28:06Z","published":"2023-04-13T18:22:40Z","title":"RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment","summary":" Generative foundation models are susceptible to implicit biases that can\narise from extensive unsupervised training data. Such biases can produce\nsuboptimal samples, skewed outcomes, and unfairness, with potentially serious\nconsequences. Consequently, aligning these models with human ethics and\npreferences is an essential step toward ensuring their responsible and\neffective deployment in real-world applications. Prior research has primarily\nemployed Reinforcement Learning from Human Feedback (RLHF) to address this\nproblem, where generative models are fine-tuned with RL algorithms guided by a\nhuman-feedback-informed reward model. However, the inefficiencies and\ninstabilities associated with RL algorithms frequently present substantial\nobstacles to the successful alignment, necessitating the development of a more\nrobust and streamlined approach. To this end, we introduce a new framework,\nReward rAnked FineTuning (RAFT), designed to align generative models\neffectively. Utilizing a reward model and a sufficient number of samples, our\napproach selects the high-quality samples, discarding those that exhibit\nundesired behavior, and subsequently enhancing the model by fine-tuning on\nthese filtered samples. 
Our studies show that RAFT can effectively improve the\nmodel performance in both reward learning and other automated metrics in both\nlarge language models and diffusion models.\n","authors":["Hanze Dong","Wei Xiong","Deepanshu Goyal","Yihan Zhang","Winnie Chow","Rui Pan","Shizhe Diao","Jipeng Zhang","Kashun Shum","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2304.06767v4.pdf","comment":"29 pages, 12 figures, Published in Transactions on Machine Learning\n Research (TMLR)"},{"id":"http://arxiv.org/abs/2312.00584v1","updated":"2023-12-01T13:48:46Z","published":"2023-12-01T13:48:46Z","title":"The Ethics of Automating Legal Actors","summary":" The introduction of large public legal datasets has brought about a\nrenaissance in legal NLP. Many of these datasets are comprised of legal\njudgements - the product of judges deciding cases. This fact, together with the\nway machine learning works, means that several legal NLP models are models of\njudges. While some have argued for the automation of judges, in this position\npiece, we argue that automating the role of the judge raises difficult ethical\nchallenges, in particular for common law legal systems. Our argument follows\nfrom the social role of the judge in actively shaping the law, rather than\nmerely applying it. Since current NLP models come nowhere close to having the\nfacilities necessary for this task, they should not be used to automate judges.\nFurthermore, even in the case the models could achieve human-level\ncapabilities, there would still be remaining ethical concerns inherent in the\nautomation of the legal process.\n","authors":["Josef Valvoda","Alec Thompson","Ryan Cotterell","Simone Teufel"],"pdf_url":"https://arxiv.org/pdf/2312.00584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00575v1","updated":"2023-12-01T13:31:02Z","published":"2023-12-01T13:31:02Z","title":"Instruction-tuning Aligns LLMs to the Human Brain","summary":" Instruction-tuning is a widely adopted method of finetuning that enables\nlarge language models (LLMs) to generate output that more closely resembles\nhuman responses to natural language queries, in many cases leading to\nhuman-level performance on diverse testbeds. However, it remains unclear\nwhether instruction-tuning truly makes LLMs more similar to how humans process\nlanguage. We investigate the effect of instruction-tuning on LLM-human\nsimilarity in two ways: (1) brain alignment, the similarity of LLM internal\nrepresentations to neural activity in the human language system, and (2)\nbehavioral alignment, the similarity of LLM and human behavior on a reading\ntask. We assess 25 vanilla and instruction-tuned LLMs across three datasets\ninvolving humans reading naturalistic stories and sentences. We discover that\ninstruction-tuning generally enhances brain alignment by an average of 6%, but\ndoes not have a similar effect on behavioral alignment. To identify the factors\nunderlying LLM-brain alignment, we compute correlations between the brain\nalignment of LLMs and various model properties, such as model size, various\nproblem-solving abilities, and performance on tasks requiring world knowledge\nspanning various domains. Notably, we find a strong positive correlation\nbetween brain alignment and model size (r = 0.95), as well as performance on\ntasks requiring world knowledge (r = 0.81). 
Our results demonstrate that\ninstruction-tuning LLMs improves both world knowledge representations and brain\nalignment, suggesting that mechanisms that encode world knowledge in LLMs also\nimprove representational alignment to the human brain.\n","authors":["Khai Loong Aw","Syrielle Montariol","Badr AlKhamissi","Martin Schrimpf","Antoine Bosselut"],"pdf_url":"https://arxiv.org/pdf/2312.00575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19103v3","updated":"2023-12-01T13:25:03Z","published":"2023-05-30T15:06:28Z","title":"Does Conceptual Representation Require Embodiment? Insights From Large\n Language Models","summary":" To what extent can language alone give rise to complex concepts, or is\nembodied experience essential? Recent advancements in large language models\n(LLMs) offer fresh perspectives on this question. Although LLMs are trained on\nrestricted modalities, they exhibit human-like performance in diverse\npsychological tasks. Our study compared representations of 4,442 lexical\nconcepts between humans and ChatGPTs (GPT-3.5 and GPT-4) across multiple\ndimensions, including five key domains: emotion, salience, mental\nvisualization, sensory, and motor experience. We identify two main findings: 1)\nBoth models strongly align with human representations in non-sensorimotor\ndomains but lag in sensory and motor areas, with GPT-4 outperforming GPT-3.5;\n2) GPT-4's gains are associated with its additional visual learning, which also\nappears to benefit related dimensions like haptics and imageability. These\nresults highlight the limitations of language in isolation, and that the\nintegration of diverse modalities of inputs leads to a more human-like\nconceptual representation.\n","authors":["Qihui Xu","Yingying Peng","Samuel A. Nastase","Martin Chodorow","Minghua Wu","Ping Li"],"pdf_url":"https://arxiv.org/pdf/2305.19103v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00567v1","updated":"2023-12-01T13:22:35Z","published":"2023-12-01T13:22:35Z","title":"Explanatory Argument Extraction of Correct Answers in Resident Medical\n Exams","summary":" Developing the required technology to assist medical experts in their\neveryday activities is currently a hot topic in the Artificial Intelligence\nresearch field. Thus, a number of large language models (LLMs) and automated\nbenchmarks have recently been proposed with the aim of facilitating information\nextraction in Evidence-Based Medicine (EBM) using natural language as a tool\nfor mediating in human-AI interaction. The most representative benchmarks are\nlimited to either multiple-choice or long-form answers and are available only\nin English. In order to address these shortcomings, in this paper we present a\nnew dataset which, unlike previous work: (i) includes not only explanatory\narguments for the correct answer, but also arguments to reason why the\nincorrect answers are not correct; (ii) the explanations are written originally\nby medical doctors to answer questions from the Spanish Residency Medical\nExams. Furthermore, this new benchmark allows us to setup a novel extractive\ntask which consists of identifying the explanation of the correct answer\nwritten by medical doctors. 
An additional benefit of our setting is that we can\nleverage the extractive QA paradigm to automatically evaluate performance of\nLLMs without resorting to costly manual evaluation by medical experts.\nComprehensive experimentation with language models for Spanish shows that\nsometimes multilingual models fare better than monolingual ones, even\noutperforming models which have been adapted to the medical domain.\nFurthermore, results across the monolingual models are mixed, with supposedly\nsmaller and inferior models performing competitively. In any case, the obtained\nresults show that our novel dataset and approach can be an effective technique\nto help medical practitioners in identifying relevant evidence-based\nexplanations for medical questions.\n","authors":["Iakes Goenaga","Aitziber Atutxa","Koldo Gojenola","Maite Oronoz","Rodrigo Agerri"],"pdf_url":"https://arxiv.org/pdf/2312.00567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00554v1","updated":"2023-12-01T13:00:45Z","published":"2023-12-01T13:00:45Z","title":"Questioning Biases in Case Judgment Summaries: Legal Datasets or Large\n Language Models?","summary":" The evolution of legal datasets and the advent of large language models\n(LLMs) have significantly transformed the legal field, particularly in the\ngeneration of case judgment summaries. However, a critical concern arises\nregarding the potential biases embedded within these summaries. This study\nscrutinizes the biases present in case judgment summaries produced by legal\ndatasets and large language models. The research aims to analyze the impact of\nbiases on legal decision making. By interrogating the accuracy, fairness, and\nimplications of biases in these summaries, this study contributes to a better\nunderstanding of the role of technology in legal contexts and the implications\nfor justice systems worldwide. In this study, we investigate biases wrt\nGender-related keywords, Race-related keywords, Keywords related to crime\nagainst women, Country names and religious keywords. The study shows\ninteresting evidences of biases in the outputs generated by the large language\nmodels and pre-trained abstractive summarization models. The reasoning behind\nthese biases needs further studies.\n","authors":["Aniket Deroy","Subhankar Maity"],"pdf_url":"https://arxiv.org/pdf/2312.00554v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00552v1","updated":"2023-12-01T12:59:32Z","published":"2023-12-01T12:59:32Z","title":"Improving Unsupervised Relation Extraction by Augmenting Diverse\n Sentence Pairs","summary":" Unsupervised relation extraction (URE) aims to extract relations between\nnamed entities from raw text without requiring manual annotations or\npre-existing knowledge bases. In recent studies of URE, researchers put a\nnotable emphasis on contrastive learning strategies for acquiring relation\nrepresentations. However, these studies often overlook two important aspects:\nthe inclusion of diverse positive pairs for contrastive learning and the\nexploration of appropriate loss functions. In this paper, we propose AugURE\nwith both within-sentence pairs augmentation and augmentation through\ncross-sentence pairs extraction to increase the diversity of positive pairs and\nstrengthen the discriminative power of contrastive learning. 
We also identify\nthe limitation of noise-contrastive estimation (NCE) loss for relation\nrepresentation learning and propose to apply margin loss for sentence pairs.\nExperiments on NYT-FB and TACRED datasets demonstrate that the proposed\nrelation representation learning and a simple K-Means clustering achieves\nstate-of-the-art performance.\n","authors":["Qing Wang","Kang Zhou","Qiao Qiao","Yuepei Li","Qi Li"],"pdf_url":"https://arxiv.org/pdf/2312.00552v1.pdf","comment":"Accepted by EMNLP 2023 Main Conference"},{"id":"http://arxiv.org/abs/2211.01722v3","updated":"2023-12-01T12:54:42Z","published":"2022-11-03T11:23:36Z","title":"H_eval: A new hybrid evaluation metric for automatic speech recognition\n tasks","summary":" Many studies have examined the shortcomings of word error rate (WER) as an\nevaluation metric for automatic speech recognition (ASR) systems. Since WER\nconsiders only literal word-level correctness, new evaluation metrics based on\nsemantic similarity such as semantic distance (SD) and BERTScore have been\ndeveloped. However, we found that these metrics have their own limitations,\nsuch as a tendency to overly prioritise keywords. We propose H_eval, a new\nhybrid evaluation metric for ASR systems that considers both semantic\ncorrectness and error rate and performs significantly well in scenarios where\nWER and SD perform poorly. Due to lighter computation compared to BERTScore, it\noffers 49 times reduction in metric computation time. Furthermore, we show that\nH_eval correlates strongly with downstream NLP tasks. Also, to reduce the\nmetric calculation time, we built multiple fast and lightweight models using\ndistillation techniques\n","authors":["Zitha Sasindran","Harsha Yelchuri","T. V. Prabhakar","Supreeth Rao"],"pdf_url":"https://arxiv.org/pdf/2211.01722v3.pdf","comment":"Accepted in ASRU 2023"},{"id":"http://arxiv.org/abs/2312.00536v1","updated":"2023-12-01T12:15:58Z","published":"2023-12-01T12:15:58Z","title":"Trained MT Metrics Learn to Cope with Machine-translated References","summary":" Neural metrics trained on human evaluations of MT tend to correlate well with\nhuman judgments, but their behavior is not fully understood. In this paper, we\nperform a controlled experiment and compare a baseline metric that has not been\ntrained on human evaluations (Prism) to a trained version of the same metric\n(Prism+FT). Surprisingly, we find that Prism+FT becomes more robust to\nmachine-translated references, which are a notorious problem in MT evaluation.\nThis suggests that the effects of metric training go beyond the intended effect\nof improving overall correlation with human judgments.\n","authors":["Jannis Vamvas","Tobias Domhan","Sony Trenous","Rico Sennrich","Eva Hasler"],"pdf_url":"https://arxiv.org/pdf/2312.00536v1.pdf","comment":"WMT 2023"},{"id":"http://arxiv.org/abs/2312.00525v1","updated":"2023-12-01T12:01:04Z","published":"2023-12-01T12:01:04Z","title":"SurreyAI 2023 Submission for the Quality Estimation Shared Task","summary":" Quality Estimation (QE) systems are important in situations where it is\nnecessary to assess the quality of translations, but there is no reference\navailable. This paper describes the approach adopted by the SurreyAI team for\naddressing the Sentence-Level Direct Assessment shared task in WMT23. The\nproposed approach builds upon the TransQuest framework, exploring various\nautoencoder pre-trained language models within the MonoTransQuest architecture\nusing single and ensemble settings. 
The autoencoder pre-trained language models\nemployed in the proposed systems are XLMV, InfoXLM-large, and XLMR-large. The\nevaluation utilizes Spearman and Pearson correlation coefficients, assessing\nthe relationship between machine-predicted quality scores and human judgments\nfor 5 language pairs (English-Gujarati, English-Hindi, English-Marathi,\nEnglish-Tamil and English-Telugu). The MonoTQ-InfoXLM-large approach emerges as\na robust strategy, surpassing all other individual models proposed in this\nstudy by significantly improving over the baseline for the majority of the\nlanguage pairs.\n","authors":["Archchana Sindhujan","Diptesh Kanojia","Constantin Orasan","Tharindu Ranasinghe"],"pdf_url":"https://arxiv.org/pdf/2312.00525v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00513v1","updated":"2023-12-01T11:34:37Z","published":"2023-12-01T11:34:37Z","title":"Summarization-based Data Augmentation for Document Classification","summary":" Despite the prevalence of pretrained language models in natural language\nunderstanding tasks, understanding lengthy text such as document is still\nchallenging due to the data sparseness problem. Inspired by that humans develop\ntheir ability of understanding lengthy text from reading shorter text, we\npropose a simple yet effective summarization-based data augmentation, SUMMaug,\nfor document classification. We first obtain easy-to-learn examples for the\ntarget document classification task by summarizing the input of the original\ntraining examples, while optionally merging the original labels to conform to\nthe summarized input. We then use the generated pseudo examples to perform\ncurriculum learning. Experimental results on two datasets confirmed the\nadvantage of our method compared to existing baseline methods in terms of\nrobustness and accuracy. We release our code and data at\nhttps://github.com/etsurin/summaug.\n","authors":["Yueguan Wang","Naoki Yoshinaga"],"pdf_url":"https://arxiv.org/pdf/2312.00513v1.pdf","comment":"The 4th New Frontiers in Summarization (with LLMs) Workshop"},{"id":"http://arxiv.org/abs/2305.12191v2","updated":"2023-12-01T10:37:56Z","published":"2023-05-20T13:34:34Z","title":"Pointwise Mutual Information Based Metric and Decoding Strategy for\n Faithful Generation in Document Grounded Dialogs","summary":" A major concern in using deep learning based generative models for\ndocument-grounded dialogs is the potential generation of responses that are not\n\\textit{faithful} to the underlying document. Existing automated metrics used\nfor evaluating the faithfulness of response with respect to the grounding\ndocument measure the degree of similarity between the generated response and\nthe document's content. However, these automated metrics are far from being\nwell aligned with human judgments. Therefore, to improve the measurement of\nfaithfulness, we propose a new metric that utilizes (Conditional) Point-wise\nMutual Information (PMI) between the generated response and the source\ndocument, conditioned on the dialogue. PMI quantifies the extent to which the\ndocument influences the generated response -- with a higher PMI indicating a\nmore faithful response. We build upon this idea to create a new decoding\ntechnique that incorporates PMI into the response generation process to predict\nmore faithful responses. Our experiments on the BEGIN benchmark demonstrate an\nimproved correlation of our metric with human evaluation. 
We also show that our\ndecoding technique is effective in generating more faithful responses when\ncompared to standard decoding techniques on a set of publicly available\ndocument-grounded dialog datasets.\n","authors":["Yatin Nandwani","Vineet Kumar","Dinesh Raghu","Sachindra Joshi","Luis A. Lastras"],"pdf_url":"https://arxiv.org/pdf/2305.12191v2.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2312.00480v1","updated":"2023-12-01T10:23:15Z","published":"2023-12-01T10:23:15Z","title":"Japanese Tort-case Dataset for Rationale-supported Legal Judgment\n Prediction","summary":" This paper presents the first dataset for Japanese Legal Judgment Prediction\n(LJP), the Japanese Tort-case Dataset (JTD), which features two tasks: tort\nprediction and its rationale extraction. The rationale extraction task\nidentifies the court's accepting arguments from alleged arguments by plaintiffs\nand defendants, which is a novel task in the field. JTD is constructed based on\nannotated 3,477 Japanese Civil Code judgments by 41 legal experts, resulting in\n7,978 instances with 59,697 of their alleged arguments from the involved\nparties. Our baseline experiments show the feasibility of the proposed two\ntasks, and our error analysis by legal experts identifies sources of errors and\nsuggests future directions of the LJP research.\n","authors":["Hiroaki Yamada","Takenobu Tokunaga","Ryutaro Ohara","Akira Tokutsu","Keisuke Takeshita","Mihoko Sumida"],"pdf_url":"https://arxiv.org/pdf/2312.00480v1.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.00413v1","updated":"2023-12-01T08:37:27Z","published":"2023-12-01T08:37:27Z","title":"Abstract Syntax Tree for Programming Language Understanding and\n Representation: How Far Are We?","summary":" Programming language understanding and representation (a.k.a code\nrepresentation learning) has always been a hot and challenging task in software\nengineering. It aims to apply deep learning techniques to produce numerical\nrepresentations of the source code features while preserving its semantics.\nThese representations can be used for facilitating subsequent code-related\ntasks. The abstract syntax tree (AST), a fundamental code feature, illustrates\nthe syntactic information of the source code and has been widely used in code\nrepresentation learning. However, there is still a lack of systematic and\nquantitative evaluation of how well AST-based code representation facilitates\nsubsequent code-related tasks. In this paper, we first conduct a comprehensive\nempirical study to explore the effectiveness of the AST-based code\nrepresentation in facilitating follow-up code-related tasks. To do so, we\ncompare the performance of models trained with code token sequence (Token for\nshort) based code representation and AST-based code representation on three\npopular types of code-related tasks. Surprisingly, the overall quantitative\nstatistical results demonstrate that models trained with AST-based code\nrepresentation consistently perform worse across all three tasks compared to\nmodels trained with Token-based code representation. Our further quantitative\nanalysis reveals that models trained with AST-based code representation\noutperform models trained with Token-based code representation in certain\nsubsets of samples across all three tasks. We also conduct comprehensive\nexperiments to evaluate and reveal the impact of the choice of AST\nparsing/preprocessing/encoding methods on AST-based code representation and\nsubsequent code-related tasks. 
Our study provides future researchers with\ndetailed guidance on how to select solutions at each stage to fully exploit\nAST.\n","authors":["Weisong Sun","Chunrong Fang","Yun Miao","Yudu You","Mengzhe Yuan","Yuchen Chen","Quanjun Zhang","An Guo","Xiang Chen","Yang Liu","Zhenyu Chen"],"pdf_url":"https://arxiv.org/pdf/2312.00413v1.pdf","comment":"submitted to ACM Transactions on Software Engineering and\n Methodology. arXiv admin note: text overlap with arXiv:2103.10668 by other\n authors"},{"id":"http://arxiv.org/abs/2312.00407v1","updated":"2023-12-01T08:02:16Z","published":"2023-12-01T08:02:16Z","title":"CoLLiE: Collaborative Training of Large Language Models in an Efficient\n Way","summary":" Large language models (LLMs) are increasingly pivotal in a wide range of\nnatural language processing tasks. Access to pre-trained models, courtesy of\nthe open-source community, has made it possible to adapt these models to\nspecific applications for enhanced performance. However, the substantial\nresources required for training these models necessitate efficient solutions.\nThis paper introduces CoLLiE, an efficient library that facilitates\ncollaborative training of large language models using 3D parallelism,\nparameter-efficient fine-tuning (PEFT) methods, and optimizers such as Lion,\nAdan, Sophia, LOMO and AdaLomo. With its modular design and comprehensive\nfunctionality, CoLLiE offers a balanced blend of efficiency, ease of use, and\ncustomization. CoLLiE has proven superior training efficiency in comparison\nwith prevalent solutions in pre-training and fine-tuning scenarios.\nFurthermore, we provide an empirical evaluation of the correlation between\nmodel size and GPU memory consumption under different optimization methods, as\nwell as an analysis of the throughput. Lastly, we carry out a comprehensive\ncomparison of various optimizers and PEFT methods within the instruction-tuning\ncontext. CoLLiE is available at https://github.com/OpenLMLab/collie.\n","authors":["Kai Lv","Shuo Zhang","Tianle Gu","Shuhao Xing","Jiawei Hong","Keyu Chen","Xiaoran Liu","Yuqing Yang","Honglin Guo","Tengxiao Liu","Yu Sun","Qipeng Guo","Hang Yan","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2312.00407v1.pdf","comment":"To appear at EMNLP 2023 Demo; Code is available at\n https://github.com/OpenLMLab/collie"},{"id":"http://arxiv.org/abs/2308.16911v2","updated":"2023-12-01T07:55:16Z","published":"2023-08-31T17:59:46Z","title":"PointLLM: Empowering Large Language Models to Understand Point Clouds","summary":" The unprecedented advancements in Large Language Models (LLMs) have shown a\nprofound impact on natural language processing but are yet to fully embrace the\nrealm of 3D understanding. This paper introduces PointLLM, a preliminary effort\nto fill this gap, enabling LLMs to understand point clouds and offering a new\navenue beyond 2D visual data. PointLLM understands colored object point clouds\nwith human instructions and generates contextually appropriate responses,\nillustrating its grasp of point clouds and common sense. Specifically, it\nleverages a point cloud encoder with a powerful LLM to effectively fuse\ngeometric, appearance, and linguistic information. We collect a novel dataset\ncomprising 660K simple and 70K complex point-text instruction pairs to enable a\ntwo-stage training strategy: aligning latent spaces and subsequently\ninstruction-tuning the unified model. 
To rigorously evaluate the perceptual and\ngeneralization capabilities of PointLLM, we establish two benchmarks:\nGenerative 3D Object Classification and 3D Object Captioning, assessed through\nthree different methods, including human evaluation, GPT-4/ChatGPT evaluation,\nand traditional metrics. Experimental results reveal PointLLM's superior\nperformance over existing 2D and 3D baselines, with a notable achievement in\nhuman-evaluated object captioning tasks where it surpasses human annotators in\nover 50% of the samples. Codes, datasets, and benchmarks are available at\nhttps://github.com/OpenRobotLab/PointLLM .\n","authors":["Runsen Xu","Xiaolong Wang","Tai Wang","Yilun Chen","Jiangmiao Pang","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2308.16911v2.pdf","comment":"28 pages. Empowering large language models with 3D point cloud\n understanding, accompanied by a novel dataset and carefully designed\n benchmarks. Project page: https://runsenxu.com/projects/PointLLM"},{"id":"http://arxiv.org/abs/2309.01885v2","updated":"2023-12-01T07:04:05Z","published":"2023-09-05T01:39:09Z","title":"QuantEase: Optimization-based Quantization for Language Models","summary":" With the rising popularity of Large Language Models (LLMs), there has been an\nincreasing interest in compression techniques that enable their efficient\ndeployment. This study focuses on the Post-Training Quantization (PTQ) of LLMs.\nDrawing from recent advances, our work introduces QuantEase, a layer-wise\nquantization framework where individual layers undergo separate quantization.\nThe problem is framed as a discrete-structured non-convex optimization,\nprompting the development of algorithms rooted in Coordinate Descent (CD)\ntechniques. These CD-based methods provide high-quality solutions to the\ncomplex non-convex layer-wise quantization problems. Notably, our CD-based\napproach features straightforward updates, relying solely on matrix and vector\noperations, circumventing the need for matrix inversion or decomposition. We\nalso explore an outlier-aware variant of our approach, allowing for retaining\nsignificant weights (outliers) with complete precision. Our proposal attains\nstate-of-the-art performance in terms of perplexity and zero-shot accuracy in\nempirical evaluations across various LLMs and datasets, with relative\nimprovements up to 15% over methods such as GPTQ. Leveraging careful linear\nalgebra optimizations, QuantEase can quantize models like Falcon-180B on a\nsingle NVIDIA A100 GPU in $\\sim$3 hours. Particularly noteworthy is our\noutlier-aware algorithm's capability to achieve near or sub-3-bit quantization\nof LLMs with an acceptable drop in accuracy, obviating the need for non-uniform\nquantization or grouping techniques, improving upon methods such as SpQR by up\nto two times in terms of perplexity.\n","authors":["Kayhan Behdin","Ayan Acharya","Aman Gupta","Qingquan Song","Siyu Zhu","Sathiya Keerthi","Rahul Mazumder"],"pdf_url":"https://arxiv.org/pdf/2309.01885v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10847v2","updated":"2023-12-01T06:38:18Z","published":"2023-11-17T20:07:54Z","title":"Token-Level Adaptation of LoRA Adapters for Downstream Task\n Generalization","summary":" This paper introduces a method for adapting LoRA adapters in smaller-sized\nlanguage models to arbitrary downstream tasks. 
Unlike standard\nmixture-of-expert architectures, our method employs a gradient-free routing\nfunction to choose a weighted combination of experts without increasing the\ncompute requirements for training or inference. The results show that\ntoken-level adaptation of LoRA adapters outperforms the base Llama-2-7b model\nacross mathematical (GSM8K), scientific (ARC-Challenge), reading comprehension\n(SQuAD), and coding (CodeAlpaca-20k) tasks. Further evaluations also show that\nthe average performance of token-level adaptation outperforms individual models\nfine-tuned for each of the tasks with the best performance observed in\nadaptation of every-other token during inference. The code for this study is\nmade available through a public repository.\n","authors":["Joshua Belofsky"],"pdf_url":"https://arxiv.org/pdf/2311.10847v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00372v1","updated":"2023-12-01T06:30:31Z","published":"2023-12-01T06:30:31Z","title":"Event-driven Real-time Retrieval in Web Search","summary":" Information retrieval in real-time search presents unique challenges distinct\nfrom those encountered in classical web search. These challenges are\nparticularly pronounced due to the rapid change of user search intent, which is\ninfluenced by the occurrence and evolution of breaking news events, such as\nearthquakes, elections, and wars. Previous dense retrieval methods, which\nprimarily focused on static semantic representation, lack the capacity to\ncapture immediate search intent, leading to inferior performance in retrieving\nthe most recent event-related documents in time-sensitive scenarios. To address\nthis issue, this paper expands the query with event information that represents\nreal-time search intent. The Event information is then integrated with the\nquery through a cross-attention mechanism, resulting in a time-context query\nrepresentation. We further enhance the model's capacity for event\nrepresentation through multi-task training. Since publicly available datasets\nsuch as MS-MARCO do not contain any event information on the query side and\nhave few time-sensitive queries, we design an automatic data collection and\nannotation pipeline to address this issue, which includes ModelZoo-based Coarse\nAnnotation and LLM-driven Fine Annotation processes. In addition, we share the\ntraining tricks such as two-stage training and hard negative sampling. Finally,\nwe conduct a set of offline experiments on a million-scale production dataset\nto evaluate our approach and deploy an A/B testing in a real online system to\nverify the performance. Extensive experimental results demonstrate that our\nproposed approach significantly outperforms existing state-of-the-art baseline\nmethods.\n","authors":["Nan Yang","Yannan Zhang","Xiaoling Bai","Hualong Deng","Tianhua Zhou","Jin Ma"],"pdf_url":"https://arxiv.org/pdf/2312.00372v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00353v1","updated":"2023-12-01T05:08:47Z","published":"2023-12-01T05:08:47Z","title":"On Exploring the Reasoning Capability of Large Language Models with\n Knowledge Graphs","summary":" This paper examines the capacity of LLMs to reason with knowledge graphs\nusing their internal knowledge graph, i.e., the knowledge graph they learned\nduring pre-training. Two research questions are formulated to investigate the\naccuracy of LLMs in recalling information from pre-training knowledge graphs\nand their ability to infer knowledge graph relations from context. 
To address\nthese questions, we employ LLMs to perform four distinct knowledge graph\nreasoning tasks. Furthermore, we identify two types of hallucinations that may\noccur during knowledge reasoning with LLMs: content and ontology hallucination.\nOur experimental results demonstrate that LLMs can successfully tackle both\nsimple and complex knowledge graph reasoning tasks from their own memory, as\nwell as infer from input context.\n","authors":["Pei-Chi Lo","Yi-Hang Tsai","Ee-Peng Lim","San-Yih Hwang"],"pdf_url":"https://arxiv.org/pdf/2312.00353v1.pdf","comment":"Presented at the Generative-IR Workshop during SIGIR 2023.\n https://coda.io/@sigir/gen-ir"},{"id":"http://arxiv.org/abs/2312.00349v1","updated":"2023-12-01T04:55:29Z","published":"2023-12-01T04:55:29Z","title":"The Case for Scalable, Data-Driven Theory: A Paradigm for Scientific\n Progress in NLP","summary":" I propose a paradigm for scientific progress in NLP centered around\ndeveloping scalable, data-driven theories of linguistic structure. The idea is\nto collect data in tightly scoped, carefully defined ways which allow for\nexhaustive annotation of behavioral phenomena of interest, and then use machine\nlearning to construct explanatory theories of these phenomena which can form\nbuilding blocks for intelligible AI systems. After laying some conceptual\ngroundwork, I describe several investigations into data-driven theories of\nshallow semantic structure using Question-Answer driven Semantic Role Labeling\n(QA-SRL), a schema for annotating verbal predicate-argument relations using\nhighly constrained question-answer pairs. While this only scratches the surface\nof the complex language behaviors of interest in AI, I outline principles for\ndata collection and theoretical modeling which can inform future scientific\nprogress. This note summarizes and draws heavily on my PhD thesis.\n","authors":["Julian Michael"],"pdf_url":"https://arxiv.org/pdf/2312.00349v1.pdf","comment":"13 pages, 3 figures, 2 tables. Presented at The Big Picture Workshop\n at EMNLP 2023"},{"id":"http://arxiv.org/abs/2312.00347v1","updated":"2023-12-01T04:51:01Z","published":"2023-12-01T04:51:01Z","title":"RTQ: Rethinking Video-language Understanding Based on Image-text Model","summary":" Recent advancements in video-language understanding have been established on\nthe foundation of image-text models, resulting in promising outcomes due to the\nshared knowledge between images and videos. However, video-language\nunderstanding presents unique challenges due to the inclusion of highly complex\nsemantic details, which result in information redundancy, temporal dependency,\nand scene complexity. Current techniques have only partially tackled these\nissues, and our quantitative analysis indicates that some of these methods are\ncomplementary. In light of this, we propose a novel framework called RTQ\n(Refine, Temporal model, and Query), which addresses these challenges\nsimultaneously. The approach involves refining redundant information within\nframes, modeling temporal relations among frames, and querying task-specific\ninformation from the videos. 
Remarkably, our model demonstrates outstanding\nperformance even in the absence of video-language pre-training, and the results\nare comparable with or superior to those achieved by state-of-the-art\npre-training methods.\n","authors":["Xiao Wang","Yaoyu Li","Tian Gan","Zheng Zhang","Jingjing Lv","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2312.00347v1.pdf","comment":"Accepted by ACM MM 2023 as Oral representation"},{"id":"http://arxiv.org/abs/2310.10631v2","updated":"2023-12-01T03:51:33Z","published":"2023-10-16T17:54:07Z","title":"Llemma: An Open Language Model For Mathematics","summary":" We present Llemma, a large language model for mathematics. We continue\npretraining Code Llama on the Proof-Pile-2, a mixture of scientific papers, web\ndata containing mathematics, and mathematical code, yielding Llemma. On the\nMATH benchmark Llemma outperforms all known open base models, as well as the\nunreleased Minerva model suite on an equi-parameter basis. Moreover, Llemma is\ncapable of tool use and formal theorem proving without any further finetuning.\nWe openly release all artifacts, including 7 billion and 34 billion parameter\nmodels, the Proof-Pile-2, and code to replicate our experiments.\n","authors":["Zhangir Azerbayev","Hailey Schoelkopf","Keiran Paster","Marco Dos Santos","Stephen McAleer","Albert Q. Jiang","Jia Deng","Stella Biderman","Sean Welleck"],"pdf_url":"https://arxiv.org/pdf/2310.10631v2.pdf","comment":"Updated references; corrected description of COPRA search budget"},{"id":"http://arxiv.org/abs/2312.00326v1","updated":"2023-12-01T03:44:54Z","published":"2023-12-01T03:44:54Z","title":"Agent-OM: Leveraging Large Language Models for Ontology Matching","summary":" Ontology matching (OM) enables semantic interoperability between different\nontologies and resolves their conceptual heterogeneity by aligning related\nentities. OM systems currently have two prevailing design paradigms:\nconventional knowledge-based expert systems and newer machine learning-based\npredictive systems. While large language models (LLMs) and LLM-based agents\nhave become revolutionary in data engineering and have been applied creatively\nin various domains, their potential for OM remains underexplored. This study\nintroduces a novel agent-powered LLM-based design paradigm for OM systems. With\nthoughtful consideration of several specific challenges to leverage LLMs for\nOM, we propose a generic framework, namely Agent-OM, consisting of two Siamese\nagents for retrieval and matching, with a set of simple prompt-based OM tools.\nOur framework is implemented in a proof-of-concept system. Evaluations of three\nOntology Alignment Evaluation Initiative (OAEI) tracks over state-of-the-art OM\nsystems show that our system can achieve very close results to the best\nlong-standing performance on simple OM tasks and significantly improve the\nperformance on complex and few-shot OM tasks.\n","authors":["Zhangcheng Qiang","Weiqing Wang","Kerry Taylor"],"pdf_url":"https://arxiv.org/pdf/2312.00326v1.pdf","comment":"14 pages, 10 figures, 7 tables"},{"id":"http://arxiv.org/abs/2303.10868v3","updated":"2023-12-01T02:58:09Z","published":"2023-03-20T05:07:41Z","title":"Retrieving Multimodal Information for Augmented Generation: A Survey","summary":" As Large Language Models (LLMs) become popular, there emerged an important\ntrend of using multimodality to augment the LLMs' generation ability, which\nenables LLMs to better interact with the world. 
However, there lacks a unified\nperception of at which stage and how to incorporate different modalities. In\nthis survey, we review methods that assist and augment generative models by\nretrieving multimodal knowledge, whose formats range from images, codes,\ntables, graphs, to audio. Such methods offer a promising solution to important\nconcerns such as factuality, reasoning, interpretability, and robustness. By\nproviding an in-depth review, this survey is expected to provide scholars with\na deeper understanding of the methods' applications and encourage them to adapt\nexisting techniques to the fast-growing field of LLMs.\n","authors":["Ruochen Zhao","Hailin Chen","Weishi Wang","Fangkai Jiao","Xuan Long Do","Chengwei Qin","Bosheng Ding","Xiaobao Guo","Minzhi Li","Xingxuan Li","Shafiq Joty"],"pdf_url":"https://arxiv.org/pdf/2303.10868v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00293v1","updated":"2023-12-01T02:13:34Z","published":"2023-12-01T02:13:34Z","title":"PsyAttention: Psychological Attention Model for Personality Detection","summary":" Work on personality detection has tended to incorporate psychological\nfeatures from different personality models, such as BigFive and MBTI. There are\nmore than 900 psychological features, each of which is helpful for personality\ndetection. However, when used in combination, the application of different\ncalculation standards among these features may result in interference between\nfeatures calculated using distinct systems, thereby introducing noise and\nreducing performance. This paper adapts different psychological models in the\nproposed PsyAttention for personality detection, which can effectively encode\npsychological features, reducing their number by 85%. In experiments on the\nBigFive and MBTI models, PysAttention achieved average accuracy of 65.66% and\n86.30%, respectively, outperforming state-of-the-art methods, indicating that\nit is effective at encoding psychological features.\n","authors":["Baohua Zhang","Yongyi Huang","Wenyao Cui","Huaping Zhang","Jianyun Shang"],"pdf_url":"https://arxiv.org/pdf/2312.00293v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00292v1","updated":"2023-12-01T02:13:25Z","published":"2023-12-01T02:13:25Z","title":"SEPSIS: I Can Catch Your Lies -- A New Paradigm for Deception Detection","summary":" Deception is the intentional practice of twisting information. It is a\nnuanced societal practice deeply intertwined with human societal evolution,\ncharacterized by a multitude of facets. This research explores the problem of\ndeception through the lens of psychology, employing a framework that\ncategorizes deception into three forms: lies of omission, lies of commission,\nand lies of influence. The primary focus of this study is specifically on\ninvestigating only lies of omission. We propose a novel framework for deception\ndetection leveraging NLP techniques. We curated an annotated dataset of 876,784\nsamples by amalgamating a popular large-scale fake news dataset and scraped\nnews headlines from the Twitter handle of Times of India, a well-known Indian\nnews media house. Each sample has been labeled with four layers, namely: (i)\nthe type of omission (speculation, bias, distortion, sounds factual, and\nopinion), (ii) colors of lies(black, white, etc), and (iii) the intention of\nsuch lies (to influence, etc) (iv) topic of lies (political, educational,\nreligious, etc). 
We present a novel multi-task learning pipeline that leverages\nthe dataless merging of fine-tuned language models to address the deception\ndetection task mentioned earlier. Our proposed model achieved an F1 score of\n0.87, demonstrating strong performance across all layers including the type,\ncolor, intent, and topic aspects of deceptive content. Finally, our research\nexplores the relationship between lies of omission and propaganda techniques.\nTo accomplish this, we conducted an in-depth analysis, uncovering compelling\nfindings. For instance, our analysis revealed a significant correlation between\nloaded language and opinion, shedding light on their interconnectedness. To\nencourage further research in this field, we will be making the models and\ndataset available with the MIT License, making it favorable for open-source\nresearch.\n","authors":["Anku Rani","Dwip Dalal","Shreya Gautam","Pankaj Gupta","Vinija Jain","Aman Chadha","Amit Sheth","Amitava Das"],"pdf_url":"https://arxiv.org/pdf/2312.00292v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.00267v2","updated":"2023-12-01T01:41:44Z","published":"2023-09-01T05:53:33Z","title":"RLAIF: Scaling Reinforcement Learning from Human Feedback with AI\n Feedback","summary":" Reinforcement learning from human feedback (RLHF) has proven effective in\naligning large language models (LLMs) with human preferences. However,\ngathering high-quality human preference labels can be a time-consuming and\nexpensive endeavor. RL from AI Feedback (RLAIF), introduced by Bai et al.,\noffers a promising alternative that leverages a powerful off-the-shelf LLM to\ngenerate preferences in lieu of human annotators. Across the tasks of\nsummarization, helpful dialogue generation, and harmless dialogue generation,\nRLAIF achieves comparable or superior performance to RLHF, as rated by human\nevaluators. Furthermore, RLAIF demonstrates the ability to outperform a\nsupervised fine-tuned baseline even when the LLM preference labeler is the same\nsize as the policy. In another experiment, directly prompting the LLM for\nreward scores achieves superior performance to the canonical RLAIF setup, where\nLLM preference labels are first distilled into a reward model. Finally, we\nconduct extensive studies on techniques for generating aligned AI preferences.\nOur results suggest that RLAIF can achieve human-level performance, offering a\npotential solution to the scalability limitations of RLHF.\n","authors":["Harrison Lee","Samrat Phatale","Hassan Mansoor","Thomas Mesnard","Johan Ferret","Kellie Lu","Colton Bishop","Ethan Hall","Victor Carbune","Abhinav Rastogi","Sushant Prakash"],"pdf_url":"https://arxiv.org/pdf/2309.00267v2.pdf","comment":"Added two more tasks and many more experiments and analyses (e.g.\n same-size RLAIF, direct RLAIF, cost analysis)"},{"id":"http://arxiv.org/abs/2306.03438v2","updated":"2023-12-01T01:27:37Z","published":"2023-06-06T06:35:27Z","title":"Large Language Models of Code Fail at Completing Code with Potential\n Bugs","summary":" Large language models of code (Code-LLMs) have recently brought tremendous\nadvances to code completion, a fundamental feature of programming assistance\nand code intelligence. However, most existing works ignore the possible\npresence of bugs in the code context for generation, which are inevitable in\nsoftware development. 
Therefore, we introduce and study the buggy-code\ncompletion problem, inspired by the realistic scenario of real-time code\nsuggestion where the code context contains potential bugs -- anti-patterns that\ncan become bugs in the completed program. To systematically study the task, we\nintroduce two datasets: one with synthetic bugs derived from semantics-altering\noperator changes (buggy-HumanEval) and one with realistic bugs derived from\nuser submissions to coding problems (buggy-FixEval). We find that the presence\nof potential bugs significantly degrades the generation performance of the\nhigh-performing Code-LLMs. For instance, the passing rates of CODEGEN-2B-MONO\non test cases of buggy-HumanEval drop more than 50% given a single potential\nbug in the context. Finally, we investigate several post-hoc methods for\nmitigating the adverse effect of potential bugs and find that there remains a\nsignificant gap in post-mitigation performance.\n","authors":["Tuan Dinh","Jinman Zhao","Samson Tan","Renato Negrinho","Leonard Lausen","Sheng Zha","George Karypis"],"pdf_url":"https://arxiv.org/pdf/2306.03438v2.pdf","comment":"27 pages, accepted to NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.00277v1","updated":"2023-12-01T01:26:38Z","published":"2023-12-01T01:26:38Z","title":"Text Attribute Control via Closed-Loop Disentanglement","summary":" Changing an attribute of a text without changing the content usually requires\nto first disentangle the text into irrelevant attributes and content\nrepresentations. After that, in the inference phase, the representation of one\nattribute is tuned to a different value, expecting that the corresponding\nattribute of the text can also be changed accordingly. The usual way of\ndisentanglement is to add some constraints on the latent space of an\nencoder-decoder architecture, including adversarial-based constraints and\nmutual-information-based constraints. However, the previous semi-supervised\nprocesses of attribute change are usually not enough to guarantee the success\nof attribute change and content preservation. In this paper, we propose a novel\napproach to achieve a robust control of attributes while enhancing content\npreservation. In this approach, we use a semi-supervised contrastive learning\nmethod to encourage the disentanglement of attributes in latent spaces.\nDifferently from previous works, we re-disentangle the reconstructed sentence\nand compare the re-disentangled latent space with the original latent space,\nwhich makes a closed-loop disentanglement process. This also helps content\npreservation. In addition, the contrastive learning method is also able to\nreplace the role of minimizing mutual information and adversarial training in\nthe disentanglement process, which alleviates the computation cost. We\nconducted experiments on three text datasets, including the Yelp Service review\ndataset, the Amazon Product review dataset, and the GoEmotions dataset. The\nexperimental results show the effectiveness of our model.\n","authors":["Lei Sha","Thomas Lukasiewicz"],"pdf_url":"https://arxiv.org/pdf/2312.00277v1.pdf","comment":"accepted by TACL 2023"},{"id":"http://arxiv.org/abs/2312.00273v1","updated":"2023-12-01T01:22:46Z","published":"2023-12-01T01:22:46Z","title":"Mark My Words: Analyzing and Evaluating Language Model Watermarks","summary":" The capabilities of large language models have grown significantly in recent\nyears and so too have concerns about their misuse. 
In this context, the ability\nto distinguish machine-generated text from human-authored content becomes\nimportant. Prior works have proposed numerous schemes to watermark text, which\nwould benefit from a systematic evaluation framework. This work focuses on text\nwatermarking techniques - as opposed to image watermarks - and proposes a\ncomprehensive benchmark for them under different tasks as well as practical\nattacks. We focus on three main metrics: quality, size (e.g. the number of\ntokens needed to detect a watermark), and tamper-resistance. Current\nwatermarking techniques are good enough to be deployed: Kirchenbauer et al. can\nwatermark Llama2-7B-chat with no perceivable loss in quality in under 100\ntokens, and with good tamper-resistance to simple attacks, regardless of\ntemperature. We argue that watermark indistinguishability is too strong a\nrequirement: schemes that slightly modify logit distributions outperform their\nindistinguishable counterparts with no noticeable loss in generation quality.\nWe publicly release our benchmark.\n","authors":["Julien Piet","Chawin Sitawarin","Vivian Fang","Norman Mu","David Wagner"],"pdf_url":"https://arxiv.org/pdf/2312.00273v1.pdf","comment":"19 pages, 14 figures"},{"id":"http://arxiv.org/abs/2310.10083v2","updated":"2023-12-01T00:29:37Z","published":"2023-10-16T05:28:28Z","title":"JMedLoRA:Medical Domain Adaptation on Japanese Large Language Models\n using Instruction-tuning","summary":" In the ongoing wave of impact driven by large language models (LLMs) like\nChatGPT, the adaptation of LLMs to medical domain has emerged as a crucial\nresearch frontier. Since mainstream LLMs tend to be designed for\ngeneral-purpose applications, constructing a medical LLM through domain\nadaptation is a huge challenge. While instruction-tuning is used to fine-tune\nsome LLMs, its precise roles in domain adaptation remain unknown. Here we show\nthe contribution of LoRA-based instruction-tuning to performance in Japanese\nmedical question-answering tasks. In doing so, we employ a multifaceted\nevaluation for multiple-choice questions, including scoring based on \"Exact\nmatch\" and \"Gestalt distance\" in addition to the conventional accuracy. Our\nfindings suggest that LoRA-based instruction-tuning can partially incorporate\ndomain-specific knowledge into LLMs, with larger models demonstrating more\npronounced effects. Furthermore, our results underscore the potential of\nadapting English-centric models for Japanese applications in domain adaptation,\nwhile also highlighting the persisting limitations of Japanese-centric models.\nThis initiative represents a pioneering effort in enabling medical institutions\nto fine-tune and operate models without relying on external services.\n","authors":["Issey Sukeda","Masahiro Suzuki","Hiroki Sakaji","Satoshi Kodera"],"pdf_url":"https://arxiv.org/pdf/2310.10083v2.pdf","comment":"8 pages, 1 figures"},{"id":"http://arxiv.org/abs/2305.14772v3","updated":"2023-12-01T00:11:04Z","published":"2023-05-24T06:23:02Z","title":"A Question Answering Framework for Decontextualizing User-facing\n Snippets from Scientific Documents","summary":" Many real-world applications (e.g., note taking, search) require extracting a\nsentence or paragraph from a document and showing that snippet to a human\noutside of the source document. Yet, users may find snippets difficult to\nunderstand as they lack context from the original document. 
In this work, we\nuse language models to rewrite snippets from scientific documents to be read on\ntheir own. First, we define the requirements and challenges for this\nuser-facing decontextualization task, such as clarifying where edits occur and\nhandling references to other documents. Second, we propose a framework that\ndecomposes the task into three stages: question generation, question answering,\nand rewriting. Using this framework, we collect gold decontextualizations from\nexperienced scientific article readers. We then conduct a range of experiments\nacross state-of-the-art commercial and open-source language models to identify\nhow to best provide missing-but-relevant information to models for our task.\nFinally, we develop QaDecontext, a simple prompting strategy inspired by our\nframework that improves over end-to-end prompting. We conclude with analysis\nthat finds, while rewriting is easy, question generation and answering remain\nchallenging for today's models.\n","authors":["Benjamin Newman","Luca Soldaini","Raymond Fok","Arman Cohan","Kyle Lo"],"pdf_url":"https://arxiv.org/pdf/2305.14772v3.pdf","comment":"19 pages, 2 figures, 8 tables, EMNLP2023"},{"id":"http://arxiv.org/abs/2312.00968v1","updated":"2023-12-01T23:04:27Z","published":"2023-12-01T23:04:27Z","title":"Omni-SMoLA: Boosting Generalist Multimodal Models with Soft Mixture of\n Low-rank Experts","summary":" Large multi-modal models (LMMs) exhibit remarkable performance across\nnumerous tasks. However, generalist LMMs often suffer from performance\ndegradation when tuned over a large collection of tasks. Recent research\nsuggests that Mixture of Experts (MoE) architectures are useful for instruction\ntuning, but for LMMs of parameter size around O(50-100B), the prohibitive cost\nof replicating and storing the expert models severely limits the number of\nexperts we can use. We propose Omni-SMoLA, an architecture that uses the Soft\nMoE approach to (softly) mix many multimodal low rank experts, and avoids\nintroducing a significant number of new parameters compared to conventional MoE\nmodels. The core intuition here is that the large model provides a foundational\nbackbone, while different lightweight experts residually learn specialized\nknowledge, either per-modality or multimodally. Extensive experiments\ndemonstrate that the SMoLA approach helps improve the generalist performance\nacross a broad range of generative vision-and-language tasks, achieving new\nSoTA generalist performance that often matches or outperforms single\nspecialized LMM baselines, as well as new SoTA specialist performance.\n","authors":["Jialin Wu","Xia Hu","Yaqing Wang","Bo Pang","Radu Soricut"],"pdf_url":"https://arxiv.org/pdf/2312.00968v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.04729v4","updated":"2023-12-01T22:34:34Z","published":"2023-03-08T17:15:58Z","title":"Stealing the Decoding Algorithms of Language Models","summary":" A key component of generating text from modern language models (LM) is the\nselection and tuning of decoding algorithms. These algorithms determine how to\ngenerate text from the internal probability distribution generated by the LM.\nThe process of choosing a decoding algorithm and tuning its hyperparameters\ntakes significant time, manual effort, and computation, and it also requires\nextensive human evaluation. Therefore, the identity and hyperparameters of such\ndecoding algorithms are considered to be extremely valuable to their owners. 
In\nthis work, we show, for the first time, that an adversary with typical API\naccess to an LM can steal the type and hyperparameters of its decoding\nalgorithms at very low monetary costs. Our attack is effective against popular\nLMs used in text generation APIs, including GPT-2, GPT-3 and GPT-Neo. We\ndemonstrate the feasibility of stealing such information with only a few\ndollars, e.g., $\\$0.8$, $\\$1$, $\\$4$, and $\\$40$ for the four versions of\nGPT-3.\n","authors":["Ali Naseh","Kalpesh Krishna","Mohit Iyyer","Amir Houmansadr"],"pdf_url":"https://arxiv.org/pdf/2303.04729v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00960v1","updated":"2023-12-01T22:27:12Z","published":"2023-12-01T22:27:12Z","title":"The Cost of Compression: Investigating the Impact of Compression on\n Parametric Knowledge in Language Models","summary":" Compressing large language models (LLMs), often consisting of billions of\nparameters, provides faster inference, smaller memory footprints, and enables\nlocal deployment. Two standard compression techniques are pruning and\nquantization, with the former eliminating redundant connections in model layers\nand the latter representing model parameters with fewer bits. The key tradeoff\nis between the degree of compression and the impact on the quality of the\ncompressed model. Existing research on LLM compression primarily focuses on\nperformance in terms of general metrics like perplexity or downstream task\naccuracy. More fine-grained metrics, such as those measuring parametric\nknowledge, remain significantly underexplored. To help bridge this gap, we\npresent a comprehensive analysis across multiple model families (ENCODER,\nENCODER-DECODER, and DECODER) using the LAMA and LM-HARNESS benchmarks in order\nto systematically quantify the effect of commonly employed compression\ntechniques on model performance. A particular focus is on tradeoffs involving\nparametric knowledge, with the goal of providing practitioners with practical\ninsights to help make informed decisions on compression. We release our\ncodebase1 to enable further research.\n","authors":["Satya Sai Srinath Namburi","Makesh Sreedhar","Srinath Srinivasan","Frederic Sala"],"pdf_url":"https://arxiv.org/pdf/2312.00960v1.pdf","comment":"Accepted to EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2312.00949v1","updated":"2023-12-01T22:03:12Z","published":"2023-12-01T22:03:12Z","title":"Hyperparameter Optimization for Large Language Model Instruction-Tuning","summary":" The fine-tuning of Large Language Models (LLMs) has enabled them to recently\nachieve milestones in natural language processing applications. The emergence\nof ever larger LLMs has paved the way for more efficient fine-tuning methods.\nAmong these, the Low-Rank Adaptation (LoRA) method keeps most of the weights of\nthe pre-trained LLM frozen while introducing a low-rank decomposition of the\nweight matrix, enabling the tuning of only a very small proportion of the\nnetwork. The performance on downstream tasks of models fine-tuned with LoRA\nheavily relies on a set of hyperparameters including the rank of the\ndecomposition. In this work, we investigate the choice of these hyperparameters\nthrough two main blackbox optimization (BBO) techniques. 
We examine the whole\npipeline of performing fine-tuning and validation on a pre-trained LLM as a\nblackbox and efficiently explore the space of hyperparameters with the \\nomad\nalgorithm, achieving a boost in performance and human alignment of the tuned\nmodel.\n","authors":["Christophe Tribes","Sacha Benarroch-Lelong","Peng Lu","Ivan Kobyzev"],"pdf_url":"https://arxiv.org/pdf/2312.00949v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00912v1","updated":"2023-12-01T20:27:42Z","published":"2023-12-01T20:27:42Z","title":"Quick Back-Translation for Unsupervised Machine Translation","summary":" The field of unsupervised machine translation has seen significant\nadvancement from the marriage of the Transformer and the back-translation\nalgorithm. The Transformer is a powerful generative model, and back-translation\nleverages Transformer's high-quality translations for iterative\nself-improvement. However, the Transformer is encumbered by the run-time of\nautoregressive inference during back-translation, and back-translation is\nlimited by a lack of synthetic data efficiency. We propose a two-for-one\nimprovement to Transformer back-translation: Quick Back-Translation (QBT). QBT\nre-purposes the encoder as a generative model, and uses encoder-generated\nsequences to train the decoder in conjunction with the original autoregressive\nback-translation step, improving data throughput and utilization. Experiments\non various WMT benchmarks demonstrate that a relatively small number of\nrefining steps of QBT improve current unsupervised machine translation models,\nand that QBT dramatically outperforms standard back-translation only method in\nterms of training efficiency for comparable translation qualities.\n","authors":["Benjamin Brimacombe","Jiawei Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.00912v1.pdf","comment":"Published in EMNLP 2023 Findings. Source code can be found at the\n following link: https://github.com/bbrimacombe/Quick-Back-Translation"},{"id":"http://arxiv.org/abs/2311.17213v2","updated":"2023-12-01T19:56:22Z","published":"2023-11-28T20:34:40Z","title":"General-Purpose vs. Domain-Adapted Large Language Models for Extraction\n of Data from Thoracic Radiology Reports","summary":" Radiologists produce unstructured data that could be valuable for clinical\ncare when consumed by information systems. However, variability in style limits\nusage. Study compares performance of system using domain-adapted language model\n(RadLing) and general-purpose large language model (GPT-4) in extracting common\ndata elements (CDE) from thoracic radiology reports. Three radiologists\nannotated a retrospective dataset of 1300 thoracic reports (900 training, 400\ntest) and mapped to 21 pre-selected relevant CDEs. RadLing was used to generate\nembeddings for sentences and identify CDEs using cosine-similarity, which were\nmapped to values using light-weight mapper. GPT-4 system used OpenAI's\ngeneral-purpose embeddings to identify relevant CDEs and used GPT-4 to map to\nvalues. The output CDE:value pairs were compared to the reference standard; an\nidentical match was considered true positive. Precision (positive predictive\nvalue) was 96% (2700/2824) for RadLing and 99% (2034/2047) for GPT-4. Recall\n(sensitivity) was 94% (2700/2876) for RadLing and 70% (2034/2887) for GPT-4;\nthe difference was statistically significant (P<.001). 
RadLing's domain-adapted\nembeddings were more sensitive in CDE identification (95% vs 71%) and its\nlight-weight mapper had comparable precision in value assignment (95.4% vs\n95.0%). RadLing system exhibited higher performance than GPT-4 system in\nextracting CDEs from radiology reports. RadLing system's domain-adapted\nembeddings outperform general-purpose embeddings from OpenAI in CDE\nidentification and its light-weight value mapper achieves comparable precision\nto large GPT-4. RadLing system offers operational advantages including local\ndeployment and reduced runtime costs. Domain-adapted RadLing system surpasses\nGPT-4 system in extracting common data elements from radiology reports, while\nproviding benefits of local deployment and lower costs.\n","authors":["Ali H. Dhanaliwala","Rikhiya Ghosh","Sanjeev Kumar Karn","Poikavila Ullaskrishnan","Oladimeji Farri","Dorin Comaniciu","Charles E. Kahn"],"pdf_url":"https://arxiv.org/pdf/2311.17213v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14735v3","updated":"2023-12-01T19:45:25Z","published":"2023-05-24T05:15:36Z","title":"Centering the Margins: Outlier-Based Identification of Harmed\n Populations in Toxicity Detection","summary":" The impact of AI models on marginalized communities has traditionally been\nmeasured by identifying performance differences between specified demographic\nsubgroups. Though this approach aims to center vulnerable groups, it risks\nobscuring patterns of harm faced by intersectional subgroups or shared across\nmultiple groups. To address this, we draw on theories of marginalization from\ndisability studies and related disciplines, which state that people farther\nfrom the norm face greater adversity, to consider the \"margins\" in the domain\nof toxicity detection. We operationalize the \"margins\" of a dataset by\nemploying outlier detection to identify text about people with demographic\nattributes distant from the \"norm\". We find that model performance is\nconsistently worse for demographic outliers, with mean squared error (MSE)\nbetween outliers and non-outliers up to 70.4% worse across toxicity types. It\nis also worse for text outliers, with a MSE up to 68.4% higher for outliers\nthan non-outliers. We also find text and demographic outliers to be\nparticularly susceptible to errors in the classification of severe toxicity and\nidentity attacks. Compared to analysis of disparities using traditional\ndemographic breakdowns, we find that our outlier analysis frequently surfaces\ngreater harms faced by a larger, more intersectional group, which suggests that\noutlier analysis is particularly beneficial for identifying harms against those\ngroups.\n","authors":["Vyoma Raman","Eve Fleisig","Dan Klein"],"pdf_url":"https://arxiv.org/pdf/2305.14735v3.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2310.09219v5","updated":"2023-12-01T19:21:20Z","published":"2023-10-13T16:12:57Z","title":"\"Kelly is a Warm Person, Joseph is a Role Model\": Gender Biases in\n LLM-Generated Reference Letters","summary":" Large Language Models (LLMs) have recently emerged as an effective tool to\nassist individuals in writing various types of content, including professional\ndocuments such as recommendation letters. Though bringing convenience, this\napplication also introduces unprecedented fairness concerns. Model-generated\nreference letters might be directly used by users in professional scenarios. 
If\nunderlying biases exist in these model-constructed letters, using them without\nscrutinization could lead to direct societal harms, such as sabotaging\napplication success rates for female applicants. In light of this pressing\nissue, it is imminent and necessary to comprehensively study fairness issues\nand associated harms in this real-world use case. In this paper, we critically\nexamine gender biases in LLM-generated reference letters. Drawing inspiration\nfrom social science findings, we design evaluation methods to manifest biases\nthrough 2 dimensions: (1) biases in language style and (2) biases in lexical\ncontent. We further investigate the extent of bias propagation by analyzing the\nhallucination bias of models, a term that we define to be bias exacerbation in\nmodel-hallucinated contents. Through benchmarking evaluation on 2 popular LLMs-\nChatGPT and Alpaca, we reveal significant gender biases in LLM-generated\nrecommendation letters. Our findings not only warn against using LLMs for this\napplication without scrutinization, but also illuminate the importance of\nthoroughly studying hidden biases and harms in LLM-generated professional\ndocuments.\n","authors":["Yixin Wan","George Pu","Jiao Sun","Aparna Garimella","Kai-Wei Chang","Nanyun Peng"],"pdf_url":"https://arxiv.org/pdf/2310.09219v5.pdf","comment":"Accepted to EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2309.03886v2","updated":"2023-12-01T19:13:45Z","published":"2023-09-07T17:47:26Z","title":"FIND: A Function Description Benchmark for Evaluating Interpretability\n Methods","summary":" Labeling neural network submodules with human-legible descriptions is useful\nfor many downstream tasks: such descriptions can surface failures, guide\ninterventions, and perhaps even explain important model behaviors. To date,\nmost mechanistic descriptions of trained networks have involved small models,\nnarrowly delimited phenomena, and large amounts of human labor. Labeling all\nhuman-interpretable sub-computations in models of increasing size and\ncomplexity will almost certainly require tools that can generate and validate\ndescriptions automatically. Recently, techniques that use learned models\nin-the-loop for labeling have begun to gain traction, but methods for\nevaluating their efficacy are limited and ad-hoc. How should we validate and\ncompare open-ended labeling tools? This paper introduces FIND (Function\nINterpretation and Description), a benchmark suite for evaluating the building\nblocks of automated interpretability methods. FIND contains functions that\nresemble components of trained neural networks, and accompanying descriptions\nof the kind we seek to generate. The functions span textual and numeric\ndomains, and involve a range of real-world complexities. We evaluate methods\nthat use pretrained language models (LMs) to produce descriptions of function\nbehavior in natural language and code. Additionally, we introduce a new\ninteractive method in which an Automated Interpretability Agent (AIA) generates\nfunction descriptions. We find that an AIA, built from an LM with black-box\naccess to functions, can infer function structure, acting as a scientist by\nforming hypotheses, proposing experiments, and updating descriptions in light\nof new data. However, AIA descriptions tend to capture global function behavior\nand miss local details. 
These results suggest that FIND will be useful for\nevaluating more sophisticated interpretability methods before they are applied\nto real-world models.\n","authors":["Sarah Schwettmann","Tamar Rott Shaham","Joanna Materzynska","Neil Chowdhury","Shuang Li","Jacob Andreas","David Bau","Antonio Torralba"],"pdf_url":"https://arxiv.org/pdf/2309.03886v2.pdf","comment":"28 pages, 10 figures"},{"id":"http://arxiv.org/abs/2312.00874v1","updated":"2023-12-01T19:03:38Z","published":"2023-12-01T19:03:38Z","title":"Hi-ArG: Exploring the Integration of Hierarchical Argumentation Graphs\n in Language Pretraining","summary":" The knowledge graph is a structure to store and represent knowledge, and\nrecent studies have discussed its capability to assist language models for\nvarious applications. Some variations of knowledge graphs aim to record\narguments and their relations for computational argumentation tasks. However,\nmany must simplify semantic types to fit specific schemas, thus losing\nflexibility and expression ability. In this paper, we propose the Hierarchical\nArgumentation Graph (Hi-ArG), a new structure to organize arguments. We also\nintroduce two approaches to exploit Hi-ArG, including a text-graph multi-modal\nmodel GreaseArG and a new pre-training framework augmented with graph\ninformation. Experiments on two argumentation tasks have shown that after\nfurther pre-training and fine-tuning, GreaseArG supersedes same-scale language\nmodels on these tasks, while incorporating graph information during further\npre-training can also improve the performance of vanilla language models. Code\nfor this paper is available at https://github.com/ljcleo/Hi-ArG .\n","authors":["Jingcong Liang","Rong Ye","Meng Han","Qi Zhang","Ruofei Lai","Xinyu Zhang","Zhao Cao","Xuanjing Huang","Zhongyu Wei"],"pdf_url":"https://arxiv.org/pdf/2312.00874v1.pdf","comment":"to be published in EMNLP 2023"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2312.00786v1","updated":"2023-12-01T18:59:59Z","published":"2023-12-01T18:59:59Z","title":"Dense Optical Tracking: Connecting the Dots","summary":" Recent approaches to point tracking are able to recover the trajectory of any\nscene point through a large portion of a video despite the presence of\nocclusions. They are, however, too slow in practice to track every point\nobserved in a single frame in a reasonable amount of time. This paper\nintroduces DOT, a novel, simple and efficient method for solving this problem.\nIt first extracts a small set of tracks from key regions at motion boundaries\nusing an off-the-shelf point tracking algorithm. Given source and target\nframes, DOT then computes rough initial estimates of a dense flow field and\nvisibility mask through nearest-neighbor interpolation, before refining them\nusing a learnable optical flow estimator that explicitly handles occlusions and\ncan be trained on synthetic data with ground-truth correspondences. We show\nthat DOT is significantly more accurate than current optical flow techniques,\noutperforms sophisticated \"universal\" trackers like OmniMotion, and is on par\nwith, or better than, the best point tracking algorithms like CoTracker while\nbeing at least two orders of magnitude faster. Quantitative and qualitative\nexperiments with synthetic and real videos validate the promise of the proposed\napproach. 
Code, data, and videos showcasing the capabilities of our approach\nare available in the project webpage: https://16lemoing.github.io/dot .\n","authors":["Guillaume Le Moing","Jean Ponce","Cordelia Schmid"],"pdf_url":"https://arxiv.org/pdf/2312.00786v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00785v1","updated":"2023-12-01T18:59:57Z","published":"2023-12-01T18:59:57Z","title":"Sequential Modeling Enables Scalable Learning for Large Vision Models","summary":" We introduce a novel sequential modeling approach which enables learning a\nLarge Vision Model (LVM) without making use of any linguistic data. To do this,\nwe define a common format, \"visual sentences\", in which we can represent raw\nimages and videos as well as annotated data sources such as semantic\nsegmentations and depth reconstructions without needing any meta-knowledge\nbeyond the pixels. Once this wide variety of visual data (comprising 420\nbillion tokens) is represented as sequences, the model can be trained to\nminimize a cross-entropy loss for next token prediction. By training across\nvarious scales of model architecture and data diversity, we provide empirical\nevidence that our models scale effectively. Many different vision tasks can be\nsolved by designing suitable visual prompts at test time.\n","authors":["Yutong Bai","Xinyang Geng","Karttikeya Mangalam","Amir Bar","Alan Yuille","Trevor Darrell","Jitendra Malik","Alexei A Efros"],"pdf_url":"https://arxiv.org/pdf/2312.00785v1.pdf","comment":"Website: https://yutongbai.com/lvm.html"},{"id":"http://arxiv.org/abs/2312.00784v1","updated":"2023-12-01T18:59:56Z","published":"2023-12-01T18:59:56Z","title":"Making Large Multimodal Models Understand Arbitrary Visual Prompts","summary":" While existing large vision-language multimodal models focus on whole image\nunderstanding, there is a prominent gap in achieving region-specific\ncomprehension. Current approaches that use textual coordinates or spatial\nencodings often fail to provide a user-friendly interface for visual prompting.\nTo address this challenge, we introduce a novel multimodal model capable of\ndecoding arbitrary visual prompts. This allows users to intuitively mark images\nand interact with the model using natural cues like a \"red bounding box\" or\n\"pointed arrow\". Our simple design directly overlays visual markers onto the\nRGB image, eliminating the need for complex region encodings, yet achieves\nstate-of-the-art performance on region-understanding tasks like Visual7W,\nPointQA, and Visual Commonsense Reasoning benchmark. Furthermore, we present\nViP-Bench, a comprehensive benchmark to assess the capability of models in\nunderstanding visual prompts across multiple dimensions, enabling future\nresearch in this domain. Code, data, and model are publicly available.\n","authors":["Mu Cai","Haotian Liu","Siva Karthik Mustikovela","Gregory P. Meyer","Yuning Chai","Dennis Park","Yong Jae Lee"],"pdf_url":"https://arxiv.org/pdf/2312.00784v1.pdf","comment":"Project page: https://vip-llava.github.io/"},{"id":"http://arxiv.org/abs/2308.08529v2","updated":"2023-12-01T18:57:27Z","published":"2023-08-16T17:39:15Z","title":"Diagnosing Human-object Interaction Detectors","summary":" We have witnessed significant progress in human-object interaction (HOI)\ndetection. 
The reliance on mAP (mean Average Precision) scores as a summary\nmetric, however, does not provide sufficient insight into the nuances of model\nperformance (e.g., why one model is better than another), which can hinder\nfurther innovation in this field. To address this issue, in this paper, we\nintroduce a diagnosis toolbox to provide detailed quantitative break-down\nanalysis of HOI detection models, inspired by the success of object detection\ndiagnosis toolboxes. We first conduct holistic investigations in the pipeline\nof HOI detection. By defining a set of errors and the oracles to fix each of\nthem, we can have a quantitative analysis of the significance of different\nerrors according to the mAP improvement obtained from fixing each error. We\nthen delve into two sub-tasks of HOI detection: human-object pair detection and\ninteraction classification, respectively. For the first detection task, we\ncompute the coverage of ground-truth human-object pairs as well as the\nnoisiness level in the detection results. For the second classification task,\nwe measure a model's performance of differentiating positive and negative\ndetection results and also classifying the actual interactions when the\nhuman-object pairs are correctly detected. We analyze eight state-of-the-art\nHOI detection models and provide valuable diagnosis insights to foster future\nresearch. For instance, our diagnosis shows that state-of-the-art model RLIPv2\noutperforms others mainly because it significantly improves the multi-label\ninteraction classification accuracy. Our toolbox is applicable for different\nmethods across different datasets and available at\nhttps://github.com/neu-vi/Diag-HOI.\n","authors":["Fangrui Zhu","Yiming Xie","Weidi Xie","Huaizu Jiang"],"pdf_url":"https://arxiv.org/pdf/2308.08529v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00778v1","updated":"2023-12-01T18:55:53Z","published":"2023-12-01T18:55:53Z","title":"MorpheuS: Neural Dynamic 360° Surface Reconstruction from Monocular\n RGB-D Video","summary":" Neural rendering has demonstrated remarkable success in dynamic scene\nreconstruction. Thanks to the expressiveness of neural representations, prior\nworks can accurately capture the motion and achieve high-fidelity\nreconstruction of the target object. Despite this, real-world video scenarios\noften feature large unobserved regions where neural representations struggle to\nachieve realistic completion. To tackle this challenge, we introduce MorpheuS,\na framework for dynamic 360{\\deg} surface reconstruction from a casually\ncaptured RGB-D video. Our approach models the target scene as a canonical field\nthat encodes its geometry and appearance, in conjunction with a deformation\nfield that warps points from the current frame to the canonical space. We\nleverage a view-dependent diffusion prior and distill knowledge from it to\nachieve realistic completion of unobserved regions. 
Experimental results on\nvarious real-world and synthetic datasets show that our method can achieve\nhigh-fidelity 360{\\deg} surface reconstruction of a deformable object from a\nmonocular RGB-D video.\n","authors":["Hengyi Wang","Jingwen Wang","Lourdes Agapito"],"pdf_url":"https://arxiv.org/pdf/2312.00778v1.pdf","comment":"Project page: https://hengyiwang.github.io/projects/morpheus"},{"id":"http://arxiv.org/abs/2312.00777v1","updated":"2023-12-01T18:55:40Z","published":"2023-12-01T18:55:40Z","title":"VideoBooth: Diffusion-based Video Generation with Image Prompts","summary":" Text-driven video generation witnesses rapid progress. However, merely using\ntext prompts is not enough to depict the desired subject appearance that\naccurately aligns with users' intents, especially for customized content\ncreation. In this paper, we study the task of video generation with image\nprompts, which provide more accurate and direct content control beyond the text\nprompts. Specifically, we propose a feed-forward framework VideoBooth, with two\ndedicated designs: 1) We propose to embed image prompts in a coarse-to-fine\nmanner. Coarse visual embeddings from image encoder provide high-level\nencodings of image prompts, while fine visual embeddings from the proposed\nattention injection module provide multi-scale and detailed encoding of image\nprompts. These two complementary embeddings can faithfully capture the desired\nappearance. 2) In the attention injection module at fine level, multi-scale\nimage prompts are fed into different cross-frame attention layers as additional\nkeys and values. This extra spatial information refines the details in the\nfirst frame and then it is propagated to the remaining frames, which maintains\ntemporal consistency. Extensive experiments demonstrate that VideoBooth\nachieves state-of-the-art performance in generating customized high-quality\nvideos with subjects specified in image prompts. Notably, VideoBooth is a\ngeneralizable framework where a single model works for a wide range of image\nprompts with feed-forward pass.\n","authors":["Yuming Jiang","Tianxing Wu","Shuai Yang","Chenyang Si","Dahua Lin","Yu Qiao","Chen Change Loy","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2312.00777v1.pdf","comment":"Project page: https://vchitect.github.io/VideoBooth-project/"},{"id":"http://arxiv.org/abs/2312.00775v1","updated":"2023-12-01T18:54:12Z","published":"2023-12-01T18:54:12Z","title":"Towards Generalizable Zero-Shot Manipulation via Translating Human\n Interaction Plans","summary":" We pursue the goal of developing robots that can interact zero-shot with\ngeneric unseen objects via a diverse repertoire of manipulation skills and show\nhow passive human videos can serve as a rich source of data for learning such\ngeneralist robots. Unlike typical robot learning approaches which directly\nlearn how a robot should act from interaction data, we adopt a factorized\napproach that can leverage large-scale human videos to learn how a human would\naccomplish a desired task (a human plan), followed by translating this plan to\nthe robots embodiment. Specifically, we learn a human plan predictor that,\ngiven a current image of a scene and a goal image, predicts the future hand and\nobject configurations. We combine this with a translation module that learns a\nplan-conditioned robot manipulation policy, and allows following humans plans\nfor generic manipulation tasks in a zero-shot manner with no deployment-time\ntraining. 
Importantly, while the plan predictor can leverage large-scale human\nvideos for learning, the translation module only requires a small amount of\nin-domain data, and can generalize to tasks not seen during training. We show\nthat our learned system can perform over 16 manipulation skills that generalize\nto 40 objects, encompassing 100 real-world tasks for table-top manipulation and\ndiverse in-the-wild manipulation. https://homangab.github.io/hopman/\n","authors":["Homanga Bharadhwaj","Abhinav Gupta","Vikash Kumar","Shubham Tulsiani"],"pdf_url":"https://arxiv.org/pdf/2312.00775v1.pdf","comment":"Preprint. Under Review"},{"id":"http://arxiv.org/abs/2312.00766v1","updated":"2023-12-01T18:41:22Z","published":"2023-12-01T18:41:22Z","title":"Automated Material Properties Extraction For Enhanced Beauty Product\n Discovery and Makeup Virtual Try-on","summary":" The multitude of makeup products available can make it challenging to find\nthe ideal match for desired attributes. An intelligent approach for product\ndiscovery is required to enhance the makeup shopping experience to make it more\nconvenient and satisfying. However, enabling accurate and efficient product\ndiscovery requires extracting detailed attributes like color and finish type.\nOur work introduces an automated pipeline that utilizes multiple customized\nmachine learning models to extract essential material attributes from makeup\nproduct images. Our pipeline is versatile and capable of handling various\nmakeup products. To showcase the efficacy of our pipeline, we conduct extensive\nexperiments on eyeshadow products (both single and multi-shade ones), a\nchallenging makeup product known for its diverse range of shapes, colors, and\nfinish types. Furthermore, we demonstrate the applicability of our approach by\nsuccessfully extending it to other makeup categories like lipstick and\nfoundation, showcasing its adaptability and effectiveness across different\nbeauty products. Additionally, we conduct ablation experiments to demonstrate\nthe superiority of our machine learning pipeline over human labeling methods in\nterms of reliability. Our proposed method showcases its effectiveness in\ncross-category product discovery, specifically in recommending makeup products\nthat perfectly match a specified outfit. Lastly, we also demonstrate the\napplication of these material attributes in enabling virtual-try-on experiences\nwhich makes makeup shopping experience significantly more engaging.\n","authors":["Fatemeh Taheri Dezaki","Himanshu Arora","Rahul Suresh","Amin Banitalebi-Dehkordi"],"pdf_url":"https://arxiv.org/pdf/2312.00766v1.pdf","comment":"Presented in Fifth Workshop on Recommender Systems in\n Fashion(fashionxrecsys) of ACM Conference on Recommender Systems"},{"id":"http://arxiv.org/abs/2312.00761v1","updated":"2023-12-01T18:29:08Z","published":"2023-12-01T18:29:08Z","title":"Deep Unlearning: Fast and Efficient Training-free Approach to Controlled\n Forgetting","summary":" Machine unlearning has emerged as a prominent and challenging area of\ninterest, driven in large part by the rising regulatory demands for industries\nto delete user data upon request and the heightened awareness of privacy.\nExisting approaches either retrain models from scratch or use several\nfinetuning steps for every deletion request, often constrained by computational\nresource limitations and restricted access to the original training data. 
In\nthis work, we introduce a novel class unlearning algorithm designed to\nstrategically eliminate an entire class or a group of classes from the learned\nmodel. To that end, our algorithm first estimates the Retain Space and the\nForget Space, representing the feature or activation spaces for samples from\nclasses to be retained and unlearned, respectively. To obtain these spaces, we\npropose a novel singular value decomposition-based technique that requires\nlayer wise collection of network activations from a few forward passes through\nthe network. We then compute the shared information between these spaces and\nremove it from the forget space to isolate class-discriminatory feature space\nfor unlearning. Finally, we project the model weights in the orthogonal\ndirection of the class-discriminatory space to obtain the unlearned model. We\ndemonstrate our algorithm's efficacy on ImageNet using a Vision Transformer\nwith only $\\sim$1.5% drop in retain accuracy compared to the original model\nwhile maintaining under 1% accuracy on the unlearned class samples. Further,\nour algorithm consistently performs well when subject to Membership Inference\nAttacks showing 7.8% improvement on average across a variety of image\nclassification datasets and network architectures, as compared to other\nbaselines while being $\\sim$6x more computationally efficient.\n","authors":["Sangamesh Kodge","Gobinda Saha","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2312.00761v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.01300v2","updated":"2023-12-01T18:25:19Z","published":"2023-08-02T17:39:30Z","title":"Revisiting DETR Pre-training for Object Detection","summary":" Motivated by the remarkable achievements of DETR-based approaches on COCO\nobject detection and segmentation benchmarks, recent endeavors have been\ndirected towards elevating their performance through self-supervised\npre-training of Transformers while preserving a frozen backbone. Noteworthy\nadvancements in accuracy have been documented in certain studies. Our\ninvestigation delved deeply into a representative approach, DETReg, and its\nperformance assessment in the context of emerging models like\n$\\mathcal{H}$-Deformable-DETR. Regrettably, DETReg proves inadequate in\nenhancing the performance of robust DETR-based models under full data\nconditions. To dissect the underlying causes, we conduct extensive experiments\non COCO and PASCAL VOC probing elements such as the selection of pre-training\ndatasets and strategies for pre-training target generation. By contrast, we\nemploy an optimized approach named Simple Self-training which leads to marked\nenhancements through the combination of an improved box predictor and the\nObjects$365$ benchmark. 
The culmination of these endeavors results in a\nremarkable AP score of $59.3\\%$ on the COCO val set, outperforming\n$\\mathcal{H}$-Deformable-DETR + Swin-L without pre-training by $1.4\\%$.\nMoreover, a series of synthetic pre-training datasets, generated by merging\ncontemporary image-to-text(LLaVA) and text-to-image (SDXL) models,\nsignificantly amplifies object detection capabilities.\n","authors":["Yan Ma","Weicong Liang","Bohan Chen","Yiduo Hao","Bojian Hou","Xiangyu Yue","Chao Zhang","Yuhui Yuan"],"pdf_url":"https://arxiv.org/pdf/2308.01300v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03940v3","updated":"2023-12-01T17:57:47Z","published":"2023-10-05T23:09:19Z","title":"Hard View Selection for Self-Supervised Learning","summary":" Many Self-Supervised Learning (SSL) methods train their models to be\ninvariant to different \"views\" of an image input for which a good data\naugmentation pipeline is crucial. While considerable efforts were directed\ntowards improving pre-text tasks, architectures, or robustness (e.g., Siamese\nnetworks or teacher-softmax centering), the majority of these methods remain\nstrongly reliant on the random sampling of operations within the image\naugmentation pipeline, such as the random resized crop or color distortion\noperation. In this paper, we argue that the role of the view generation and its\neffect on performance has so far received insufficient attention. To address\nthis, we propose an easy, learning-free, yet powerful Hard View Selection (HVS)\nstrategy designed to extend the random view generation to expose the pretrained\nmodel to harder samples during SSL training. It encompasses the following\niterative steps: 1) randomly sample multiple views and create pairs of two\nviews, 2) run forward passes for each view pair on the currently trained model,\n3) adversarially select the pair yielding the worst loss, and 4) run the\nbackward pass with the selected pair. In our empirical analysis we show that\nunder the hood, HVS increases task difficulty by controlling the Intersection\nover Union of views during pretraining. With only 300-epoch pretraining, HVS is\nable to closely rival the 800-epoch DINO baseline which remains very favorable\neven when factoring in the slowdown induced by the additional forwards of HVS.\nAdditionally, HVS consistently achieves accuracy improvements on ImageNet\nbetween 0.4% and 1.9% on linear evaluation and similar improvements on transfer\ntasks across multiple SSL methods, such as DINO, SimSiam, iBOT, and SimCLR.\n","authors":["Fabio Ferreira","Ivo Rapant","Frank Hutter"],"pdf_url":"https://arxiv.org/pdf/2310.03940v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10196v2","updated":"2023-12-01T17:49:37Z","published":"2023-08-20T08:02:23Z","title":"Blind Face Restoration for Under-Display Camera via Dictionary Guided\n Transformer","summary":" By hiding the front-facing camera below the display panel, Under-Display\nCamera (UDC) provides users with a full-screen experience. However, due to the\ncharacteristics of the display, images taken by UDC suffer from significant\nquality degradation. Methods have been proposed to tackle UDC image restoration\nand advances have been achieved. There are still no specialized methods and\ndatasets for restoring UDC face images, which may be the most common problem in\nthe UDC scene. 
To this end, considering color filtering, brightness\nattenuation, and diffraction in the imaging process of UDC, we propose a\ntwo-stage network UDC Degradation Model Network named UDC-DMNet to synthesize\nUDC images by modeling the processes of UDC imaging. Then we use UDC-DMNet and\nhigh-quality face images from FFHQ and CelebA-Test to create UDC face training\ndatasets FFHQ-P/T and testing datasets CelebA-Test-P/T for UDC face\nrestoration. We propose a novel dictionary-guided transformer network named\nDGFormer. Introducing the facial component dictionary and the characteristics\nof the UDC image in the restoration makes DGFormer capable of addressing blind\nface restoration in UDC scenarios. Experiments show that our DGFormer and\nUDC-DMNet achieve state-of-the-art performance.\n","authors":["Jingfan Tan","Xiaoxu Chen","Tao Wang","Kaihao Zhang","Wenhan Luo","Xiaocun Cao"],"pdf_url":"https://arxiv.org/pdf/2308.10196v2.pdf","comment":"To appear in IEEE TCSVT"},{"id":"http://arxiv.org/abs/2212.04705v2","updated":"2023-12-01T17:37:52Z","published":"2022-12-09T07:33:49Z","title":"Physics-based Indirect Illumination for Inverse Rendering","summary":" We present a physics-based inverse rendering method that learns the\nillumination, geometry, and materials of a scene from posed multi-view RGB\nimages. To model the illumination of a scene, existing inverse rendering works\neither completely ignore the indirect illumination or model it by coarse\napproximations, leading to sub-optimal illumination, geometry, and material\nprediction of the scene. In this work, we propose a physics-based illumination\nmodel that first locates surface points through an efficient refined sphere\ntracing algorithm, then explicitly traces the incoming indirect lights at each\nsurface point based on reflection. Then, we estimate each identified indirect\nlight through an efficient neural network. Moreover, we utilize the Leibniz's\nintegral rule to resolve non-differentiability in the proposed illumination\nmodel caused by boundary lights inspired by differentiable irradiance in\ncomputer graphics. As a result, the proposed differentiable illumination model\ncan be learned end-to-end together with geometry and materials estimation. As a\nside product, our physics-based inverse rendering model also facilitates\nflexible and realistic material editing as well as relighting. Extensive\nexperiments on synthetic and real-world datasets demonstrate that the proposed\nmethod performs favorably against existing inverse rendering methods on novel\nview synthesis and inverse rendering.\n","authors":["Youming Deng","Xueting Li","Sifei Liu","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2212.04705v2.pdf","comment":"3DV 2024; Project Page: https://denghilbert.github.io/pii"},{"id":"http://arxiv.org/abs/2312.00739v1","updated":"2023-12-01T17:20:47Z","published":"2023-12-01T17:20:47Z","title":"Adversarial Score Distillation: When score distillation meets GAN","summary":" Existing score distillation methods are sensitive to classifier-free guidance\n(CFG) scale: manifested as over-smoothness or instability at small CFG scales,\nwhile over-saturation at large ones. To explain and analyze these issues, we\nrevisit the derivation of Score Distillation Sampling (SDS) and decipher\nexisting score distillation with the Wasserstein Generative Adversarial Network\n(WGAN) paradigm. 
With the WGAN paradigm, we find that existing score\ndistillation either employs a fixed sub-optimal discriminator or conducts\nincomplete discriminator optimization, resulting in the scale-sensitive issue.\nWe propose the Adversarial Score Distillation (ASD), which maintains an\noptimizable discriminator and updates it using the complete optimization\nobjective. Experiments show that the proposed ASD performs favorably in 2D\ndistillation and text-to-3D tasks against existing methods. Furthermore, to\nexplore the generalization ability of our WGAN paradigm, we extend ASD to the\nimage editing task, which achieves competitive results. The project page and\ncode are at https://github.com/2y7c3/ASD.\n","authors":["Min Wei","Jingkai Zhou","Junyao Sun","Xuesong Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.00739v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00732v1","updated":"2023-12-01T17:09:31Z","published":"2023-12-01T17:09:31Z","title":"Gaussian Grouping: Segment and Edit Anything in 3D Scenes","summary":" The recent Gaussian Splatting achieves high-quality and real-time novel-view\nsynthesis of the 3D scenes. However, it is solely concentrated on the\nappearance and geometry modeling, while lacking in fine-grained object-level\nscene understanding. To address this issue, we propose Gaussian Grouping, which\nextends Gaussian Splatting to jointly reconstruct and segment anything in\nopen-world 3D scenes. We augment each Gaussian with a compact Identity\nEncoding, allowing the Gaussians to be grouped according to their object\ninstance or stuff membership in the 3D scene. Instead of resorting to expensive\n3D labels, we supervise the Identity Encodings during the differentiable\nrendering by leveraging the 2D mask predictions by SAM, along with introduced\n3D spatial consistency regularization. Comparing to the implicit NeRF\nrepresentation, we show that the discrete and grouped 3D Gaussians can\nreconstruct, segment and edit anything in 3D with high visual quality, fine\ngranularity and efficiency. Based on Gaussian Grouping, we further propose a\nlocal Gaussian Editing scheme, which shows efficacy in versatile scene editing\napplications, including 3D object removal, inpainting, colorization and scene\nrecomposition. Our code and models will be at\nhttps://github.com/lkeab/gaussian-grouping.\n","authors":["Mingqiao Ye","Martin Danelljan","Fisher Yu","Lei Ke"],"pdf_url":"https://arxiv.org/pdf/2312.00732v1.pdf","comment":"We propose Gaussian Grouping, which extends Gaussian Splatting to\n fine-grained open-world 3D scene understanding. Github:\n https://github.com/lkeab/gaussian-grouping"},{"id":"http://arxiv.org/abs/2312.00703v1","updated":"2023-12-01T16:38:14Z","published":"2023-12-01T16:38:14Z","title":"PointBeV: A Sparse Approach to BeV Predictions","summary":" Bird's-eye View (BeV) representations have emerged as the de-facto shared\nspace in driving applications, offering a unified space for sensor data fusion\nand supporting various downstream tasks. However, conventional models use grids\nwith fixed resolution and range and face computational inefficiencies due to\nthe uniform allocation of resources across all cells. To address this, we\npropose PointBeV, a novel sparse BeV segmentation model operating on sparse BeV\ncells instead of dense grids. This approach offers precise control over memory\nusage, enabling the use of long temporal contexts and accommodating\nmemory-constrained platforms. 
PointBeV employs an efficient two-pass strategy\nfor training, enabling focused computation on regions of interest. At inference\ntime, it can be used with various memory/performance trade-offs and flexibly\nadjusts to new specific use cases. PointBeV achieves state-of-the-art results\non the nuScenes dataset for vehicle, pedestrian, and lane segmentation,\nshowcasing superior performance in static and temporal settings despite being\ntrained solely with sparse signals. We will release our code along with two new\nefficient modules used in the architecture: Sparse Feature Pulling, designed\nfor the effective extraction of features from images to BeV, and Submanifold\nAttention, which enables efficient temporal modeling. Our code is available at\nhttps://github.com/valeoai/PointBeV.\n","authors":["Loick Chambon","Eloi Zablocki","Mickael Chen","Florent Bartoccioni","Patrick Perez","Matthieu Cord"],"pdf_url":"https://arxiv.org/pdf/2312.00703v1.pdf","comment":"https://github.com/valeoai/PointBeV"},{"id":"http://arxiv.org/abs/2312.00700v1","updated":"2023-12-01T16:33:57Z","published":"2023-12-01T16:33:57Z","title":"GIFT: Generative Interpretable Fine-Tuning Transformers","summary":" We present GIFT (Generative Interpretable Fine-tuning Transformers) for\nfine-tuning pretrained (often large) Transformer models at downstream tasks in\na parameter-efficient way with built-in interpretability. Our GIFT is a deep\nparameter-residual learning method, which addresses two problems in fine-tuning\na pretrained Transformer model: Where to apply the parameter-efficient\nfine-tuning (PEFT) to be extremely lightweight yet sufficiently expressive, and\nHow to learn the PEFT to better exploit the knowledge of the pretrained model\nin a direct way? For the former, we select the final projection (linear) layer\nin the multi-head self-attention of a Transformer model, and verify its\neffectiveness. For the latter, in contrast to the prior art that directly\nintroduce new model parameters (often in low-rank approximation form) to be\nlearned in fine-tuning with downstream data, we propose a method for learning\nto generate the fine-tuning parameters. Our GIFT is a hyper-Transformer which\ntake as input the pretrained parameters of the projection layer to generate its\nfine-tuning parameters using a proposed Parameter-to-Cluster Attention (PaCa).\nThe PaCa results in a simple clustering-based forward explainer that plays the\nrole of semantic segmentation in testing. In experiments, our proposed GIFT is\ntested on the VTAB benchmark and the fine-grained visual classification (FGVC)\nbenchmark. It obtains significantly better performance than the prior art. Our\ncode is available at https://github.com/savadikarc/gift\n","authors":["Chinmay Savadikar","Xi Song","Tianfu Wu"],"pdf_url":"https://arxiv.org/pdf/2312.00700v1.pdf","comment":"18 pages, 12 figures"},{"id":"http://arxiv.org/abs/2312.00699v1","updated":"2023-12-01T16:31:17Z","published":"2023-12-01T16:31:17Z","title":"Rethinking Detection Based Table Structure Recognition for Visually Rich\n Documents","summary":" Table Structure Recognition (TSR) aims at transforming unstructured table\nimages into structured formats, such as HTML sequences. One type of popular\nsolution is using detection models to detect components of a table, such as\ncolumns and rows, then applying a rule-based post-processing method to convert\ndetection results into HTML sequences. However, existing detection-based\nstudies often have the following limitations. 
First, these studies usually pay\nmore attention to improving the detection performance, which does not\nnecessarily lead to better performance regarding cell-level metrics, such as\nTEDS. Second, some solutions over-simplify the problem and can miss some\ncritical information. Lastly, even though some studies defined the problem to\ndetect more components to provide as much information as other types of\nsolutions, these studies ignore the fact that this problem definition is a\nmulti-label detection problem, because row, projected row header and column header can\nshare identical bounding boxes. Besides, there is often a performance gap\nbetween two-stage and transformer-based detection models regarding the\nstructure-only TEDS, even though they have similar performance regarding the\nCOCO metrics. Therefore, we revisit the limitations of existing detection-based\nsolutions, compare two-stage and transformer-based detection models, and\nidentify the key design aspects for the success of a two-stage detection model\nfor the TSR task, including the multi-class problem definition, the aspect\nratio for anchor box generation, and the feature generation of the backbone\nnetwork. We applied simple methods to improve these aspects of the Cascade\nR-CNN model, achieved state-of-the-art performance, and improved the baseline\nCascade R-CNN model by 19.32%, 11.56% and 14.77% regarding the structure-only\nTEDS on SciTSR, FinTabNet, and PubTables1M datasets.\n","authors":["Bin Xiao","Murat Simsek","Burak Kantarci","Ala Abu Alkheir"],"pdf_url":"https://arxiv.org/pdf/2312.00699v1.pdf","comment":"submitted to Pattern Recognition"},{"id":"http://arxiv.org/abs/2312.00694v1","updated":"2023-12-01T16:27:48Z","published":"2023-12-01T16:27:48Z","title":"Object Detector Differences when using Synthetic and Real Training Data","summary":" To train well-performing generalizing neural networks, sufficiently large and\ndiverse datasets are needed. Collecting data while adhering to privacy\nlegislation becomes increasingly difficult and annotating these large datasets\nis both a resource-heavy and time-consuming task. An approach to overcome these\ndifficulties is to use synthetic data since it is inherently scalable and can\nbe automatically annotated. However, how training on synthetic data affects the\nlayers of a neural network is still unclear. In this paper, we train the YOLOv3\nobject detector on real and synthetic images from city environments. We perform\na similarity analysis using Centered Kernel Alignment (CKA) to explore the\neffects of training on synthetic data on a layer-wise basis. The analysis\ncaptures the architecture of the detector while showing both different and\nsimilar patterns between different models. With this similarity analysis, we\nwant to give insights into how training on synthetic data affects each layer and to\ngive a better understanding of the inner workings of complex neural networks.\nThe results show that the largest similarity between a detector trained on real\ndata and a detector trained on synthetic data was in the early layers, and the\nlargest difference was in the head part. The results also show that no major\ndifference in performance or similarity could be seen between frozen and\nunfrozen backbone.\n","authors":["Martin Georg Ljungqvist","Otto Nordander","Markus Skans","Arvid Mildner","Tony Liu","Pierre Nugues"],"pdf_url":"https://arxiv.org/pdf/2312.00694v1.pdf","comment":"27 pages. 
The Version of Record of this article is published in\n Springer Nature Computer Science 2023, and is available online at\n https://doi.org/10.1007/s42979-023-01704-5"},{"id":"http://arxiv.org/abs/2312.00692v1","updated":"2023-12-01T16:18:55Z","published":"2023-12-01T16:18:55Z","title":"VisionaryVR: An Optical Simulation Tool for Evaluating and Optimizing\n Vision Correction Solutions in Virtual Reality","summary":" Developing and evaluating vision science methods require robust and efficient\ntools for assessing their performance in various real-world scenarios. This\nstudy presents a novel virtual reality (VR) simulation tool that simulates\nreal-world optical methods while giving high experimental control to the\nexperiment. The tool incorporates an experiment controller, to smoothly and\neasily handle multiple conditions, a generic eye-tracking controller, that\nworks with most common VR eye-trackers, a configurable defocus simulator, and a\ngeneric VR questionnaire loader to assess participants' behavior in virtual\nreality. This VR-based simulation tool bridges the gap between theoretical and\napplied research on new optical methods, corrections, and therapies. It enables\nvision scientists to increase their research tools with a robust, realistic,\nand fast research environment.\n","authors":["Benedikt W. Hosp","Martin Dechant","Yannick Sauer","Rajat Agarwala","Siegfried Wahl"],"pdf_url":"https://arxiv.org/pdf/2312.00692v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00690v1","updated":"2023-12-01T16:17:16Z","published":"2023-12-01T16:17:16Z","title":"Open-vocabulary object 6D pose estimation","summary":" We introduce the new setting of open-vocabulary object 6D pose estimation, in\nwhich a textual prompt is used to specify the object of interest. In contrast\nto existing approaches, in our setting (i) the object of interest is specified\nsolely through the textual prompt, (ii) no object model (e.g. CAD or video\nsequence) is required at inference, (iii) the object is imaged from two\ndifferent viewpoints of two different scenes, and (iv) the object was not\nobserved during the training phase. To operate in this setting, we introduce a\nnovel approach that leverages a Vision-Language Model to segment the object of\ninterest from two distinct scenes and to estimate its relative 6D pose. The key\nof our approach is a carefully devised strategy to fuse object-level\ninformation provided by the prompt with local image features, resulting in a\nfeature space that can generalize to novel concepts. We validate our approach\non a new benchmark based on two popular datasets, REAL275 and Toyota-Light,\nwhich collectively encompass 39 object instances appearing in four thousand\nimage pairs. The results demonstrate that our approach outperforms both a\nwell-established hand-crafted method and a recent deep learning-based baseline\nin estimating the relative 6D pose of objects in different scenes. Project\nwebsite: https://jcorsetti.github.io/oryon-website/.\n","authors":["Jaime Corsetti","Davide Boscaini","Changjae Oh","Andrea Cavallaro","Fabio Poiesi"],"pdf_url":"https://arxiv.org/pdf/2312.00690v1.pdf","comment":"Technical report. 
21 pages, 15 figures, 6 tables"},{"id":"http://arxiv.org/abs/2312.00689v1","updated":"2023-12-01T16:16:46Z","published":"2023-12-01T16:16:46Z","title":"Infrared Image Super-Resolution via GAN","summary":" The ability of generative models to accurately fit data distributions has\nresulted in their widespread adoption and success in fields such as computer\nvision and natural language processing. In this chapter, we provide a brief\noverview of the application of generative models in the domain of infrared (IR)\nimage super-resolution, including a discussion of the various challenges and\nadversarial training methods employed. We propose potential areas for further\ninvestigation and advancement in the application of generative models for IR\nimage super-resolution.\n","authors":["Yongsong Huang","Shinichiro Omachi"],"pdf_url":"https://arxiv.org/pdf/2312.00689v1.pdf","comment":"Applications of Generative AI, Chapter 28"},{"id":"http://arxiv.org/abs/2312.00677v1","updated":"2023-12-01T16:00:16Z","published":"2023-12-01T16:00:16Z","title":"Unsupervised Adaptive Implicit Neural Representation Learning for\n Scan-Specific MRI Reconstruction","summary":" In recent studies on MRI reconstruction, advances have shown significant\npromise for further accelerating the MRI acquisition. Most state-of-the-art\nmethods require a large amount of fully-sampled data to optimise reconstruction\nmodels, which is impractical and expensive under certain clinical settings. On\nthe other hand, for unsupervised scan-specific reconstruction methods,\noverfitting is likely to happen due to insufficient supervision, while\nrestrictions on acceleration rates and under-sampling patterns further limit\ntheir applicability. To this end, we propose an unsupervised, adaptive\ncoarse-to-fine framework that enhances reconstruction quality without being\nconstrained by the sparsity levels or patterns in under-sampling. The framework\nemploys an implicit neural representation for scan-specific MRI reconstruction,\nlearning a mapping from multi-dimensional coordinates to their corresponding\nsignal intensities. Moreover, we integrate a novel learning strategy that\nprogressively refines the use of acquired k-space signals for self-supervision.\nThis approach effectively adjusts the proportion of supervising signals from\nunevenly distributed information across different frequency bands, thus\nmitigating the issue of overfitting while improving the overall reconstruction.\nComprehensive evaluation on a public dataset, including both 2D and 3D data,\nhas shown that our method outperforms current state-of-the-art scan-specific\nMRI reconstruction techniques, for up to 8-fold under-sampling.\n","authors":["Junwei Yang","Pietro Liò"],"pdf_url":"https://arxiv.org/pdf/2312.00677v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.07618v3","updated":"2023-12-01T16:00:14Z","published":"2023-05-12T17:17:01Z","title":"Uncertainty Estimation and Out-of-Distribution Detection for Deep\n Learning-Based Image Reconstruction using the Local Lipschitz","summary":" Accurate image reconstruction is at the heart of diagnostics in medical\nimaging. Supervised deep learning-based approaches have been investigated for\nsolving inverse problems including image reconstruction. However, these trained\nmodels encounter unseen data distributions that are widely shifted from\ntraining data during deployment. Therefore, it is essential to assess whether a\ngiven input falls within the training data distribution for diagnostic\npurposes. 
Uncertainty estimation approaches exist but focus on providing an\nuncertainty map to radiologists, rather than assessing the training\ndistribution fit. In this work, we propose a method based on the local\nLipschitz-based metric to distinguish out-of-distribution images from\nin-distribution with an area under the curve of 99.94%. Empirically, we\ndemonstrate a very strong relationship between the local Lipschitz value and\nmean absolute error (MAE), supported by a high Spearman's rank correlation\ncoefficient of 0.8475, which determines the uncertainty estimation threshold\nfor optimal model performance. Through the identification of false positives,\nthe local Lipschitz and MAE relationship was used to guide data augmentation\nand reduce model uncertainty. Our study was validated using the AUTOMAP\narchitecture for sensor-to-image Magnetic Resonance Imaging (MRI)\nreconstruction. We compare our proposed approach with baseline methods:\nMonte-Carlo dropout and deep ensembles, and further analysis included MRI\ndenoising and Computed Tomography (CT) sparse-to-full view reconstruction using\nUNET architectures. We show that our approach is applicable to various\narchitectures and learned functions, especially in the realm of medical image\nreconstruction, where preserving the diagnostic accuracy of reconstructed\nimages remains paramount.\n","authors":["Danyal F. Bhutto","Bo Zhu","Jeremiah Z. Liu","Neha Koonjoo","Hongwei B. Li","Bruce R. Rosen","Matthew S. Rosen"],"pdf_url":"https://arxiv.org/pdf/2305.07618v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00674v1","updated":"2023-12-01T15:54:55Z","published":"2023-12-01T15:54:55Z","title":"LightCLIP: Learning Multi-Level Interaction for Lightweight\n Vision-Language Models","summary":" Vision-language pre-training like CLIP has shown promising performance on\nvarious downstream tasks such as zero-shot image classification and image-text\nretrieval. Most of the existing CLIP-alike works usually adopt relatively large\nimage encoders like ResNet50 and ViT, while the lightweight counterparts are\nrarely discussed. In this paper, we propose a multi-level interaction paradigm\nfor training lightweight CLIP models. Firstly, to mitigate the problem that\nsome image-text pairs are not strictly one-to-one correspondence, we improve\nthe conventional global instance-level alignment objective by softening the\nlabel of negative samples progressively. Secondly, a relaxed bipartite matching\nbased token-level alignment objective is introduced for finer-grained alignment\nbetween image patches and textual words. Moreover, based on the observation\nthat the accuracy of CLIP model does not increase correspondingly as the\nparameters of text encoder increase, an extra objective of masked language\nmodeling (MLM) is leveraged for maximizing the potential of the shortened text\nencoder. In practice, an auxiliary fusion module injecting unmasked image\nembedding into masked text embedding at different network stages is proposed\nfor enhancing the MLM. 
Extensive experiments show that without introducing\nadditional computational cost during inference, the proposed method achieves a\nhigher performance on multiple downstream tasks.\n","authors":["Ying Nie","Wei He","Kai Han","Yehui Tang","Tianyu Guo","Fanyi Du","Yunhe Wang"],"pdf_url":"https://arxiv.org/pdf/2312.00674v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.06388v4","updated":"2023-12-01T15:52:32Z","published":"2023-03-11T11:42:01Z","title":"Generalized 3D Self-supervised Learning Framework via Prompted\n Foreground-Aware Feature Contrast","summary":" Contrastive learning has recently demonstrated great potential for\nunsupervised pre-training in 3D scene understanding tasks. However, most\nexisting work randomly selects point features as anchors while building\ncontrast, leading to a clear bias toward background points that often dominate\nin 3D scenes. Also, object awareness and foreground-to-background\ndiscrimination are neglected, making contrastive learning less effective. To\ntackle these issues, we propose a general foreground-aware feature contrast\nFAC++ framework to learn more effective point cloud representations in\npre-training. FAC++ consists of two novel contrast designs to construct more\neffective and informative contrast pairs. The first is building positive pairs\nwithin the same foreground segment where points tend to have the same\nsemantics. The second is that we prevent over-discrimination between 3D\nsegments/objects and encourage grouped foreground-to-background distinctions at\nthe segment level with adaptive feature learning in a Siamese correspondence\nnetwork, which adaptively learns feature correlations within and across point\ncloud views effectively. Moreover, we have designed the foreground-prompted\nregional sampling to enhance more balanced foreground-aware learning, which is\ntermed FAC++. Visualization with point activation maps shows that our contrast\npairs capture clear correspondences among foreground regions during\npre-training. Quantitative experiments also show that FAC++ achieves superior\nknowledge transfer and data efficiency in various downstream 3D semantic\nsegmentation, instance segmentation as well as object detection tasks. All\ncodes, data, and models are available at:\nhttps://github.com/KangchengLiu/FAC_Foreground_Aware_Contrast\n","authors":["Kangcheng Liu","Xinhu Zheng","Chaoqun Wang","Kai Tang","Ming Liu","Baoquan Chen"],"pdf_url":"https://arxiv.org/pdf/2303.06388v4.pdf","comment":"International Journal of Computer Vision, Manuscript Info: 28 Pages,\n 14 Figures, and 11 Tables"},{"id":"http://arxiv.org/abs/2307.00309v2","updated":"2023-12-01T15:51:55Z","published":"2023-07-01T11:46:36Z","title":"Adversarial Attacks and Defenses on 3D Point Cloud Classification: A\n Survey","summary":" Deep learning has successfully solved a wide range of tasks in 2D vision as a\ndominant AI technique. Recently, deep learning on 3D point clouds is becoming\nincreasingly popular for addressing various tasks in this field. Despite\nremarkable achievements, deep learning algorithms are vulnerable to adversarial\nattacks. These attacks are imperceptible to the human eye but can easily fool\ndeep neural networks in the testing and deployment stage. 
To encourage future\nresearch, this survey summarizes the current progress on adversarial attack and\ndefense techniques on point cloud classification. This paper first introduces\nthe principles and characteristics of adversarial attacks and summarizes and\nanalyzes adversarial example generation methods in recent years. Additionally,\nit provides an overview of defense strategies, organized into data-focused and\nmodel-focused methods. Finally, it presents several current challenges and\npotential future research directions in this domain.\n","authors":["Hanieh Naderi","Ivan V. Bajić"],"pdf_url":"https://arxiv.org/pdf/2307.00309v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00671v1","updated":"2023-12-01T15:50:20Z","published":"2023-12-01T15:50:20Z","title":"CellMixer: Annotation-free Semantic Cell Segmentation of Heterogeneous\n Cell Populations","summary":" In recent years, several unsupervised cell segmentation methods have been\npresented, trying to omit the requirement of laborious pixel-level annotations\nfor the training of a cell segmentation model. Most if not all of these methods\nhandle the instance segmentation task by focusing on the detection of different\ncell instances ignoring their type. While such models prove adequate for\ncertain tasks, like cell counting, other applications require the\nidentification of each cell's type. In this paper, we present CellMixer, an\ninnovative annotation-free approach for the semantic segmentation of\nheterogeneous cell populations. Our augmentation-based method enables the\ntraining of a segmentation model from image-level labels of homogeneous cell\npopulations. Our results show that CellMixer can achieve competitive\nsegmentation performance across multiple cell types and imaging modalities,\ndemonstrating the method's scalability and potential for broader applications\nin medical imaging, cellular biology, and diagnostics.\n","authors":["Mehdi Naouar","Gabriel Kalweit","Anusha Klett","Yannick Vogt","Paula Silvestrini","Diana Laura Infante Ramirez","Roland Mertelsmann","Joschka Boedecker","Maria Kalweit"],"pdf_url":"https://arxiv.org/pdf/2312.00671v1.pdf","comment":"Medical Imaging Meets NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.00663v1","updated":"2023-12-01T15:47:04Z","published":"2023-12-01T15:47:04Z","title":"Generalized Label-Efficient 3D Scene Parsing via Hierarchical Feature\n Aligned Pre-Training and Region-Aware Fine-tuning","summary":" Deep neural network models have achieved remarkable progress in 3D scene\nunderstanding while trained in the closed-set setting and with full labels.\nHowever, the major bottleneck for current 3D recognition approaches is that\nthey do not have the capacity to recognize any unseen novel classes beyond the\ntraining categories in diverse kinds of real-world applications. In the\nmeantime, current state-of-the-art 3D scene understanding approaches primarily\nrequire high-quality labels to train neural networks, which merely perform well\nin a fully supervised manner. This work presents a generalized and simple\nframework for dealing with 3D scene understanding when the labeled scenes are\nquite limited. To extract knowledge for novel categories from the pre-trained\nvision-language models, we propose a hierarchical feature-aligned pre-training\nand knowledge distillation strategy to extract and distill meaningful\ninformation from large-scale vision-language models, which helps benefit the\nopen-vocabulary scene understanding tasks. 
To leverage the boundary\ninformation, we propose a novel energy-based loss with boundary awareness\nbenefiting from the region-level boundary predictions. To encourage latent\ninstance discrimination and to guarantee efficiency, we propose the\nunsupervised region-level semantic contrastive learning scheme for point\nclouds, using confident predictions of the neural network to discriminate the\nintermediate feature embeddings at multiple stages. Extensive experiments with\nboth indoor and outdoor scenes demonstrated the effectiveness of our approach\nin both data-efficient learning and open-world few-shot learning. All codes,\nmodels, and data are made publicly available at:\nhttps://drive.google.com/drive/folders/1M58V-PtR8DBEwD296zJkNg_m2qq-MTAP?usp=sharing.\n","authors":["Kangcheng Liu","Yong-Jin Liu","Kai Tang","Ming Liu","Baoquan Chen"],"pdf_url":"https://arxiv.org/pdf/2312.00663v1.pdf","comment":"IEEE Transactions on Pattern Analysis and Machine Intelligence,\n Manuscript Info: 22 Pages, 16 Figures, and 8 Tables"},{"id":"http://arxiv.org/abs/2311.14049v3","updated":"2023-12-01T15:44:18Z","published":"2023-11-23T15:05:12Z","title":"Assessment of Deep Learning Segmentation for Real-Time Free-Breathing\n Cardiac Magnetic Resonance Imaging","summary":" In recent years, a variety of deep learning networks for cardiac MRI (CMR)\nsegmentation have been developed and analyzed. However, nearly all of them are\nfocused on cine CMR under breathold. In this work, accuracy of deep learning\nmethods is assessed for volumetric analysis (via segmentation) of the left\nventricle in real-time free-breathing CMR at rest and under exercise stress.\nData from healthy volunteers (n=15) for cine and real-time free-breathing CMR\nwere analyzed retrospectively. Segmentations of a commercial software (comDL)\nand a freely available neural network (nnU-Net), were compared to a reference\ncreated via the manual correction of comDL segmentation. Segmentation of left\nventricular endocardium (LV), left ventricular myocardium (MYO), and right\nventricle (RV) is evaluated for both end-systolic and end-diastolic phases and\nanalyzed with Dice's coefficient (DC). The volumetric analysis includes LV\nend-diastolic volume (EDV), LV end-systolic volume (ESV), and LV ejection\nfraction (EF). For cine CMR, nnU-Net and comDL achieve a DC above 0.95 for LV\nand 0.9 for MYO, and RV. For real-time CMR, the accuracy of nnU-Net exceeds\nthat of comDL overall. For real-time CMR at rest, nnU-Net achieves a DC of 0.94\nfor LV, 0.89 for MYO, and 0.90 for RV; mean absolute differences between\nnnU-Net and reference are 2.9mL for EDV, 3.5mL for ESV and 2.6% for EF. For\nreal-time CMR under exercise stress, nnU-Net achieves a DC of 0.92 for LV, 0.85\nfor MYO, and 0.83 for RV; mean absolute differences between nnU-Net and\nreference are 11.4mL for EDV, 2.9mL for ESV and 3.6% for EF. Deep learning\nmethods designed or trained for cine CMR segmentation can perform well on\nreal-time CMR. 
For real-time free-breathing CMR at rest, the performance of\ndeep learning methods is comparable to inter-observer variability in cine CMR\nand is usable for fully automatic segmentation.\n","authors":["Martin Schilling","Christina Unterberg-Buchwald","Joachim Lotz","Martin Uecker"],"pdf_url":"https://arxiv.org/pdf/2311.14049v3.pdf","comment":"Martin Schilling and Christina Unterberg-Buchwald contributed equally\n to this work"},{"id":"http://arxiv.org/abs/2312.00661v1","updated":"2023-12-01T15:40:26Z","published":"2023-12-01T15:40:26Z","title":"Dual-Domain Multi-Contrast MRI Reconstruction with Synthesis-based\n Fusion Network","summary":" Purpose: To develop an efficient dual-domain reconstruction framework for\nmulti-contrast MRI, with the focus on minimising cross-contrast misalignment in\nboth the image and the frequency domains to enhance optimisation. Theory and\nMethods: Our proposed framework, based on deep learning, facilitates the\noptimisation for under-sampled target contrast using fully-sampled reference\ncontrast that is quicker to acquire. The method consists of three key steps: 1)\nLearning to synthesise data resembling the target contrast from the reference\ncontrast; 2) Registering the multi-contrast data to reduce inter-scan motion;\nand 3) Utilising the registered data for reconstructing the target contrast.\nThese steps involve learning in both domains with regularisation applied to\nensure their consistency. We also compare the reconstruction performance with\nexisting deep learning-based methods using a dataset of brain MRI scans.\nResults: Extensive experiments demonstrate the superiority of our proposed\nframework, for up to an 8-fold acceleration rate, compared to state-of-the-art\nalgorithms. Comprehensive analysis and ablation studies further present the\neffectiveness of the proposed components. Conclusion: Our dual-domain framework\noffers a promising approach to multi-contrast MRI reconstruction. It can also\nbe integrated with existing methods to further enhance the reconstruction.\n","authors":["Junwei Yang","Pietro Liò"],"pdf_url":"https://arxiv.org/pdf/2312.00661v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18765v2","updated":"2023-12-01T15:38:31Z","published":"2023-11-30T18:05:52Z","title":"MLLMs-Augmented Visual-Language Representation Learning","summary":" Visual-language pre-training (VLP) has achieved remarkable success in\nmulti-modal tasks, largely attributed to the availability of large-scale\nimage-text datasets. In this work, we demonstrate that multi-modal large\nlanguage models (MLLMs) can enhance visual-language representation learning by\nimproving data quality. Our approach is simple, utilizing MLLMs to extend\nmultiple captions for each image. To prevent the bias introduced by MLLMs'\nhallucinations and intrinsic caption styles, we propose \"text shearing\" to\nmaintain the same length for extended captions as that of the original\ncaptions. In image-text retrieval, our method consistently obtains 5.6 ~ 35.0%\nand 16.8 ~ 46.1% improvement on R@1 under the fine-tuning and zero-shot\nsettings, respectively. 
Notably, we obtain zero-shot results that are\ncomparable to fine-tuning on target datasets, which encourages more exploration\nof the versatile use of MLLMs.\n","authors":["Yanqing Liu","Kai Wang","Wenqi Shao","Ping Luo","Yu Qiao","Mike Zheng Shou","Kaipeng Zhang","Yang You"],"pdf_url":"https://arxiv.org/pdf/2311.18765v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00651v1","updated":"2023-12-01T15:24:38Z","published":"2023-12-01T15:24:38Z","title":"TrackDiffusion: Multi-object Tracking Data Generation via Diffusion\n Models","summary":" Diffusion models have gained prominence in generating data for perception\ntasks such as image classification and object detection. However, the potential\nin generating high-quality tracking sequences, a crucial aspect in the field of\nvideo perception, has not been fully investigated. To address this gap, we\npropose TrackDiffusion, a novel architecture designed to generate continuous\nvideo sequences from the tracklets. TrackDiffusion represents a significant\ndeparture from the traditional layout-to-image (L2I) generation and copy-paste\nsynthesis focusing on static image elements like bounding boxes by empowering\nimage diffusion models to encompass dynamic and continuous tracking\ntrajectories, thereby capturing complex motion nuances and ensuring instance\nconsistency among video frames. For the first time, we demonstrate that the\ngenerated video sequences can be utilized for training multi-object tracking\n(MOT) systems, leading to significant improvement in tracker performance.\nExperimental results show that our model significantly enhances instance\nconsistency in generated video sequences, leading to improved perceptual\nmetrics. Our approach achieves an improvement of 8.7 in TrackAP and 11.8 in\nTrackAP$_{50}$ on the YTVIS dataset, underscoring its potential to redefine the\nstandards of video data generation for MOT tasks and beyond.\n","authors":["Pengxiang Li","Zhili Liu","Kai Chen","Lanqing Hong","Yunzhi Zhuge","Dit-Yan Yeung","Huchuan Lu","Xu Jia"],"pdf_url":"https://arxiv.org/pdf/2312.00651v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00648v1","updated":"2023-12-01T15:20:58Z","published":"2023-12-01T15:20:58Z","title":"SPOT: Self-Training with Patch-Order Permutation for Object-Centric\n Learning with Autoregressive Transformers","summary":" Unsupervised object-centric learning aims to decompose scenes into\ninterpretable object entities, termed slots. Slot-based auto-encoders stand out\nas a prominent method for this task. Within them, crucial aspects include\nguiding the encoder to generate object-specific slots and ensuring the decoder\nutilizes them during reconstruction. This work introduces two novel techniques,\n(i) an attention-based self-training approach, which distills superior\nslot-based attention masks from the decoder to the encoder, enhancing object\nsegmentation, and (ii) an innovative patch-order permutation strategy for\nautoregressive transformers that strengthens the role of slot vectors in\nreconstruction. The effectiveness of these strategies is showcased\nexperimentally. The combined approach significantly surpasses prior slot-based\nautoencoder methods in unsupervised object segmentation, especially with\ncomplex real-world images. 
We provide the implementation code at\nhttps://github.com/gkakogeorgiou/spot .\n","authors":["Ioannis Kakogeorgiou","Spyros Gidaris","Konstantinos Karantzalos","Nikos Komodakis"],"pdf_url":"https://arxiv.org/pdf/2312.00648v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.13080v2","updated":"2023-12-01T15:18:18Z","published":"2023-02-25T13:58:37Z","title":"Does a Neural Network Really Encode Symbolic Concepts?","summary":" Recently, a series of studies have tried to extract interactions between\ninput variables modeled by a DNN and define such interactions as concepts\nencoded by the DNN. However, strictly speaking, there still lacks a solid\nguarantee whether such interactions indeed represent meaningful concepts.\nTherefore, in this paper, we examine the trustworthiness of interaction\nconcepts from four perspectives. Extensive empirical studies have verified that\na well-trained DNN usually encodes sparse, transferable, and discriminative\nconcepts, which is partially aligned with human intuition.\n","authors":["Mingjie Li","Quanshi Zhang"],"pdf_url":"https://arxiv.org/pdf/2302.13080v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2203.13278v4","updated":"2023-12-01T15:17:38Z","published":"2022-03-24T18:11:31Z","title":"Practical Blind Image Denoising via Swin-Conv-UNet and Data Synthesis","summary":" While recent years have witnessed a dramatic upsurge of exploiting deep\nneural networks toward solving image denoising, existing methods mostly rely on\nsimple noise assumptions, such as additive white Gaussian noise (AWGN), JPEG\ncompression noise and camera sensor noise, and a general-purpose blind\ndenoising method for real images remains unsolved. In this paper, we attempt to\nsolve this problem from the perspective of network architecture design and\ntraining data synthesis. Specifically, for the network architecture design, we\npropose a swin-conv block to incorporate the local modeling ability of residual\nconvolutional layer and non-local modeling ability of swin transformer block,\nand then plug it as the main building block into the widely-used image-to-image\ntranslation UNet architecture. For the training data synthesis, we design a\npractical noise degradation model which takes into consideration different\nkinds of noise (including Gaussian, Poisson, speckle, JPEG compression, and\nprocessed camera sensor noises) and resizing, and also involves a random\nshuffle strategy and a double degradation strategy. Extensive experiments on\nAGWN removal and real image denoising demonstrate that the new network\narchitecture design achieves state-of-the-art performance and the new\ndegradation model can help to significantly improve the practicability. We\nbelieve our work can provide useful insights into current denoising research.\n","authors":["Kai Zhang","Yawei Li","Jingyun Liang","Jiezhang Cao","Yulun Zhang","Hao Tang","Deng-Ping Fan","Radu Timofte","Luc Van Gool"],"pdf_url":"https://arxiv.org/pdf/2203.13278v4.pdf","comment":"Codes: https://github.com/cszn/SCUNet"},{"id":"http://arxiv.org/abs/2311.09680v4","updated":"2023-12-01T15:12:06Z","published":"2023-11-16T08:49:46Z","title":"Trustworthy Large Models in Vision: A Survey","summary":" The rapid progress of Large Models (LMs) has recently revolutionized various\nfields of deep learning with remarkable grades, ranging from Natural Language\nProcessing (NLP) to Computer Vision (CV). 
However, LMs are increasingly\nchallenged and criticized by academia and industry due to their powerful\nperformance but untrustworthy behavior, which urgently needs to be alleviated\nby reliable methods. Despite the abundance of literature on trustworthy LMs in\nNLP, a systematic survey specifically delving into the trustworthiness of LMs\nin CV remains absent. In order to mitigate this gap, we summarize four relevant\nconcerns that obstruct the trustworthy usage in vision of LMs in this survey,\nincluding 1) human misuse, 2) vulnerability, 3) inherent issue and 4)\ninterpretability. By highlighting corresponding challenge, countermeasures, and\ndiscussion in each topic, we hope this survey will facilitate readers'\nunderstanding of this field, promote alignment of LMs with human expectations\nand enable trustworthy LMs to serve as welfare rather than disaster for human\nsociety.\n","authors":["Ziyan Guo","Li Xu","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2311.09680v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00639v1","updated":"2023-12-01T14:59:43Z","published":"2023-12-01T14:59:43Z","title":"EvE: Exploiting Generative Priors for Radiance Field Enrichment","summary":" Modeling large-scale scenes from unconstrained image collections in-the-wild\nhas proven to be a major challenge in computer vision. Existing methods\ntackling in-the-wild neural rendering operate in a closed-world setting, where\nknowledge is limited to a scene's captured images within a training set. We\npropose EvE, which is, to the best of our knowledge, the first method\nleveraging generative priors to improve in-the-wild scene modeling. We employ\npre-trained generative networks to enrich K-Planes representations with\nextrinsic knowledge. To this end, we define an alternating training procedure\nto conduct optimization guidance of K-Planes trained on the training set. We\ncarry out extensive experiments and verify the merit of our method on synthetic\ndata as well as real tourism photo collections. EvE enhances rendered scenes\nwith richer details and outperforms the state of the art on the task of novel\nview synthesis in-the-wild. Our project page can be found at\nhttps://eve-nvs.github.io .\n","authors":["Karim Kassab","Antoine Schnepf","Jean-Yves Franceschi","Laurent Caraffa","Jeremie Mary","Valérie Gouet-Brunet"],"pdf_url":"https://arxiv.org/pdf/2312.00639v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00634v1","updated":"2023-12-01T14:54:44Z","published":"2023-12-01T14:54:44Z","title":"A Recent Survey of Vision Transformers for Medical Image Segmentation","summary":" Medical image segmentation plays a crucial role in various healthcare\napplications, enabling accurate diagnosis, treatment planning, and disease\nmonitoring. In recent years, Vision Transformers (ViTs) have emerged as a\npromising technique for addressing the challenges in medical image\nsegmentation. In medical images, structures are usually highly interconnected\nand globally distributed. ViTs utilize their multi-scale attention mechanism to\nmodel the long-range relationships in the images. However, they do lack\nimage-related inductive bias and translational invariance, potentially\nimpacting their performance. Recently, researchers have come up with various\nViT-based approaches that incorporate CNNs in their architectures, known as\nHybrid Vision Transformers (HVTs) to capture local correlation in addition to\nthe global information in the images. 
This survey paper provides a detailed\nreview of the recent advancements in ViTs and HVTs for medical image\nsegmentation. Along with the categorization of ViT and HVT-based medical image\nsegmentation approaches we also present a detailed overview of their real-time\napplications in several medical image modalities. This survey may serve as a\nvaluable resource for researchers, healthcare practitioners, and students in\nunderstanding the state-of-the-art approaches for ViT-based medical image\nsegmentation.\n","authors":["Asifullah Khan","Zunaira Rauf","Abdul Rehman Khan","Saima Rathore","Saddam Hussain Khan","Sahar Shah","Umair Farooq","Hifsa Asif","Aqsa Asif","Umme Zahoora","Rafi Ullah Khalil","Suleman Qamar","Umme Hani Asif","Faiza Babar Khan","Abdul Majid","Jeonghwan Gwak"],"pdf_url":"https://arxiv.org/pdf/2312.00634v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00633v1","updated":"2023-12-01T14:52:59Z","published":"2023-12-01T14:52:59Z","title":"Towards Efficient 3D Object Detection in Bird's-Eye-View Space for\n Autonomous Driving: A Convolutional-Only Approach","summary":" 3D object detection in Bird's-Eye-View (BEV) space has recently emerged as a\nprevalent approach in the field of autonomous driving. Despite the demonstrated\nimprovements in accuracy and velocity estimation compared to perspective view\nmethods, the deployment of BEV-based techniques in real-world autonomous\nvehicles remains challenging. This is primarily due to their reliance on\nvision-transformer (ViT) based architectures, which introduce quadratic\ncomplexity with respect to the input resolution. To address this issue, we\npropose an efficient BEV-based 3D detection framework called BEVENet, which\nleverages a convolutional-only architectural design to circumvent the\nlimitations of ViT models while maintaining the effectiveness of BEV-based\nmethods. Our experiments show that BEVENet is 3$\\times$ faster than\ncontemporary state-of-the-art (SOTA) approaches on the NuScenes challenge,\nachieving a mean average precision (mAP) of 0.456 and a nuScenes detection\nscore (NDS) of 0.555 on the NuScenes validation dataset, with an inference\nspeed of 47.6 frames per second. To the best of our knowledge, this study\nstands as the first to achieve such significant efficiency improvements for\nBEV-based methods, highlighting their enhanced feasibility for real-world\nautonomous driving applications.\n","authors":["Yuxin Li","Qiang Han","Mengying Yu","Yuxin Jiang","Chaikiat Yeo","Yiheng Li","Zihang Huang","Nini Liu","Hsuanhan Chen","Xiaojun Wu"],"pdf_url":"https://arxiv.org/pdf/2312.00633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00627v1","updated":"2023-12-01T14:43:28Z","published":"2023-12-01T14:43:28Z","title":"Rethinking the Domain Gap in Near-infrared Face Recognition","summary":" Heterogeneous face recognition (HFR) involves the intricate task of matching\nface images across the visual domains of visible (VIS) and near-infrared (NIR).\nWhile much of the existing literature on HFR identifies the domain gap as a\nprimary challenge and directs efforts towards bridging it at either the input\nor feature level, our work deviates from this trend. We observe that large\nneural networks, unlike their smaller counterparts, when pre-trained on large\nscale homogeneous VIS data, demonstrate exceptional zero-shot performance in\nHFR, suggesting that the domain gap might be less pronounced than previously\nbelieved. 
By approaching the HFR problem as one of low-data fine-tuning, we\nintroduce a straightforward framework: comprehensive pre-training, succeeded by\na regularized fine-tuning strategy, that matches or surpasses the current\nstate-of-the-art on four publicly available benchmarks. Corresponding codes can\nbe found at https://github.com/michaeltrs/RethinkNIRVIS.\n","authors":["Michail Tarasiou","Jiankang Deng","Stefanos Zafeiriou"],"pdf_url":"https://arxiv.org/pdf/2312.00627v1.pdf","comment":"5 pages, 3 figures, 6 tables"},{"id":"http://arxiv.org/abs/2306.00854v3","updated":"2023-12-01T14:30:06Z","published":"2023-06-01T16:10:10Z","title":"Spatio-Angular Convolutions for Super-resolution in Diffusion MRI","summary":" Diffusion MRI (dMRI) is a widely used imaging modality, but requires long\nscanning times to acquire high resolution datasets. By leveraging the unique\ngeometry present within this domain, we present a novel approach to dMRI\nangular super-resolution that extends upon the parametric continuous\nconvolution (PCConv) framework. We introduce several additions to the operation\nincluding a Fourier feature mapping, global coordinates, and domain specific\ncontext. Using this framework, we build a fully parametric continuous\nconvolution network (PCCNN) and compare against existing models. We demonstrate\nthe PCCNN performs competitively while using significantly less parameters.\nMoreover, we show that this formulation generalises well to clinically relevant\ndownstream analyses such as fixel-based analysis, and neurite orientation\ndispersion and density imaging.\n","authors":["Matthew Lyon","Paul Armitage","Mauricio A Álvarez"],"pdf_url":"https://arxiv.org/pdf/2306.00854v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14505v2","updated":"2023-12-01T14:29:01Z","published":"2023-06-26T08:24:37Z","title":"AME-CAM: Attentive Multiple-Exit CAM for Weakly Supervised Segmentation\n on MRI Brain Tumor","summary":" Magnetic resonance imaging (MRI) is commonly used for brain tumor\nsegmentation, which is critical for patient evaluation and treatment planning.\nTo reduce the labor and expertise required for labeling, weakly-supervised\nsemantic segmentation (WSSS) methods with class activation mapping (CAM) have\nbeen proposed. However, existing CAM methods suffer from low resolution due to\nstrided convolution and pooling layers, resulting in inaccurate predictions. In\nthis study, we propose a novel CAM method, Attentive Multiple-Exit CAM\n(AME-CAM), that extracts activation maps from multiple resolutions to\nhierarchically aggregate and improve prediction accuracy. We evaluate our\nmethod on the BraTS 2021 dataset and show that it outperforms state-of-the-art\nmethods.\n","authors":["Yu-Jen Chen","Xinrong Hu","Yiyu Shi","Tsung-Yi Ho"],"pdf_url":"https://arxiv.org/pdf/2306.14505v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2306.05476"},{"id":"http://arxiv.org/abs/2304.06767v4","updated":"2023-12-01T14:28:06Z","published":"2023-04-13T18:22:40Z","title":"RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment","summary":" Generative foundation models are susceptible to implicit biases that can\narise from extensive unsupervised training data. Such biases can produce\nsuboptimal samples, skewed outcomes, and unfairness, with potentially serious\nconsequences. Consequently, aligning these models with human ethics and\npreferences is an essential step toward ensuring their responsible and\neffective deployment in real-world applications. 
Prior research has primarily\nemployed Reinforcement Learning from Human Feedback (RLHF) to address this\nproblem, where generative models are fine-tuned with RL algorithms guided by a\nhuman-feedback-informed reward model. However, the inefficiencies and\ninstabilities associated with RL algorithms frequently present substantial\nobstacles to the successful alignment, necessitating the development of a more\nrobust and streamlined approach. To this end, we introduce a new framework,\nReward rAnked FineTuning (RAFT), designed to align generative models\neffectively. Utilizing a reward model and a sufficient number of samples, our\napproach selects the high-quality samples, discarding those that exhibit\nundesired behavior, and subsequently enhancing the model by fine-tuning on\nthese filtered samples. Our studies show that RAFT can effectively improve the\nmodel performance in both reward learning and other automated metrics in both\nlarge language models and diffusion models.\n","authors":["Hanze Dong","Wei Xiong","Deepanshu Goyal","Yihan Zhang","Winnie Chow","Rui Pan","Shizhe Diao","Jipeng Zhang","Kashun Shum","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2304.06767v4.pdf","comment":"29 pages, 12 figures, Published in Transactions on Machine Learning\n Research (TMLR)"},{"id":"http://arxiv.org/abs/2304.04336v3","updated":"2023-12-01T14:07:01Z","published":"2023-04-10T00:25:15Z","title":"Split, Merge, and Refine: Fitting Tight Bounding Boxes via\n Over-Segmentation and Iterative Search","summary":" Achieving tight bounding boxes of a shape while guaranteeing complete\nboundness is an essential task for efficient geometric operations and\nunsupervised semantic part detection. But previous methods fail to achieve both\nfull coverage and tightness. Neural-network-based methods are not suitable for\nthese goals due to the non-differentiability of the objective, while classic\niterative search methods suffer from their sensitivity to the initialization.\nWe propose a novel framework for finding a set of tight bounding boxes of a 3D\nshape via over-segmentation and iterative merging and refinement. Our result\nshows that utilizing effective search methods with appropriate objectives is\nthe key to producing bounding boxes with both properties. We employ an existing\npre-segmentation to split the shape and obtain over-segmentation. Then, we\napply hierarchical merging with our novel tightness-aware merging and stopping\ncriteria. To overcome the sensitivity to the initialization, we also define\nactions to refine the bounding box parameters in an Markov Decision Process\n(MDP) setup with a soft reward function promoting a wider exploration. Lastly,\nwe further improve the refinement step with Monte Carlo Tree Search (MCTS)\nbased multi-action space exploration. By thoughtful evaluation on diverse 3D\nshapes, we demonstrate full coverage, tightness, and an adequate number of\nbounding boxes of our method without requiring any training data or\nsupervision. 
It thus can be applied to various downstream tasks in computer\nvision and graphics.\n","authors":["Chanhyeok Park","Minhyuk Sung"],"pdf_url":"https://arxiv.org/pdf/2304.04336v3.pdf","comment":"3DV 2024"},{"id":"http://arxiv.org/abs/2312.00598v1","updated":"2023-12-01T14:03:30Z","published":"2023-12-01T14:03:30Z","title":"Learning from One Continuous Video Stream","summary":" We introduce a framework for online learning from a single continuous video\nstream -- the way people and animals learn, without mini-batches, data\naugmentation or shuffling. This poses great challenges given the high\ncorrelation between consecutive video frames and there is very little prior\nwork on it. Our framework allows us to do a first deep dive into the topic and\nincludes a collection of streams and tasks composed from two existing video\ndatasets, plus methodology for performance evaluation that considers both\nadaptation and generalization. We employ pixel-to-pixel modelling as a\npractical and flexible way to switch between pre-training and single-stream\nevaluation as well as between arbitrary tasks, without ever requiring changes\nto models and always using the same pixel loss. Equipped with this framework we\nobtained large single-stream learning gains from pre-training with a novel\nfamily of future prediction tasks, found that momentum hurts, and that the pace\nof weight updates matters. The combination of these insights leads to matching\nthe performance of IID learning with batch size 1, when using the same\narchitecture and without costly replay buffers.\n","authors":["João Carreira","Michael King","Viorica Pătrăucean","Dilara Gokay","Cătălin Ionescu","Yi Yang","Daniel Zoran","Joseph Heyward","Carl Doersch","Yusuf Aytar","Dima Damen","Andrew Zisserman"],"pdf_url":"https://arxiv.org/pdf/2312.00598v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00597v1","updated":"2023-12-01T14:02:16Z","published":"2023-12-01T14:02:16Z","title":"UAVs and Birds: Enhancing Short-Range Navigation through Budgerigar\n Flight Studies","summary":" This study delves into the flight behaviors of Budgerigars (Melopsittacus\nundulatus) to gain insights into their flight trajectories and movements. Using\n3D reconstruction from stereo video camera recordings, we closely examine the\nvelocity and acceleration patterns during three flight motion takeoff, flying\nand landing. The findings not only contribute to our understanding of bird\nbehaviors but also hold significant implications for the advancement of\nalgorithms in Unmanned Aerial Vehicles (UAVs). The research aims to bridge the\ngap between biological principles observed in birds and the application of\nthese insights in developing more efficient and autonomous UAVs. In the context\nof the increasing use of drones, this study focuses on the biologically\ninspired principles drawn from bird behaviors, particularly during takeoff,\nflying and landing flight, to enhance UAV capabilities. The dataset created for\nthis research sheds light on Budgerigars' takeoff, flying, and landing\ntechniques, emphasizing their ability to control speed across different\nsituations and surfaces. The study underscores the potential of incorporating\nthese principles into UAV algorithms, addressing challenges related to\nshort-range navigation, takeoff, flying, and landing.\n","authors":["Md. 
Mahmudur Rahman","Sajid Islam","Showren Chowdhury","Sadia Jahan Zeba","Debajyoti Karmaker"],"pdf_url":"https://arxiv.org/pdf/2312.00597v1.pdf","comment":"26 pages, 10 figures"},{"id":"http://arxiv.org/abs/2312.00596v1","updated":"2023-12-01T14:01:48Z","published":"2023-12-01T14:01:48Z","title":"BCN: Batch Channel Normalization for Image Classification","summary":" Normalization techniques have been widely used in the field of deep learning\ndue to their capability of enabling higher learning rates and are less careful\nin initialization. However, the effectiveness of popular normalization\ntechnologies is typically limited to specific areas. Unlike the standard Batch\nNormalization (BN) and Layer Normalization (LN), where BN computes the mean and\nvariance along the (N,H,W) dimensions and LN computes the mean and variance\nalong the (C,H,W) dimensions (N, C, H and W are the batch, channel, spatial\nheight and width dimension, respectively), this paper presents a novel\nnormalization technique called Batch Channel Normalization (BCN). To exploit\nboth the channel and batch dependence and adaptively and combine the advantages\nof BN and LN based on specific datasets or tasks, BCN separately normalizes\ninputs along the (N, H, W) and (C, H, W) axes, then combines the normalized\noutputs based on adaptive parameters. As a basic block, BCN can be easily\nintegrated into existing models for various applications in the field of\ncomputer vision. Empirical results show that the proposed technique can be\nseamlessly applied to various versions of CNN or Vision Transformer\narchitecture. The code is publicly available at\nhttps://github.com/AfifaKhaled/BatchChannel-Normalization\n","authors":["Afifa Khaled","Chao Li","Jia Ning","Kun He"],"pdf_url":"https://arxiv.org/pdf/2312.00596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00593v1","updated":"2023-12-01T13:57:29Z","published":"2023-12-01T13:57:29Z","title":"Event Recognition in Laparoscopic Gynecology Videos with Hybrid\n Transformers","summary":" Analyzing laparoscopic surgery videos presents a complex and multifaceted\nchallenge, with applications including surgical training, intra-operative\nsurgical complication prediction, and post-operative surgical assessment.\nIdentifying crucial events within these videos is a significant prerequisite in\na majority of these applications. In this paper, we introduce a comprehensive\ndataset tailored for relevant event recognition in laparoscopic gynecology\nvideos. Our dataset includes annotations for critical events associated with\nmajor intra-operative challenges and post-operative complications. To validate\nthe precision of our annotations, we assess event recognition performance using\nseveral CNN-RNN architectures. Furthermore, we introduce and evaluate a hybrid\ntransformer architecture coupled with a customized training-inference framework\nto recognize four specific events in laparoscopic surgery videos. Leveraging\nthe Transformer networks, our proposed architecture harnesses inter-frame\ndependencies to counteract the adverse effects of relevant content occlusion,\nmotion blur, and surgical scene variation, thus significantly enhancing event\nrecognition accuracy. Moreover, we present a frame sampling strategy designed\nto manage variations in surgical scenes and the surgeons' skill level,\nresulting in event recognition with high temporal resolution. 
We empirically\ndemonstrate the superiority of our proposed methodology in event recognition\ncompared to conventional CNN-RNN architectures through a series of extensive\nexperiments.\n","authors":["Sahar Nasirihaghighi","Negin Ghamsarian","Heinrich Husslein","Klaus Schoeffmann"],"pdf_url":"https://arxiv.org/pdf/2312.00593v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00592v1","updated":"2023-12-01T13:56:28Z","published":"2023-12-01T13:56:28Z","title":"Tracking Object Positions in Reinforcement Learning: A Metric for\n Keypoint Detection (extended version)","summary":" Reinforcement learning (RL) for robot control typically requires a detailed\nrepresentation of the environment state, including information about\ntask-relevant objects not directly measurable. Keypoint detectors, such as\nspatial autoencoders (SAEs), are a common approach to extracting a\nlow-dimensional representation from high-dimensional image data. SAEs aim at\nspatial features such as object positions, which are often useful\nrepresentations in robotic RL. However, whether an SAE is actually able to\ntrack objects in the scene and thus yields a spatial state representation well\nsuited for RL tasks has rarely been examined due to a lack of established\nmetrics. In this paper, we propose to assess the performance of an SAE instance\nby measuring how well keypoints track ground truth objects in images. We\npresent a computationally lightweight metric and use it to evaluate common\nbaseline SAE architectures on image data from a simulated robot task. We find\nthat common SAEs differ substantially in their spatial extraction capability.\nFurthermore, we validate that SAEs that perform well in our metric achieve\nsuperior performance when used in downstream RL. Thus, our metric is an\neffective and lightweight indicator of RL performance before executing\nexpensive RL training. Building on these insights, we identify three key\nmodifications of SAE architectures to improve tracking performance. We make our\ncode available at anonymous.4open.science/r/sae-rl.\n","authors":["Emma Cramer","Jonas Reiher","Sebastian Trimpe"],"pdf_url":"https://arxiv.org/pdf/2312.00592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00591v1","updated":"2023-12-01T13:56:01Z","published":"2023-12-01T13:56:01Z","title":"Less is More: Learning Reference Knowledge Using No-Reference Image\n Quality Assessment","summary":" Image Quality Assessment (IQA) with reference images have achieved great\nsuccess by imitating the human vision system, in which the image quality is\neffectively assessed by comparing the query image with its pristine reference\nimage. However, for the images in the wild, it is quite difficult to access\naccurate reference images. We argue that it is possible to learn reference\nknowledge under the No-Reference Image Quality Assessment (NR-IQA) setting,\nwhich is effective and efficient empirically. Concretely, by innovatively\nintroducing a novel feature distillation method in IQA, we propose a new\nframework to learn comparative knowledge from non-aligned reference images. And\nthen, to achieve fast convergence and avoid overfitting, we further propose an\ninductive bias regularization. Such a framework not only solves the congenital\ndefects of NR-IQA but also improves the feature extraction framework, enabling\nit to express more abundant quality information. Surprisingly, our method\nutilizes less input while obtaining a more significant improvement compared to\nthe teacher models. 
Extensive experiments on eight standard NR-IQA datasets\ndemonstrate the superior performance to the state-of-the-art NR-IQA methods,\ni.e., achieving the PLCC values of 0.917 (vs. 0.884 in LIVEC) and 0.686 (vs.\n0.661 in LIVEFB).\n","authors":["Xudong Li","Jingyuan Zheng","Xiawu Zheng","Runze Hu","Enwei Zhang","Yuting Gao","Yunhang Shen","Ke Li","Yutao Liu","Pingyang Dai","Yan Zhang","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2312.00591v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05920v2","updated":"2023-12-01T13:31:15Z","published":"2023-10-09T17:59:26Z","title":"SimPLR: A Simple and Plain Transformer for Object Detection and\n Segmentation","summary":" The ability to detect objects in images at varying scales has played a\npivotal role in the design of modern object detectors. Despite considerable\nprogress in removing hand-crafted components and simplifying the architecture\nwith transformers, multi-scale feature maps and/or pyramid design remain a key\nfactor for their empirical success. In this paper, we show that this reliance\non either feature pyramids or an hierarchical backbone is unnecessary and a\ntransformer-based detector with scale-aware attention enables the plain\ndetector `SimPLR' whose backbone and detection head are both non-hierarchical\nand operate on single-scale features. The plain architecture allows SimPLR to\neffectively take advantages of self-supervised learning and scaling approaches\nwith ViTs, yielding competitive performance compared to hierarchical and\nmulti-scale counterparts. We demonstrate through our experiments that when\nscaling to larger ViT backbones, SimPLR indicates better performance than\nend-to-end segmentation models (Mask2Former) and plain-backbone detectors\n(ViTDet), while consistently being faster. The code will be released.\n","authors":["Duy-Kien Nguyen","Martin R. Oswald","Cees G. M. Snoek"],"pdf_url":"https://arxiv.org/pdf/2310.05920v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00570v1","updated":"2023-12-01T13:25:39Z","published":"2023-12-01T13:25:39Z","title":"Generative models for visualising abstract social processes: Guiding\n streetview image synthesis of StyleGAN2 with indices of deprivation","summary":" This paper presents a novel application of Generative Adverserial Networks\n(GANs) to study visual aspects of social processes. I train a a StyleGAN2-model\non a custom dataset of 14,564 images of London, sourced from Google Streetview\ntaken in London. After training, I invert the images in the training set,\nfinding points in the model's latent space that correspond to them, and compare\nresults from three inversion techniques. I connect each data point with\nmetadata from the Indices of Multiple Deprivation, describing income, health\nand environmental quality in the area where the photographs were taken. It is\nthen possible to map which parts of the model's latent space encode visual\nfeatures that are distinctive for health, income and environmental quality, and\ncondition the synthesis of new images based on these factors. The synthetic\nimages created reflect visual features of social processes that were previously\nunknown and difficult to study, describing recurring visual differences between\ndeprived and privileged areas in London. GANs are known for their capability to\nproduce a continuous range of images that exhibit visual differences. 
The paper\ntests how to exploit this ability through visual comparisons in still images as\nwell as through an interactive website where users can guide image synthesis\nwith sliders. Though conditioned synthesis has its limitations and the results\nare difficult to validate, the paper points to the potential for generative\nmodels to be repurposed to be parts of social scientific methods.\n","authors":["Aleksi Knuutila"],"pdf_url":"https://arxiv.org/pdf/2312.00570v1.pdf","comment":"10 pages, 3 figures, 1 table, associated website with interactive\n interface at http://site.knuutila.net/thisinequalitydoesnotexist"},{"id":"http://arxiv.org/abs/2212.10428v5","updated":"2023-12-01T13:23:47Z","published":"2022-12-20T17:06:32Z","title":"HouseCat6D -- A Large-Scale Multi-Modal Category Level 6D Object\n Perception Dataset with Household Objects in Realistic Scenarios","summary":" Estimating 6D object poses is a major challenge in 3D computer vision.\nBuilding on successful instance-level approaches, research is shifting towards\ncategory-level pose estimation for practical applications. Current\ncategory-level datasets, however, fall short in annotation quality and pose\nvariety. Addressing this, we introduce HouseCat6D, a new category-level 6D pose\ndataset. It features 1) multi-modality with Polarimetric RGB and Depth\n(RGBD+P), 2) encompasses 194 diverse objects across 10 household categories,\nincluding two photometrically challenging ones, and 3) provides high-quality\npose annotations with an error range of only 1.35 mm to 1.74 mm. The dataset\nalso includes 4) 41 large-scale scenes with comprehensive viewpoint and\nocclusion coverage, 5) a checkerboard-free environment, and 6) dense 6D\nparallel-jaw robotic grasp annotations. Additionally, we present benchmark\nresults for leading category-level pose estimation networks.\n","authors":["HyunJun Jung","Guangyao Zhai","Shun-Cheng Wu","Patrick Ruhkamp","Hannah Schieber","Giulia Rizzoli","Pengyuan Wang","Hongcheng Zhao","Lorenzo Garattoni","Sven Meier","Daniel Roth","Nassir Navab","Benjamin Busam"],"pdf_url":"https://arxiv.org/pdf/2212.10428v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2010.04055v2","updated":"2023-12-01T12:49:42Z","published":"2020-10-08T15:19:22Z","title":"A Unified Approach to Interpreting and Boosting Adversarial\n Transferability","summary":" In this paper, we use the interaction inside adversarial perturbations to\nexplain and boost the adversarial transferability. We discover and prove the\nnegative correlation between the adversarial transferability and the\ninteraction inside adversarial perturbations. The negative correlation is\nfurther verified through different DNNs with various inputs. Moreover, this\nnegative correlation can be regarded as a unified perspective to understand\ncurrent transferability-boosting methods. To this end, we prove that some\nclassic methods of enhancing the transferability essentially decease\ninteractions inside adversarial perturbations. 
Based on this, we propose to\ndirectly penalize interactions during the attacking process, which\nsignificantly improves the adversarial transferability.\n","authors":["Xin Wang","Jie Ren","Shuyun Lin","Xiangming Zhu","Yisen Wang","Quanshi Zhang"],"pdf_url":"https://arxiv.org/pdf/2010.04055v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00548v1","updated":"2023-12-01T12:48:41Z","published":"2023-12-01T12:48:41Z","title":"Domain Adaptive Imitation Learning with Visual Observation","summary":" In this paper, we consider domain-adaptive imitation learning with visual\nobservation, where an agent in a target domain learns to perform a task by\nobserving expert demonstrations in a source domain. Domain adaptive imitation\nlearning arises in practical scenarios where a robot, receiving visual sensory\ndata, needs to mimic movements by visually observing other robots from\ndifferent angles or observing robots of different shapes. To overcome the\ndomain shift in cross-domain imitation learning with visual observation, we\npropose a novel framework for extracting domain-independent behavioral features\nfrom input observations that can be used to train the learner, based on dual\nfeature extraction and image reconstruction. Empirical results demonstrate that\nour approach outperforms previous algorithms for imitation learning from visual\nobservation with domain shift.\n","authors":["Sungho Choi","Seungyul Han","Woojun Kim","Jongseong Chae","Whiyoung Jung","Youngchul Sung"],"pdf_url":"https://arxiv.org/pdf/2312.00548v1.pdf","comment":"Accepted to NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.17618v3","updated":"2023-12-01T12:46:13Z","published":"2023-11-29T13:26:29Z","title":"ShapeGPT: 3D Shape Generation with A Unified Multi-modal Language Model","summary":" The advent of large language models, enabling flexibility through\ninstruction-driven approaches, has revolutionized many traditional generative\ntasks, but large models for 3D data, particularly in comprehensively handling\n3D shapes with other modalities, are still under-explored. By achieving\ninstruction-based shape generations, versatile multimodal generative shape\nmodels can significantly benefit various fields like 3D virtual construction\nand network-aided design. In this work, we present ShapeGPT, a shape-included\nmulti-modal framework to leverage strong pre-trained language models to address\nmultiple shape-relevant tasks. Specifically, ShapeGPT employs a\nword-sentence-paragraph framework to discretize continuous shapes into shape\nwords, further assembles these words for shape sentences, as well as integrates\nshape with instructional text for multi-modal paragraphs. To learn this\nshape-language model, we use a three-stage training scheme, including shape\nrepresentation, multimodal alignment, and instruction-based generation, to\nalign shape-language codebooks and learn the intricate correlations among these\nmodalities. 
Extensive experiments demonstrate that ShapeGPT achieves comparable\nperformance across shape-relevant tasks, including text-to-shape,\nshape-to-text, shape completion, and shape editing.\n","authors":["Fukun Yin","Xin Chen","Chi Zhang","Biao Jiang","Zibo Zhao","Jiayuan Fan","Gang Yu","Taihao Li","Tao Chen"],"pdf_url":"https://arxiv.org/pdf/2311.17618v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2006.15920v2","updated":"2023-12-01T12:45:49Z","published":"2020-06-29T10:24:27Z","title":"Interpreting and Disentangling Feature Components of Various Complexity\n from DNNs","summary":" This paper aims to define, quantify, and analyze the feature complexity that\nis learned by a DNN. We propose a generic definition for the feature\ncomplexity. Given the feature of a certain layer in the DNN, our method\ndisentangles feature components of different complexity orders from the\nfeature. We further design a set of metrics to evaluate the reliability, the\neffectiveness, and the significance of over-fitting of these feature\ncomponents. Furthermore, we successfully discover a close relationship between\nthe feature complexity and the performance of DNNs. As a generic mathematical\ntool, the feature complexity and the proposed metrics can also be used to\nanalyze the success of network compression and knowledge distillation.\n","authors":["Jie Ren","Mingjie Li","Zexu Liu","Quanshi Zhang"],"pdf_url":"https://arxiv.org/pdf/2006.15920v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.09020v2","updated":"2023-12-01T12:39:01Z","published":"2022-10-17T12:42:29Z","title":"Defects of Convolutional Decoder Networks in Frequency Representation","summary":" In this paper, we prove the representation defects of a cascaded\nconvolutional decoder network, considering the capacity of representing\ndifferent frequency components of an input sample. We conduct the discrete\nFourier transform on each channel of the feature map in an intermediate layer\nof the decoder network. Then, we extend the 2D circular convolution theorem to\nrepresent the forward and backward propagations through convolutional layers in\nthe frequency domain. Based on this, we prove three defects in representing\nfeature spectrums. First, we prove that the convolution operation, the\nzero-padding operation, and a set of other settings all make a convolutional\ndecoder network more likely to weaken high-frequency components. Second, we\nprove that the upsampling operation generates a feature spectrum, in which\nstrong signals repetitively appear at certain frequencies. Third, we prove that\nif the frequency components in the input sample and frequency components in the\ntarget output for regression have a small shift, then the decoder usually\ncannot be effectively learned.\n","authors":["Ling Tang","Wen Shen","Zhanpeng Zhou","Yuefeng Chen","Quanshi Zhang"],"pdf_url":"https://arxiv.org/pdf/2210.09020v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.13095v2","updated":"2023-12-01T12:33:20Z","published":"2023-02-25T14:56:35Z","title":"Bayesian Neural Networks Avoid Encoding Complex and\n Perturbation-Sensitive Concepts","summary":" In this paper, we focus on mean-field variational Bayesian Neural Networks\n(BNNs) and explore the representation capacity of such BNNs by investigating\nwhich types of concepts are less likely to be encoded by the BNN. 
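The frequency-representation entry above analyzes decoder networks by taking the discrete Fourier transform of each channel of an intermediate feature map. A small sketch of that kind of per-channel spectral inspection is given below; the toy decoder and the way the spectrum is summarized are assumptions, not the paper's actual analysis pipeline.

import torch
import torch.nn as nn

# Toy convolutional decoder; the architecture is an assumption for illustration.
decoder = nn.Sequential(
    nn.Conv2d(3, 16, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Upsample(scale_factor=2, mode="nearest"),
    nn.Conv2d(16, 3, kernel_size=3, padding=1),
)

x = torch.randn(1, 3, 32, 32)
feat = decoder[:3](x)            # intermediate feature map after upsampling

# Discrete Fourier transform of each channel, as described in the abstract.
spectrum = torch.fft.fft2(feat)  # complex tensor of shape (1, C, H, W)
amplitude = spectrum.abs()

# Compare amplitude in the lowest-frequency corner against the whole spectrum.
low = amplitude[..., :4, :4].mean()
total = amplitude.mean()
print(f"low-frequency share of mean amplitude: {(low / total).item():.2f}")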
It has been\nobserved and studied that a relatively small set of interactive concepts\nusually emerge in the knowledge representation of a sufficiently-trained neural\nnetwork, and such concepts can faithfully explain the network output. Based on\nthis, our study proves that compared to standard deep neural networks (DNNs),\nit is less likely for BNNs to encode complex concepts. Experiments verify our\ntheoretical proofs. Note that the tendency to encode less complex concepts does\nnot necessarily imply weak representation power, considering that complex\nconcepts exhibit low generalization power and high adversarial vulnerability.\nThe code is available at https://github.com/sjtu-xai-lab/BNN-concepts.\n","authors":["Qihan Ren","Huiqi Deng","Yunuo Chen","Siyu Lou","Quanshi Zhang"],"pdf_url":"https://arxiv.org/pdf/2302.13095v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01811v2","updated":"2023-12-01T12:27:10Z","published":"2023-04-04T14:08:42Z","title":"HarsanyiNet: Computing Accurate Shapley Values in a Single Forward\n Propagation","summary":" The Shapley value is widely regarded as a trustworthy attribution metric.\nHowever, when people use Shapley values to explain the attribution of input\nvariables of a deep neural network (DNN), it usually requires a very high\ncomputational cost to approximate relatively accurate Shapley values in\nreal-world applications. Therefore, we propose a novel network architecture,\nthe HarsanyiNet, which makes inferences on the input sample and simultaneously\ncomputes the exact Shapley values of the input variables in a single forward\npropagation. The HarsanyiNet is designed on the theoretical foundation that the\nShapley value can be reformulated as the redistribution of Harsanyi\ninteractions encoded by the network.\n","authors":["Lu Chen","Siyu Lou","Keyan Zhang","Jin Huang","Quanshi Zhang"],"pdf_url":"https://arxiv.org/pdf/2304.01811v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00534v1","updated":"2023-12-01T12:15:09Z","published":"2023-12-01T12:15:09Z","title":"LiDAR-based curb detection for ground truth annotation in automated\n driving validation","summary":" Curb detection is essential for environmental awareness in Automated Driving\n(AD), as it typically limits drivable and non-drivable areas. Annotated data\nare necessary for developing and validating an AD function. However, the number\nof public datasets with annotated point cloud curbs is scarce. This paper\npresents a method for detecting 3D curbs in a sequence of point clouds captured\nfrom a LiDAR sensor, which consists of two main steps. First, our approach\ndetects the curbs at each scan using a segmentation deep neural network. Then,\na sequence-level processing step estimates the 3D curbs in the reconstructed\npoint cloud using the odometry of the vehicle. From these 3D points of the\ncurb, we obtain polylines structured following ASAM OpenLABEL standard. These\ndetections can be used as pre-annotations in labelling pipelines to efficiently\ngenerate curb-related ground truth data. We validate our approach through an\nexperiment in which different human annotators were required to annotate curbs\nin a group of LiDAR-based sequences with and without our automatically\ngenerated pre-annotations. 
The results show that the manual annotation time is\nreduced by 50.99% thanks to our detections, keeping the data quality level.\n","authors":["Jose Luis Apellániz","Mikel García","Nerea Aranjuelo","Javier Barandiarán","Marcos Nieto"],"pdf_url":"https://arxiv.org/pdf/2312.00534v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00532v1","updated":"2023-12-01T12:12:58Z","published":"2023-12-01T12:12:58Z","title":"DeepDR: Deep Structure-Aware RGB-D Inpainting for Diminished Reality","summary":" Diminished reality (DR) refers to the removal of real objects from the\nenvironment by virtually replacing them with their background. Modern DR\nframeworks use inpainting to hallucinate unobserved regions. While recent deep\nlearning-based inpainting is promising, the DR use case is complicated by the\nneed to generate coherent structure and 3D geometry (i.e., depth), in\nparticular for advanced applications, such as 3D scene editing. In this paper,\nwe propose DeepDR, a first RGB-D inpainting framework fulfilling all\nrequirements of DR: Plausible image and geometry inpainting with coherent\nstructure, running at real-time frame rates, with minimal temporal artifacts.\nOur structure-aware generative network allows us to explicitly condition color\nand depth outputs on the scene semantics, overcoming the difficulty of\nreconstructing sharp and consistent boundaries in regions with complex\nbackgrounds. Experimental results show that the proposed framework can\noutperform related work qualitatively and quantitatively.\n","authors":["Christina Gsaxner","Shohei Mori","Dieter Schmalstieg","Jan Egger","Gerhard Paar","Werner Bailer","Denis Kalkofen"],"pdf_url":"https://arxiv.org/pdf/2312.00532v1.pdf","comment":"11 pages, 8 figures + 13 pages, 10 figures supplementary. Accepted at\n 3DV 2024"},{"id":"http://arxiv.org/abs/2312.00529v1","updated":"2023-12-01T12:09:06Z","published":"2023-12-01T12:09:06Z","title":"Algorithm-based diagnostic application for diabetic retinopathy\n detection","summary":" Diabetic retinopathy (DR) is a growing health problem worldwide and is a\nleading cause of visual impairment and blindness, especially among working\npeople aged 20-65. Its incidence is increasing along with the number of\ndiabetes cases, and it is more common in developed countries than in developing\ncountries. Recent research in the field of diabetic retinopathy diagnosis is\nusing advanced technologies, such as analysis of images obtained by\nophthalmoscopy. Automatic methods for analyzing eye images based on neural\nnetworks, deep learning and image analysis algorithms can improve the\nefficiency of diagnosis. This paper describes an automatic DR diagnosis method\nthat includes processing and analysis of ophthalmoscopic images of the eye. It\nuses morphological algorithms to identify the optic disc and lesions\ncharacteristic of DR, such as microaneurysms, hemorrhages and exudates.\nAutomated DR diagnosis has the potential to improve the efficiency of early\ndetection of this disease and contribute to reducing the number of cases of\ndiabetes-related visual impairment. The final step was to create an application\nwith a graphical user interface that allowed retinal images taken at\ncooperating ophthalmology offices to be uploaded to the server. 
These images\nwere then analyzed using a developed algorithm to make a diagnosis.\n","authors":["Agnieszka Cisek","Karolina Korycinska","Leszek Pyziak","Marzena Malicka","Tomasz Wiecek","Grzegorz Gruzel","Kamil Szmuc","Jozef Cebulski","Mariusz Spyra"],"pdf_url":"https://arxiv.org/pdf/2312.00529v1.pdf","comment":"18 pages, 9 figures, preprint"},{"id":"http://arxiv.org/abs/2211.06108v4","updated":"2023-12-01T11:52:21Z","published":"2022-11-11T10:24:42Z","title":"RaLiBEV: Radar and LiDAR BEV Fusion Learning for Anchor Box Free Object\n Detection Systems","summary":" In autonomous driving, LiDAR and radar play important roles in the perception\nof the surrounding environment. LiDAR provides accurate 3D spatial sensing\ninformation but cannot work in adverse weather like fog. On the other hand, the\nradar signal can be diffracted when encountering raindrops or mist particles\nthanks to its wavelength, but it suffers from large noise. Recent\nstate-of-the-art works reveal that fusion of radar and LiDAR can lead to robust\ndetection in adverse weather. The existing works adopt convolutional neural\nnetwork architecture to extract features from each sensor data, then align and\naggregate the two branch features to predict object detection results. However,\nthese methods have low accuracy of bounding box estimations due to a simple\ndesign of label assignment and fusion strategies. In this paper, we propose a\nbird's-eye view fusion learning-based anchor box-free object detection system,\nwhich fuses the feature derived from the radar range-azimuth heatmap and the\nLiDAR point cloud to estimate possible objects. Different label assignment\nstrategies have been designed to facilitate the consistency between the\nclassification of foreground or background anchor points and the corresponding\nbounding box regressions. Furthermore, the performance of the proposed object\ndetector is further enhanced by employing a novel interactive transformer\nmodule. The superior performance of the methods proposed in this paper has been\ndemonstrated using the recently published Oxford Radar RobotCar dataset. Our\nsystem's average precision significantly outperforms the state-of-the-art\nmethod by 13.1% and 19.0% at IoU of 0.8 under 'Clear+Foggy' training conditions\nfor 'Clear' and 'Foggy' testing, respectively.\n","authors":["Yanlong Yang","Jianan Liu","Tao Huang","Qing-Long Han","Gang Ma","Bing Zhu"],"pdf_url":"https://arxiv.org/pdf/2211.06108v4.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2311.00213v3","updated":"2023-12-01T11:41:34Z","published":"2023-11-01T01:20:12Z","title":"Consistent Video-to-Video Transfer Using Synthetic Dataset","summary":" We introduce a novel and efficient approach for text-based video-to-video\nediting that eliminates the need for resource-intensive per-video-per-model\nfinetuning. At the core of our approach is a synthetic paired video dataset\ntailored for video-to-video transfer tasks. Inspired by Instruct Pix2Pix's\nimage transfer via editing instruction, we adapt this paradigm to the video\ndomain. Extending the Prompt-to-Prompt to videos, we efficiently generate\npaired samples, each with an input video and its edited counterpart. Alongside\nthis, we introduce the Long Video Sampling Correction during sampling, ensuring\nconsistent long videos across batches. 
Our method surpasses current methods\nlike Tune-A-Video, heralding substantial progress in text-based video-to-video\nediting and suggesting exciting avenues for further exploration and deployment.\n","authors":["Jiaxin Cheng","Tianjun Xiao","Tong He"],"pdf_url":"https://arxiv.org/pdf/2311.00213v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00500v1","updated":"2023-12-01T11:03:07Z","published":"2023-12-01T11:03:07Z","title":"Global Localization: Utilizing Relative Spatio-Temporal Geometric\n Constraints from Adjacent and Distant Cameras","summary":" Re-localizing a camera from a single image in a previously mapped area is\nvital for many computer vision applications in robotics and augmented/virtual\nreality. In this work, we address the problem of estimating the 6 DoF camera\npose relative to a global frame from a single image. We propose to leverage a\nnovel network of relative spatial and temporal geometric constraints to guide\nthe training of a Deep Network for localization. We employ simultaneously\nspatial and temporal relative pose constraints that are obtained not only from\nadjacent camera frames but also from camera frames that are distant in the\nspatio-temporal space of the scene. We show that our method, through these\nconstraints, is capable of learning to localize when little or very sparse\nground-truth 3D coordinates are available. In our experiments, this is less\nthan 1% of available ground-truth data. We evaluate our method on 3 common\nvisual localization datasets and show that it outperforms other direct pose\nestimation methods.\n","authors":["Mohammad Altillawi","Zador Pataki","Shile Li","Ziyuan Liu"],"pdf_url":"https://arxiv.org/pdf/2312.00500v1.pdf","comment":"To be published in the proceedings of IEEE/RSJ International\n Conference on Intelligent Robots and Systems (IROS) 2023"},{"id":"http://arxiv.org/abs/2311.17475v2","updated":"2023-12-01T10:43:18Z","published":"2023-11-29T09:31:31Z","title":"CLiSA: A Hierarchical Hybrid Transformer Model using Orthogonal Cross\n Attention for Satellite Image Cloud Segmentation","summary":" Clouds in optical satellite images are a major concern since their presence\nhinders the ability to carry accurate analysis as well as processing. Presence\nof clouds also affects the image tasking schedule and results in wastage of\nvaluable storage space on ground as well as space-based systems. Due to these\nreasons, deriving accurate cloud masks from optical remote-sensing images is an\nimportant task. Traditional methods such as threshold-based, spatial filtering\nfor cloud detection in satellite images suffer from lack of accuracy. In recent\nyears, deep learning algorithms have emerged as a promising approach to solve\nimage segmentation problems as it allows pixel-level classification and\nsemantic-level segmentation. In this paper, we introduce a deep-learning model\nbased on hybrid transformer architecture for effective cloud mask generation\nnamed CLiSA - Cloud segmentation via Lipschitz Stable Attention network. In\nthis context, we propose an concept of orthogonal self-attention combined with\nhierarchical cross attention model, and we validate its Lipschitz stability\ntheoretically and empirically. We design the whole setup under adversarial\nsetting in presence of Lov\\'asz-Softmax loss. We demonstrate both qualitative\nand quantitative outcomes for multiple satellite image datasets including\nLandsat-8, Sentinel-2, and Cartosat-2s. 
Performing comparative study we show\nthat our model performs preferably against other state-of-the-art methods and\nalso provides better generalization in precise cloud extraction from satellite\nmulti-spectral (MX) images. We also showcase different ablation studies to\nendorse our choices corresponding to different architectural elements and\nobjective functions.\n","authors":["Subhajit Paul","Ashutosh Gupta"],"pdf_url":"https://arxiv.org/pdf/2311.17475v2.pdf","comment":"14 pages, 11 figures, 7 tables"},{"id":"http://arxiv.org/abs/2312.00487v1","updated":"2023-12-01T10:37:02Z","published":"2023-12-01T10:37:02Z","title":"Explainable AI in Diagnosing and Anticipating Leukemia Using Transfer\n Learning Method","summary":" This research paper focuses on Acute Lymphoblastic Leukemia (ALL), a form of\nblood cancer prevalent in children and teenagers, characterized by the rapid\nproliferation of immature white blood cells (WBCs). These atypical cells can\noverwhelm healthy cells, leading to severe health consequences. Early and\naccurate detection of ALL is vital for effective treatment and improving\nsurvival rates. Traditional diagnostic methods are time-consuming, costly, and\nprone to errors. The paper proposes an automated detection approach using\ncomputer-aided diagnostic (CAD) models, leveraging deep learning techniques to\nenhance the accuracy and efficiency of leukemia diagnosis. The study utilizes\nvarious transfer learning models like ResNet101V2, VGG19, InceptionV3, and\nInceptionResNetV2 for classifying ALL. The methodology includes using the Local\nInterpretable Model-Agnostic Explanations (LIME) for ensuring the validity and\nreliability of the AI system's predictions. This approach is critical for\novercoming the \"black box\" nature of AI, where decisions made by models are\noften opaque and unaccountable. The paper highlights that the proposed method\nusing the InceptionV3 model achieved an impressive 98.38% accuracy,\noutperforming other tested models. The results, verified by the LIME algorithm,\nshowcase the potential of this method in accurately identifying ALL, providing\na valuable tool for medical practitioners. The research underscores the impact\nof explainable artificial intelligence (XAI) in medical diagnostics, paving the\nway for more transparent and trustworthy AI applications in healthcare.\n","authors":["Wahidul Hasan Abir","Md. Fahim Uddin","Faria Rahman Khanam","Mohammad Monirujjaman Khan"],"pdf_url":"https://arxiv.org/pdf/2312.00487v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14521v2","updated":"2023-12-01T10:09:21Z","published":"2023-11-24T14:46:59Z","title":"GaussianEditor: Swift and Controllable 3D Editing with Gaussian\n Splatting","summary":" 3D editing plays a crucial role in many areas such as gaming and virtual\nreality. Traditional 3D editing methods, which rely on representations like\nmeshes and point clouds, often fall short in realistically depicting complex\nscenes. On the other hand, methods based on implicit 3D representations, like\nNeural Radiance Field (NeRF), render complex scenes effectively but suffer from\nslow processing speeds and limited control over specific scene areas. In\nresponse to these challenges, our paper presents GaussianEditor, an innovative\nand efficient 3D editing algorithm based on Gaussian Splatting (GS), a novel 3D\nrepresentation. GaussianEditor enhances precision and control in editing\nthrough our proposed Gaussian semantic tracing, which traces the editing target\nthroughout the training process. 
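The leukemia-detection entry above pairs transfer-learning classifiers with LIME explanations. As a rough, hedged sketch of how LIME's image explainer is typically applied to such a classifier (here an ImageNet-pretrained InceptionV3 stands in for the fine-tuned model, and the input is a random placeholder rather than a blood-smear image):

import numpy as np
from lime import lime_image
from tensorflow.keras.applications import InceptionV3

# Stand-in classifier: in the paper's setting this would be the network
# fine-tuned on microscopy images of blood smears.
model = InceptionV3(weights="imagenet")

def predict_fn(images: np.ndarray) -> np.ndarray:
    # LIME passes a batch of perturbed copies of the input image in [0, 1];
    # InceptionV3 expects inputs scaled to [-1, 1].
    return model.predict(images * 2.0 - 1.0, verbose=0)

image = np.random.rand(299, 299, 3)  # placeholder for a real image

explainer = lime_image.LimeImageExplainer()
explanation = explainer.explain_instance(
    image, predict_fn, top_labels=1, hide_color=0, num_samples=200)

# Superpixels that most support the top predicted class.
_, mask = explanation.get_image_and_mask(
    explanation.top_labels[0], positive_only=True, num_features=5, hide_rest=False)
print("highlighted superpixels:", int(mask.sum()))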
Additionally, we propose Hierarchical Gaussian\nsplatting (HGS) to achieve stabilized and fine results under stochastic\ngenerative guidance from 2D diffusion models. We also develop editing\nstrategies for efficient object removal and integration, a challenging task for\nexisting methods. Our comprehensive experiments demonstrate GaussianEditor's\nsuperior control, efficacy, and rapid performance, marking a significant\nadvancement in 3D editing. Project Page:\nhttps://buaacyw.github.io/gaussian-editor/\n","authors":["Yiwen Chen","Zilong Chen","Chi Zhang","Feng Wang","Xiaofeng Yang","Yikai Wang","Zhongang Cai","Lei Yang","Huaping Liu","Guosheng Lin"],"pdf_url":"https://arxiv.org/pdf/2311.14521v2.pdf","comment":"Project Page: https://buaacyw.github.io/gaussian-editor/"},{"id":"http://arxiv.org/abs/2310.15578v3","updated":"2023-12-01T10:06:21Z","published":"2023-10-24T07:42:04Z","title":"VMAF Re-implementation on PyTorch: Some Experimental Results","summary":" Based on the standard VMAF implementation we propose an implementation of\nVMAF using PyTorch framework. For this implementation comparisons with the\nstandard (libvmaf) show the discrepancy $\\lesssim 10^{-2}$ in VMAF units. We\ninvestigate gradients computation when using VMAF as an objective function and\ndemonstrate that training using this function does not result in ill-behaving\ngradients. The implementation is then used to train a preprocessing filter. It\nis demonstrated that its performance is superior to the unsharp masking filter.\nThe resulting filter is also easy for implementation and can be applied in\nvideo processing tasks for video copression improvement. This is confirmed by\nthe results of numerical experiments.\n","authors":["Kirill Aistov","Maxim Koroteev"],"pdf_url":"https://arxiv.org/pdf/2310.15578v3.pdf","comment":"6 pages, 3 figures"},{"id":"http://arxiv.org/abs/2312.00467v1","updated":"2023-12-01T10:02:38Z","published":"2023-12-01T10:02:38Z","title":"Unfolder: Fast localization and image rectification of a document with a\n crease from folding in half","summary":" Presentation of folded documents is not an uncommon case in modern society.\nDigitizing such documents by capturing them with a smartphone camera can be\ntricky since a crease can divide the document contents into separate planes. To\nunfold the document, one could hold the edges potentially obscuring it in a\ncaptured image. While there are many geometrical rectification methods, they\nwere usually developed for arbitrary bends and folds. We consider such\nalgorithms and propose a novel approach Unfolder developed specifically for\nimages of documents with a crease from folding in half. Unfolder is robust to\nprojective distortions of the document image and does not fragment the image in\nthe vicinity of a crease after rectification. A new Folded Document Images\ndataset was created to investigate the rectification accuracy of folded (2, 3,\n4, and 8 folds) documents. The dataset includes 1600 images captured when\ndocument placed on a table and when held in hand. The Unfolder algorithm\nallowed for a recognition error rate of 0.33, which is better than the advanced\nneural network methods DocTr (0.44) and DewarpNet (0.57). The average runtime\nfor Unfolder was only 0.25 s/image on an iPhone XR.\n","authors":["A. M. Ershov","D. V. Tropin","E. E. Limonova","D. P. Nikolaev","V. V. 
Arlazarov"],"pdf_url":"https://arxiv.org/pdf/2312.00467v1.pdf","comment":"This is a preprint of the article accepted for publication in the\n journal \"Computer Optics\""},{"id":"http://arxiv.org/abs/2312.00462v1","updated":"2023-12-01T09:56:29Z","published":"2023-12-01T09:56:29Z","title":"Learning Unorthogonalized Matrices for Rotation Estimation","summary":" Estimating 3D rotations is a common procedure for 3D computer vision. The\naccuracy depends heavily on the rotation representation. One form of\nrepresentation -- rotation matrices -- is popular due to its continuity,\nespecially for pose estimation tasks. The learning process usually incorporates\northogonalization to ensure orthonormal matrices. Our work reveals, through\ngradient analysis, that common orthogonalization procedures based on the\nGram-Schmidt process and singular value decomposition will slow down training\nefficiency. To this end, we advocate removing orthogonalization from the\nlearning process and learning unorthogonalized `Pseudo' Rotation Matrices\n(PRoM). An optimization analysis shows that PRoM converges faster and to a\nbetter solution. By replacing the orthogonalization incorporated representation\nwith our proposed PRoM in various rotation-related tasks, we achieve\nstate-of-the-art results on large-scale benchmarks for human pose estimation.\n","authors":["Kerui Gu","Zhihao Li","Shiyong Liu","Jianzhuang Liu","Songcen Xu","Youliang Yan","Michael Bi Mi","Kenji Kawaguchi","Angela Yao"],"pdf_url":"https://arxiv.org/pdf/2312.00462v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00454v1","updated":"2023-12-01T09:34:28Z","published":"2023-12-01T09:34:28Z","title":"An Encoding Framework for Binarized Images using HyperDimensional\n Computing","summary":" Hyperdimensional Computing (HDC) is a brain-inspired and light-weight machine\nlearning method. It has received significant attention in the literature as a\ncandidate to be applied in the wearable internet of things, near-sensor\nartificial intelligence applications and on-device processing. HDC is\ncomputationally less complex than traditional deep learning algorithms and\ntypically achieves moderate to good classification performance. A key aspect\nthat determines the performance of HDC is the encoding of the input data to the\nhyperdimensional (HD) space. This article proposes a novel light-weight\napproach relying only on native HD arithmetic vector operations to encode\nbinarized images that preserves similarity of patterns at nearby locations by\nusing point of interest selection and local linear mapping. The method reaches\nan accuracy of 97.35% on the test set for the MNIST data set and 84.12% for the\nFashion-MNIST data set. These results outperform other studies using baseline\nHDC with different encoding approaches and are on par with more complex hybrid\nHDC models. 
The proposed encoding approach also demonstrates a higher\nrobustness to noise and blur compared to the baseline encoding.\n","authors":["Laura Smets","Werner Van Leekwijck","Ing Jyh Tsang","Steven Latré"],"pdf_url":"https://arxiv.org/pdf/2312.00454v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.06777v2","updated":"2023-12-01T09:33:21Z","published":"2023-05-11T12:59:21Z","title":"Generating high-quality 3DMPCs by adaptive data acquisition and\n NeREF-based radiometric calibration with UGV plant phenotyping system","summary":" Fusion of 3D and MS imaging data has a great potential for high-throughput\nplant phenotyping of structural and biochemical as well as physiological traits\nsimultaneously, which is important for decision support in agriculture and for\ncrop breeders in selecting the best genotypes. However, lacking of 3D data\nintegrity of various plant canopy structures and low-quality of MS images\ncaused by the complex illumination effects make a great challenge, especially\nat the proximal imaging scale. Therefore, this study proposed a novel approach\nfor adaptive data acquisition and radiometric calibration to generate\nhigh-quality 3DMPCs of plants. An efficient NBV planning method based on an UGV\nplant phenotyping system with a multi-sensor-equipped robotic arm was proposed\nto achieve adaptive data acquisition. The NeREF was employed to predict the DN\nvalues of the hemispherical reference for radiometric calibration. For NBV\nplanning, the average total time for single plant at a joint speed of 1.55\nrad/s was about 62.8 s, with an average reduction of 18.0% compared to the\nunplanned. The integrity of the whole-plant data was improved by an average of\n23.6% compared to the fixed viewpoints alone. Compared with the ASD\nmeasurements, the RMSE of the reflectance spectra obtained from 3DMPCs at\ndifferent regions of interest was 0.08 with an average decrease of 58.93%\ncompared to the results obtained from the single-frame of MS images without 3D\nradiometric calibration. The 3D-calibrated plant 3DMPCs improved the predictive\naccuracy of PLSR for chlorophyll content, with an average increase of 0.07 in\nR2 and an average decrease of 21.25% in RMSE. Our approach introduced a fresh\nperspective on generating high-quality 3DMPCs of plants under the natural light\ncondition, enabling more precise analysis of plant morphological and\nphysiological parameters.\n","authors":["Pengyao Xie","Zhihong Ma","Ruiming Du","Xin Yang","Haiyan Cen"],"pdf_url":"https://arxiv.org/pdf/2305.06777v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00452v1","updated":"2023-12-01T09:31:24Z","published":"2023-12-01T09:31:24Z","title":"Towards Generalizable Referring Image Segmentation via Target Prompt and\n Visual Coherence","summary":" Referring image segmentation (RIS) aims to segment objects in an image\nconditioning on free-from text descriptions. Despite the overwhelming progress,\nit still remains challenging for current approaches to perform well on cases\nwith various text expressions or with unseen visual entities, limiting its\nfurther application. In this paper, we present a novel RIS approach, which\nsubstantially improves the generalization ability by addressing the two\ndilemmas mentioned above. Specially, to deal with unconstrained texts, we\npropose to boost a given expression with an explicit and crucial prompt, which\ncomplements the expression in a unified context, facilitating target capturing\nin the presence of linguistic style changes. 
Furthermore, we introduce a\nmulti-modal fusion aggregation module with visual guidance from a powerful\npretrained model to leverage spatial relations and pixel coherences to handle\nthe incomplete target masks and false positive irregular clumps which often\nappear on unseen visual entities. Extensive experiments are conducted in the\nzero-shot cross-dataset settings and the proposed approach achieves consistent\ngains compared to the state-of-the-art, e.g., 4.15\\%, 5.45\\%, and 4.64\\% mIoU\nincrease on RefCOCO, RefCOCO+ and ReferIt respectively, demonstrating its\neffectiveness. Additionally, the results on GraspNet-RIS show that our approach\nalso generalizes well to new scenarios with large domain shifts.\n","authors":["Yajie Liu","Pu Ge","Haoxiang Ma","Shichao Fan","Qingjie Liu","Di Huang","Yunhong Wang"],"pdf_url":"https://arxiv.org/pdf/2312.00452v1.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2312.00451v1","updated":"2023-12-01T09:30:02Z","published":"2023-12-01T09:30:02Z","title":"FSGS: Real-Time Few-shot View Synthesis using Gaussian Splatting","summary":" Novel view synthesis from limited observations remains an important and\npersistent task. However, high efficiency in existing NeRF-based few-shot view\nsynthesis is often compromised to obtain an accurate 3D representation. To\naddress this challenge, we propose a few-shot view synthesis framework based on\n3D Gaussian Splatting that enables real-time and photo-realistic view synthesis\nwith as few as three training views. The proposed method, dubbed FSGS, handles\nthe extremely sparse initialized SfM points with a thoughtfully designed\nGaussian Unpooling process. Our method iteratively distributes new Gaussians\naround the most representative locations, subsequently infilling local details\nin vacant areas. We also integrate a large-scale pre-trained monocular depth\nestimator within the Gaussians optimization process, leveraging online\naugmented views to guide the geometric optimization towards an optimal\nsolution. Starting from sparse points observed from limited input viewpoints,\nour FSGS can accurately grow into unseen regions, comprehensively covering the\nscene and boosting the rendering quality of novel views. Overall, FSGS achieves\nstate-of-the-art performance in both accuracy and rendering efficiency across\ndiverse datasets, including LLFF, Mip-NeRF360, and Blender. Project website:\nhttps://zehaozhu.github.io/FSGS/.\n","authors":["Zehao Zhu","Zhiwen Fan","Yifan Jiang","Zhangyang Wang"],"pdf_url":"https://arxiv.org/pdf/2312.00451v1.pdf","comment":"Project page: https://zehaozhu.github.io/FSGS/"},{"id":"http://arxiv.org/abs/2312.00438v1","updated":"2023-12-01T09:10:33Z","published":"2023-12-01T09:10:33Z","title":"Dolphins: Multimodal Language Model for Driving","summary":" The quest for fully autonomous vehicles (AVs) capable of navigating complex\nreal-world scenarios with human-like understanding and responsiveness. In this\npaper, we introduce Dolphins, a novel vision-language model architected to\nimbibe human-like abilities as a conversational driving assistant. Dolphins is\nadept at processing multimodal inputs comprising video (or image) data, text\ninstructions, and historical control signals to generate informed outputs\ncorresponding to the provided instructions. Building upon the open-sourced\npretrained Vision-Language Model, OpenFlamingo, we first enhance Dolphins's\nreasoning capabilities through an innovative Grounded Chain of Thought (GCoT)\nprocess. 
Then we tailored Dolphins to the driving domain by constructing\ndriving-specific instruction data and conducting instruction tuning. Through\nthe utilization of the BDD-X dataset, we designed and consolidated four\ndistinct AV tasks into Dolphins to foster a holistic understanding of intricate\ndriving scenarios. As a result, the distinctive features of Dolphins are\ncharacterized into two dimensions: (1) the ability to provide a comprehensive\nunderstanding of complex and long-tailed open-world driving scenarios and solve\na spectrum of AV tasks, and (2) the emergence of human-like capabilities\nincluding gradient-free instant adaptation via in-context learning and error\nrecovery via reflection.\n","authors":["Yingzi Ma","Yulong Cao","Jiachen Sun","Marco Pavone","Chaowei Xiao"],"pdf_url":"https://arxiv.org/pdf/2312.00438v1.pdf","comment":"The project page is available at https://vlm-driver.github.io/"},{"id":"http://arxiv.org/abs/2312.00435v1","updated":"2023-12-01T09:06:56Z","published":"2023-12-01T09:06:56Z","title":"Enhancing Image Captioning with Neural Models","summary":" This research explores the realm of neural image captioning using deep\nlearning models. The study investigates the performance of different neural\narchitecture configurations, focusing on the inject architecture, and proposes\na novel quality metric for evaluating caption generation. Through extensive\nexperimentation and analysis, this work sheds light on the challenges and\nopportunities in image captioning, providing insights into model behavior and\noverfitting. The results reveal that while the merge models exhibit a larger\nvocabulary and higher ROUGE scores, the inject architecture generates relevant\nand concise image captions. The study also highlights the importance of\nrefining training data and optimizing hyperparameters for improved model\nperformance. This research contributes to the growing body of knowledge in\nneural image captioning and encourages further exploration in the field,\nemphasizing the democratization of artificial intelligence.\n","authors":["Pooja Bhatnagar","Sai Mrunaal","Sachin Kamnure"],"pdf_url":"https://arxiv.org/pdf/2312.00435v1.pdf","comment":"14 pages, 14 figures"},{"id":"http://arxiv.org/abs/2312.00425v1","updated":"2023-12-01T08:47:56Z","published":"2023-12-01T08:47:56Z","title":"A Low-Power Neuromorphic Approach for Efficient Eye-Tracking","summary":" This paper introduces a neuromorphic methodology for eye tracking, harnessing\npure event data captured by a Dynamic Vision Sensor (DVS) camera. The framework\nintegrates a directly trained Spiking Neuron Network (SNN) regression model and\nleverages a state-of-the-art low power edge neuromorphic processor - Speck,\ncollectively aiming to advance the precision and efficiency of eye-tracking\nsystems. First, we introduce a representative event-based eye-tracking dataset,\n\"Ini-30\", which was collected with two glass-mounted DVS cameras from thirty\nvolunteers. Then,a SNN model, based on Integrate And Fire (IAF) neurons, named\n\"Retina\", is described , featuring only 64k parameters (6.63x fewer than the\nlatest) and achieving pupil tracking error of only 3.24 pixels in a 64x64 DVS\ninput. The continous regression output is obtained by means of convolution\nusing a non-spiking temporal 1D filter slided across the output spiking layer.\nFinally, we evaluate Retina on the neuromorphic processor, showing an\nend-to-end power between 2.89-4.8 mW and a latency of 5.57-8.01 mS dependent on\nthe time window. 
We also benchmark our model against the latest event-based\neye-tracking method, \"3ET\", which was built upon event frames. Results show\nthat Retina achieves superior precision with 1.24px less pupil centroid error\nand reduced computational complexity with 35 times fewer MAC operations. We\nhope this work will open avenues for further investigation of close-loop\nneuromorphic solutions and true event-based training pursuing edge performance.\n","authors":["Pietro Bonazzi","Sizhen Bian","Giovanni Lippolis","Yawei Li","Sadique Sheik","Michele Magno"],"pdf_url":"https://arxiv.org/pdf/2312.00425v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00416v1","updated":"2023-12-01T08:40:09Z","published":"2023-12-01T08:40:09Z","title":"Towards Explaining Satellite Based Poverty Predictions with\n Convolutional Neural Networks","summary":" Deep convolutional neural networks (CNNs) have been shown to predict poverty\nand development indicators from satellite images with surprising accuracy. This\npaper presents a first attempt at analyzing the CNNs responses in detail and\nexplaining the basis for the predictions. The CNN model, while trained on\nrelatively low resolution day- and night-time satellite images, is able to\noutperform human subjects who look at high-resolution images in ranking the\nWealth Index categories. Multiple explainability experiments performed on the\nmodel indicate the importance of the sizes of the objects, pixel colors in the\nimage, and provide a visualization of the importance of different structures in\ninput images. A visualization is also provided of type images that maximize the\nnetwork prediction of Wealth Index, which provides clues on what the CNN\nprediction is based on.\n","authors":["Hamid Sarmadi","Thorsteinn Rögnvaldsson","Nils Roger Carlsson","Mattias Ohlsson","Ibrahim Wahab","Ola Hall"],"pdf_url":"https://arxiv.org/pdf/2312.00416v1.pdf","comment":"Accepted to the DSAA 2023 conference, published in IEEE Xplore"},{"id":"http://arxiv.org/abs/2312.00414v1","updated":"2023-12-01T08:38:27Z","published":"2023-12-01T08:38:27Z","title":"Large-scale Vision-Language Models Learn Super Images for Efficient and\n High-Performance Partially Relevant Video Retrieval","summary":" In this paper, we propose an efficient and high-performance method for\npartially relevant video retrieval (PRVR), which aims to retrieve untrimmed\nlong videos that contain at least one relevant moment to the input text query.\nIn terms of both efficiency and performance, the overlooked bottleneck of\nprevious studies is the visual encoding of dense frames. This guides\nresearchers to choose lightweight visual backbones, yielding sub-optimal\nretrieval performance due to their limited capabilities of learned visual\nrepresentations. However, it is undesirable to simply replace them with\nhigh-performance large-scale vision-and-language models (VLMs) due to their low\nefficiency. To address these issues, instead of dense frames, we focus on super\nimages, which are created by rearranging the video frames in a $N \\times N$\ngrid layout. This reduces the number of visual encodings to $\\frac{1}{N^2}$ and\ncompensates for the low efficiency of large-scale VLMs, allowing us to adopt\nthem as powerful encoders. Surprisingly, we discover that with a simple\nquery-image attention trick, VLMs generalize well to super images effectively\nand demonstrate promising zero-shot performance against SOTA methods\nefficiently. 
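The partially relevant video retrieval entry above hinges on "super images": packing video frames into an N x N grid so the number of visual encodings drops by roughly a factor of N^2. A small sketch of that rearrangement is given below; padding the last grid with black frames is an assumption.

import torch

def make_super_images(frames: torch.Tensor, n: int) -> torch.Tensor:
    """Rearrange video frames into N x N grid 'super images'.

    frames: (T, C, H, W) tensor of decoded frames.
    Returns a (ceil(T / n^2), C, n*H, n*W) tensor; each output image packs
    up to n*n consecutive frames in row-major order.
    """
    t, c, h, w = frames.shape
    per_image = n * n
    pad = (-t) % per_image
    if pad:
        frames = torch.cat([frames, frames.new_zeros(pad, c, h, w)], dim=0)
    grids = frames.view(-1, n, n, c, h, w)           # (B, n, n, C, H, W)
    grids = grids.permute(0, 3, 1, 4, 2, 5)          # (B, C, n, H, n, W)
    return grids.reshape(-1, c, n * h, n * w)

video = torch.rand(30, 3, 224, 224)   # 30 frames
super_imgs = make_super_images(video, n=3)
print(super_imgs.shape)               # torch.Size([4, 3, 672, 672])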
In addition, we propose a fine-tuning approach by incorporating a\nfew trainable modules into the VLM backbones. The experimental results\ndemonstrate that our approaches efficiently achieve the best performance on\nActivityNet Captions and TVR.\n","authors":["Taichi Nishimura","Shota Nakada","Masayoshi Kondo"],"pdf_url":"https://arxiv.org/pdf/2312.00414v1.pdf","comment":"13 pages, 11 figures"},{"id":"http://arxiv.org/abs/2312.00412v1","updated":"2023-12-01T08:22:34Z","published":"2023-12-01T08:22:34Z","title":"SCHEME: Scalable Channer Mixer for Vision Transformers","summary":" Vision Transformers have received significant attention due to their\nimpressive performance in many vision tasks. While the token mixer or attention\nblock has been studied in great detail, the channel mixer or feature mixing\nblock (FFN or MLP) has not been explored in depth albeit it accounts for a bulk\nof the parameters and computation in a model. In this work, we study whether\nsparse feature mixing can replace the dense connections and confirm this with a\nblock diagonal MLP structure that improves the accuracy by supporting larger\nexpansion ratios. To improve the feature clusters formed by this structure and\nthereby further improve the accuracy, a lightweight, parameter-free, channel\ncovariance attention (CCA) mechanism is introduced as a parallel branch during\ntraining. This design of CCA enables gradual feature mixing across channel\ngroups during training whose contribution decays to zero as the training\nprogresses to convergence. This allows the CCA block to be discarded during\ninference, thus enabling enhanced performance with no additional computational\ncost. The resulting $\\textit{Scalable CHannEl MixEr}$ (SCHEME) can be plugged\ninto any ViT architecture to obtain a gamut of models with different trade-offs\nbetween complexity and performance by controlling the block diagonal structure\nsize in the MLP. This is shown by the introduction of a new family of\nSCHEMEformer models. Experiments on image classification, object detection, and\nsemantic segmentation, with different ViT backbones, consistently demonstrate\nsubstantial accuracy gains over existing designs, especially under lower FLOPs\nregimes. For example, the SCHEMEformer establishes a new SOTA of 79.7% accuracy\nfor ViTs using pure attention mixers on ImageNet-1K at 1.77G FLOPs.\n","authors":["Deepak Sridhar","Yunsheng Li","Nuno Vasconcelos"],"pdf_url":"https://arxiv.org/pdf/2312.00412v1.pdf","comment":"Preprint. Under review"},{"id":"http://arxiv.org/abs/2211.05781v2","updated":"2023-12-01T08:00:51Z","published":"2022-11-10T18:59:43Z","title":"Demystify Transformers & Convolutions in Modern Image Deep Networks","summary":" Vision transformers have gained popularity recently, leading to the\ndevelopment of new vision backbones with improved features and consistent\nperformance gains. However, these advancements are not solely attributable to\nnovel feature transformation designs; certain benefits also arise from advanced\nnetwork-level and block-level architectures. This paper aims to identify the\nreal gains of popular convolution and attention operators through a detailed\nstudy. We find that the key difference among these feature transformation\nmodules, such as attention or convolution, lies in their spatial feature\naggregation approach, known as the \"spatial token mixer\" (STM). 
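The SCHEME entry above replaces the dense channel-mixing MLP of a ViT block with a block-diagonal one, which keeps parameters and FLOPs down while allowing larger expansion ratios. A sketch of such a block-diagonal channel mixer, implemented with grouped 1x1 convolutions, is given below; the group count and expansion ratio are assumptions, and the paper's channel covariance attention branch is not reproduced here.

import torch
import torch.nn as nn

class BlockDiagonalMLP(nn.Module):
    """Sketch of a channel-mixing MLP whose weight matrices are block diagonal.

    Splitting the channels into `groups` independent blocks makes the two
    projections sparse, so a larger expansion ratio costs roughly 1/groups
    of the dense-MLP parameters.
    """

    def __init__(self, dim: int, expansion: int = 8, groups: int = 4):
        super().__init__()
        assert dim % groups == 0
        self.fc1 = nn.Conv1d(dim, dim * expansion, kernel_size=1, groups=groups)
        self.act = nn.GELU()
        self.fc2 = nn.Conv1d(dim * expansion, dim, kernel_size=1, groups=groups)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, tokens, channels) as in a ViT block.
        x = x.transpose(1, 2)          # (batch, channels, tokens) for Conv1d
        x = self.fc2(self.act(self.fc1(x)))
        return x.transpose(1, 2)

tokens = torch.randn(2, 197, 384)      # e.g. a ViT-S token sequence
mixer = BlockDiagonalMLP(384)
print(mixer(tokens).shape)             # torch.Size([2, 197, 384])

dense_params = 2 * 384 * 384 * 8       # weights of the equivalent dense MLP (ignoring biases)
sparse_params = sum(p.numel() for p in mixer.parameters() if p.dim() > 1)
print(f"weights: dense {dense_params:,} vs block-diagonal {sparse_params:,}")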
To facilitate\nan impartial comparison, we introduce a unified architecture to neutralize the\nimpact of divergent network-level and block-level designs. Subsequently,\nvarious STMs are integrated into this unified framework for comprehensive\ncomparative analysis. Our experiments on various tasks and an analysis of\ninductive bias show a significant performance boost due to advanced\nnetwork-level and block-level designs, but performance differences persist\namong different STMs. Our detailed analysis also reveals various findings about\ndifferent STMs, such as effective receptive fields and invariance tests. All\nmodels and codes used in this study are publicly available at\n\\url{https://github.com/OpenGVLab/STM-Evaluation}.\n","authors":["Xiaowei Hu","Min Shi","Weiyun Wang","Sitong Wu","Linjie Xing","Wenhai Wang","Xizhou Zhu","Lewei Lu","Jie Zhou","Xiaogang Wang","Yu Qiao","Jifeng Dai"],"pdf_url":"https://arxiv.org/pdf/2211.05781v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16911v2","updated":"2023-12-01T07:55:16Z","published":"2023-08-31T17:59:46Z","title":"PointLLM: Empowering Large Language Models to Understand Point Clouds","summary":" The unprecedented advancements in Large Language Models (LLMs) have shown a\nprofound impact on natural language processing but are yet to fully embrace the\nrealm of 3D understanding. This paper introduces PointLLM, a preliminary effort\nto fill this gap, enabling LLMs to understand point clouds and offering a new\navenue beyond 2D visual data. PointLLM understands colored object point clouds\nwith human instructions and generates contextually appropriate responses,\nillustrating its grasp of point clouds and common sense. Specifically, it\nleverages a point cloud encoder with a powerful LLM to effectively fuse\ngeometric, appearance, and linguistic information. We collect a novel dataset\ncomprising 660K simple and 70K complex point-text instruction pairs to enable a\ntwo-stage training strategy: aligning latent spaces and subsequently\ninstruction-tuning the unified model. To rigorously evaluate the perceptual and\ngeneralization capabilities of PointLLM, we establish two benchmarks:\nGenerative 3D Object Classification and 3D Object Captioning, assessed through\nthree different methods, including human evaluation, GPT-4/ChatGPT evaluation,\nand traditional metrics. Experimental results reveal PointLLM's superior\nperformance over existing 2D and 3D baselines, with a notable achievement in\nhuman-evaluated object captioning tasks where it surpasses human annotators in\nover 50% of the samples. Codes, datasets, and benchmarks are available at\nhttps://github.com/OpenRobotLab/PointLLM .\n","authors":["Runsen Xu","Xiaolong Wang","Tai Wang","Yilun Chen","Jiangmiao Pang","Dahua Lin"],"pdf_url":"https://arxiv.org/pdf/2308.16911v2.pdf","comment":"28 pages. Empowering large language models with 3D point cloud\n understanding, accompanied by a novel dataset and carefully designed\n benchmarks. Project page: https://runsenxu.com/projects/PointLLM"},{"id":"http://arxiv.org/abs/2210.14611v2","updated":"2023-12-01T07:53:28Z","published":"2022-10-26T10:34:20Z","title":"Automatic Diagnosis of Myocarditis Disease in Cardiac MRI Modality using\n Deep Transformers and Explainable Artificial Intelligence","summary":" Myocarditis is a significant cardiovascular disease (CVD) that poses a threat\nto the health of many individuals by causing damage to the myocardium. 
The\noccurrence of microbes and viruses, including the likes of HIV, plays a crucial\nrole in the development of myocarditis disease (MCD). The images produced\nduring cardiac magnetic resonance imaging (CMRI) scans are low contrast, which\ncan make it challenging to diagnose cardiovascular diseases. In other hand,\nchecking numerous CMRI slices for each CVD patient can be a challenging task\nfor medical doctors. To overcome the existing challenges, researchers have\nsuggested the use of artificial intelligence (AI)-based computer-aided\ndiagnosis systems (CADS). The presented paper outlines a CADS for the detection\nof MCD from CMR images, utilizing deep learning (DL) methods. The proposed CADS\nconsists of several steps, including dataset, preprocessing, feature\nextraction, classification, and post-processing. First, the Z-Alizadeh dataset\nwas selected for the experiments. Subsequently, the CMR images underwent\nvarious preprocessing steps, including denoising, resizing, as well as data\naugmentation (DA) via CutMix and MixUp techniques. In the following, the most\ncurrent deep pre-trained and transformer models are used for feature extraction\nand classification on the CMR images. The findings of our study reveal that\ntransformer models exhibit superior performance in detecting MCD as opposed to\npre-trained architectures. In terms of DL architectures, the Turbulence Neural\nTransformer (TNT) model exhibited impressive accuracy, reaching 99.73%\nutilizing a 10-fold cross-validation approach. Additionally, to pinpoint areas\nof suspicion for MCD in CMRI images, the Explainable-based Grad Cam method was\nemployed.\n","authors":["Mahboobeh Jafari","Afshin Shoeibi","Navid Ghassemi","Jonathan Heras","Sai Ho Ling","Amin Beheshti","Yu-Dong Zhang","Shui-Hua Wang","Roohallah Alizadehsani","Juan M. Gorriz","U. Rajendra Acharya","Hamid Alinejad Rokny"],"pdf_url":"https://arxiv.org/pdf/2210.14611v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17465v2","updated":"2023-12-01T07:52:41Z","published":"2023-11-29T09:13:00Z","title":"AgentAvatar: Disentangling Planning, Driving and Rendering for\n Photorealistic Avatar Agents","summary":" In this study, our goal is to create interactive avatar agents that can\nautonomously plan and animate nuanced facial movements realistically, from both\nvisual and behavioral perspectives. Given high-level inputs about the\nenvironment and agent profile, our framework harnesses LLMs to produce a series\nof detailed text descriptions of the avatar agents' facial motions. These\ndescriptions are then processed by our task-agnostic driving engine into motion\ntoken sequences, which are subsequently converted into continuous motion\nembeddings that are further consumed by our standalone neural-based renderer to\ngenerate the final photorealistic avatar animations. These streamlined\nprocesses allow our framework to adapt to a variety of non-verbal avatar\ninteractions, both monadic and dyadic. Our extensive study, which includes\nexperiments on both newly compiled and existing datasets featuring two types of\nagents -- one capable of monadic interaction with the environment, and the\nother designed for dyadic conversation -- validates the effectiveness and\nversatility of our approach. 
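The myocarditis-detection entry above lists MixUp (alongside CutMix) among its preprocessing/augmentation steps. For reference, a minimal sketch of standard MixUp on an image batch is given below; the Beta parameter alpha and the toy batch are assumptions, since the abstract does not give the settings used.

import torch

def mixup(images: torch.Tensor, labels: torch.Tensor, alpha: float = 0.4):
    """Standard MixUp: blend each sample with a randomly paired one.

    images: (B, C, H, W), labels: (B, num_classes) one-hot / soft labels.
    """
    lam = torch.distributions.Beta(alpha, alpha).sample().item()
    perm = torch.randperm(images.size(0))
    mixed_images = lam * images + (1.0 - lam) * images[perm]
    mixed_labels = lam * labels + (1.0 - lam) * labels[perm]
    return mixed_images, mixed_labels

x = torch.rand(8, 1, 224, 224)                                 # toy batch of CMR slices
y = torch.nn.functional.one_hot(torch.randint(0, 2, (8,)), 2).float()
xm, ym = mixup(x, y)
print(xm.shape, ym.shape)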
To our knowledge, we advanced a leap step by\ncombining LLMs and neural rendering for generalized non-verbal prediction and\nphoto-realistic rendering of avatar agents.\n","authors":["Duomin Wang","Bin Dai","Yu Deng","Baoyuan Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17465v2.pdf","comment":"Project page: https://dorniwang.github.io/AgentAvatar_project/; 26\n pages"},{"id":"http://arxiv.org/abs/2312.00401v1","updated":"2023-12-01T07:50:53Z","published":"2023-12-01T07:50:53Z","title":"VIoTGPT: Learning to Schedule Vision Tools towards Intelligent Video\n Internet of Things","summary":" Video Internet of Things (VIoT) has shown full potential in collecting an\nunprecedented volume of video data. Learning to schedule perceiving models and\nanalyzing the collected videos intelligently will be potential sparks for VIoT.\nIn this paper, to address the challenges posed by the fine-grained and\ninterrelated vision tool usage of VIoT, we build VIoTGPT, the framework based\non LLMs to correctly interact with humans, query knowledge videos, and invoke\nvision models to accomplish complicated tasks. To support VIoTGPT and related\nfuture works, we meticulously crafted the training dataset and established\nbenchmarks involving 11 representative vision models across three categories\nbased on semi-automatic annotations. To guide LLM to act as the intelligent\nagent towards intelligent VIoT, we resort to ReAct instruction tuning based on\nthe collected VIoT dataset to learn the tool capability. Quantitative and\nqualitative experimental results and analyses demonstrate the effectiveness of\nVIoTGPT.\n","authors":["Yaoyao Zhong","Mengshi Qi","Rui Wang","Yuhan Qiu","Yang Zhang","Huadong Ma"],"pdf_url":"https://arxiv.org/pdf/2312.00401v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00398v1","updated":"2023-12-01T07:45:27Z","published":"2023-12-01T07:45:27Z","title":"Learning to Estimate Critical Gait Parameters from Single-View RGB\n Videos with Transformer-Based Attention Network","summary":" Musculoskeletal diseases and cognitive impairments in patients lead to\ndifficulties in movement as well as negative effects on their psychological\nhealth. Clinical gait analysis, a vital tool for early diagnosis and treatment,\ntraditionally relies on expensive optical motion capture systems. Recent\nadvances in computer vision and deep learning have opened the door to more\naccessible and cost-effective alternatives. This paper introduces a novel\nspatio-temporal Transformer network to estimate critical gait parameters from\nRGB videos captured by a single-view camera. Empirical evaluations on a public\ndataset of cerebral palsy patients indicate that the proposed framework\nsurpasses current state-of-the-art approaches and show significant improvements\nin predicting general gait parameters (including Walking Speed, Gait Deviation\nIndex - GDI, and Knee Flexion Angle at Maximum Extension), while utilizing\nfewer parameters and alleviating the need for manual feature extraction.\n","authors":["Quoc Hung T. Le","Hieu H. 
Pham"],"pdf_url":"https://arxiv.org/pdf/2312.00398v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15890v2","updated":"2023-12-01T07:39:02Z","published":"2023-11-27T14:56:47Z","title":"Stability-Informed Initialization of Neural Ordinary Differential\n Equations","summary":" This paper addresses the training of Neural Ordinary Differential Equations\n(neural ODEs), and in particular explores the interplay between numerical\nintegration techniques, stability regions, step size, and initialization\ntechniques. It is shown how the choice of integration technique implicitly\nregularizes the learned model, and how the solver's corresponding stability\nregion affects training and prediction performance. From this analysis, a\nstability-informed parameter initialization technique is introduced. The\neffectiveness of the initialization method is displayed across several learning\nbenchmarks and industrial applications.\n","authors":["Theodor Westny","Arman Mohammadi","Daniel Jung","Erik Frisk"],"pdf_url":"https://arxiv.org/pdf/2311.15890v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.13325v3","updated":"2023-12-01T07:29:42Z","published":"2023-06-23T07:05:08Z","title":"Differentiable Display Photometric Stereo","summary":" Photometric stereo leverages variations in illumination conditions to\nreconstruct surface normals. Display photometric stereo, which employs a\nconventional monitor as an illumination source, has the potential to overcome\nlimitations often encountered in bulky and difficult-to-use conventional\nsetups. In this paper, we present differentiable display photometric stereo\n(DDPS), addressing an often overlooked challenge in display photometric stereo:\nthe design of display patterns. Departing from using heuristic display\npatterns, DDPS learns the display patterns that yield accurate normal\nreconstruction for a target system in an end-to-end manner. To this end, we\npropose a differentiable framework that couples basis-illumination image\nformation with analytic photometric-stereo reconstruction. The differentiable\nframework facilitates the effective learning of display patterns via\nauto-differentiation. Also, for training supervision, we propose to use 3D\nprinting for creating a real-world training dataset, enabling accurate\nreconstruction on the target real-world setup. Finally, we exploit that\nconventional LCD monitors emit polarized light, which allows for the optical\nseparation of diffuse and specular reflections when combined with a\npolarization camera, leading to accurate normal reconstruction. Extensive\nevaluation of DDPS shows improved normal-reconstruction accuracy compared to\nheuristic patterns and demonstrates compelling properties such as robustness to\npattern initialization, calibration errors, and simplifications in image\nformation and reconstruction.\n","authors":["Seokjun Choi","Seungwoo Yoon","Giljoo Nam","Seungyong Lee","Seung-Hwan Baek"],"pdf_url":"https://arxiv.org/pdf/2306.13325v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00392v1","updated":"2023-12-01T07:29:30Z","published":"2023-12-01T07:29:30Z","title":"Study and Survey on Gesture Recognition Systems","summary":" In recent years, there has been a considerable amount of research in the\nGesture Recognition domain, mainly owing to the technological advancements in\nComputer Vision. Various new applications have been conceptualised and\ndeveloped in this field. 
This paper discusses the implementation of gesture\nrecognition systems in multiple sectors such as gaming, healthcare, home\nappliances, industrial robots, and virtual reality. Different methodologies for\ncapturing gestures are compared and contrasted throughout this survey. Various\ndata sources and data acquisition techniques have been discussed. The role of\ngestures in sign language has been studied and existing approaches have been\nreviewed. Common challenges faced while building gesture recognition systems\nhave also been explored.\n","authors":["Kshitij Deshpande","Varad Mashalkar","Kaustubh Mhaisekar","Amaan Naikwadi","Archana Ghotkar"],"pdf_url":"https://arxiv.org/pdf/2312.00392v1.pdf","comment":"6 pages, accepted at the ICCUBEA, IEEE 2023 conference"},{"id":"http://arxiv.org/abs/2311.04591v3","updated":"2023-12-01T07:26:35Z","published":"2023-11-08T10:45:09Z","title":"Rethinking Event-based Human Pose Estimation with 3D Event\n Representations","summary":" Human pose estimation is a fundamental and appealing task in computer vision.\nTraditional frame-based cameras and videos are commonly applied, yet they\nbecome less reliable in scenarios under high dynamic range or heavy motion\nblur. In contrast, event cameras offer a robust solution for navigating these\nchallenging contexts. Predominant methodologies incorporate event cameras into\nlearning frameworks by accumulating events into event frames. However, such\nmethods tend to marginalize the intrinsic asynchronous and high temporal\nresolution characteristics of events. This disregard leads to a loss in\nessential temporal dimension data, crucial for discerning distinct actions. To\naddress this issue and to unlock the 3D potential of event information, we\nintroduce two 3D event representations: the Rasterized Event Point Cloud\n(RasEPC) and the Decoupled Event Voxel (DEV). The RasEPC collates events within\nconcise temporal slices at identical positions, preserving 3D attributes with\nstatistical cues and markedly mitigating memory and computational demands.\nMeanwhile, the DEV representation discretizes events into voxels and projects\nthem across three orthogonal planes, utilizing decoupled event attention to\nretrieve 3D cues from the 2D planes. Furthermore, we develop and release\nEV-3DPW, a synthetic event-based dataset crafted to facilitate training and\nquantitative analysis in outdoor scenes. On the public real-world DHP19\ndataset, our event point cloud technique excels in real-time mobile\npredictions, while the decoupled event voxel method achieves the highest\naccuracy. Experiments on EV-3DPW demonstrate the robustness of our\nproposed 3D representation methods compared to traditional RGB images and event\nframe techniques under the same backbones. Our code and dataset have been made\npublicly available at https://github.com/MasterHow/EventPointPose.\n","authors":["Xiaoting Yin","Hao Shi","Jiaan Chen","Ze Wang","Yaozu Ye","Huajian Ni","Kailun Yang","Kaiwei Wang"],"pdf_url":"https://arxiv.org/pdf/2311.04591v3.pdf","comment":"Extended version of arXiv:2206.04511. 
The code and dataset are\n available at https://github.com/MasterHow/EventPointPose"},{"id":"http://arxiv.org/abs/2312.00387v1","updated":"2023-12-01T07:17:12Z","published":"2023-12-01T07:17:12Z","title":"Partition-based K-space Synthesis for Multi-contrast Parallel Imaging","summary":" Multi-contrast magnetic resonance imaging is a significant and essential\nmedical imaging technique. However, multi-contrast imaging has a longer\nacquisition time and is prone to motion artifacts. In particular, the\nacquisition time for a T2-weighted image is prolonged due to its longer\nrepetition time (TR). In contrast, the T1-weighted image has a shorter TR.\nTherefore, utilizing complementary information across T1- and T2-weighted images\nis a way to decrease the overall imaging time. Previous T1-assisted T2\nreconstruction methods have mostly focused on the image domain, using\nwhole-image-based fusion approaches. The image-domain reconstruction method suffers from\nhigh computational complexity and limited flexibility. To address this\nissue, we propose a novel multi-contrast imaging method called partition-based\nk-space synthesis (PKS) which can achieve superior reconstruction quality of the\nT2-weighted image by feature fusion. Concretely, we first decompose\nfully-sampled T1 k-space data and under-sampled T2 k-space data into two\nsub-datasets, separately. Then two new objects are constructed by combining the two\nsub-T1/T2 data. After that, the two new objects are used as the whole data to realize\nthe reconstruction of the T2-weighted image. Finally, the target T2 is\nsynthesized by extracting the sub-T2 data of each part. Experimental results\nshowed that our combined technique can achieve comparable or better results\nthan using traditional k-space parallel imaging (SAKE) that processes each\ncontrast independently.\n","authors":["Yuxia Huang","Zhonghui Wu","Xiaoling Xu","Minghui Zhang","Shanshan Wang","Qiegen Liu"],"pdf_url":"https://arxiv.org/pdf/2312.00387v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00386v1","updated":"2023-12-01T07:15:51Z","published":"2023-12-01T07:15:51Z","title":"Local monotone operator learning using non-monotone operators: MnM-MOL","summary":" The recovery of magnetic resonance (MR) images from undersampled measurements\nis a key problem that has seen extensive research in recent years. Unrolled\napproaches, which rely on end-to-end training of convolutional neural network\n(CNN) blocks within iterative reconstruction algorithms, offer state-of-the-art\nperformance. These algorithms require a large amount of memory during training,\nmaking them difficult to employ in high-dimensional applications. Deep\nequilibrium (DEQ) models and the recent monotone operator learning (MOL)\napproach were introduced to eliminate the need for unrolling, thus reducing the\nmemory demand during training. Both approaches require a Lipschitz constraint\non the network to ensure that the forward and backpropagation iterations\nconverge. Unfortunately, the constraint often results in reduced performance\ncompared to unrolled methods. The main focus of this work is to relax the\nconstraint on the CNN block in two different ways. Inspired by\nconvex-non-convex regularization strategies, we now impose the monotone\nconstraint on the sum of the gradient of the data term and the CNN block,\nrather than constrain the CNN itself to be a monotone operator. This approach\nenables the CNN to learn possibly non-monotone score functions, which can\ntranslate to improved performance. 
In addition, we only restrict the operator\nto be monotone in a local neighborhood around the image manifold. Our\ntheoretical results show that the proposed algorithm is guaranteed to converge\nto the fixed point and that the solution is robust to input perturbations,\nprovided that it is initialized close to the true solution. Our empirical\nresults show that the relaxed constraints translate to improved performance and\nthat the approach enjoys robustness to input perturbations similar to MOL.\n","authors":["Maneesh John","Jyothi Rikhab Chand","Mathews Jacob"],"pdf_url":"https://arxiv.org/pdf/2312.00386v1.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2312.00377v1","updated":"2023-12-01T06:48:03Z","published":"2023-12-01T06:48:03Z","title":"SynFundus: Generating a synthetic fundus images dataset with millions of\n samples and multi-disease annotations","summary":" In the field of medical imaging, the scarcity of large-scale datasets due to\nprivacy restrictions stands as a significant barrier to developing large models\nfor medical applications. To address this issue, we introduce SynFundus-1M, a high-quality\nsynthetic dataset with over 1 million retinal fundus images and extensive\ndisease and pathology annotations, which is generated by a Denoising\nDiffusion Probabilistic Model. The SynFundus-Generator and SynFundus-1M achieve\nsuperior Frechet Inception Distance (FID) scores compared to existing methods\non mainstream public real datasets. Furthermore, an evaluation by ophthalmologists\nvalidates the difficulty of discerning these synthetic images from\nreal ones, confirming SynFundus-1M's authenticity. Through extensive\nexperiments, we demonstrate that both CNNs and ViTs can benefit from SynFundus-1M\nby pretraining or training directly. Compared to datasets like ImageNet or\nEyePACS, models trained on SynFundus-1M not only achieve better performance but\nalso converge faster on various downstream tasks.\n","authors":["Fangxin Shang","Jie Fu","Yehui Yang","Lei Ma"],"pdf_url":"https://arxiv.org/pdf/2312.00377v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00375v1","updated":"2023-12-01T06:36:23Z","published":"2023-12-01T06:36:23Z","title":"Text-Guided 3D Face Synthesis -- From Generation to Editing","summary":" Text-guided 3D face synthesis has achieved remarkable results by leveraging\ntext-to-image (T2I) diffusion models. However, most existing works focus solely\non direct generation and ignore editing, which prevents them from\nsynthesizing customized 3D faces through iterative adjustments. In this paper,\nwe propose a unified text-guided framework from face generation to editing. In\nthe generation stage, we propose a geometry-texture decoupled generation to\nmitigate the loss of geometric details caused by coupling. Besides, decoupling\nenables us to utilize the generated geometry as a condition for texture\ngeneration, yielding highly geometry-texture aligned results. We further employ\na fine-tuned texture diffusion model to enhance texture quality in both RGB and\nYUV space. In the editing stage, we first employ a pre-trained diffusion model\nto update facial geometry or texture based on the texts. To enable sequential\nediting, we introduce a UV domain consistency preservation regularization,\npreventing unintentional changes to irrelevant facial attributes. Besides, we\npropose a self-guided consistency weight strategy to improve editing efficacy\nwhile preserving consistency. 
Through comprehensive experiments, we showcase\nour method's superiority in face synthesis. Project page:\nhttps://faceg2e.github.io/.\n","authors":["Yunjie Wu","Yapeng Meng","Zhipeng Hu","Lincheng Li","Haoqian Wu","Kun Zhou","Weiwei Xu","Xin Yu"],"pdf_url":"https://arxiv.org/pdf/2312.00375v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.02003v2","updated":"2023-12-01T06:23:15Z","published":"2022-12-05T03:26:08Z","title":"Bayesian Learning with Information Gain Provably Bounds Risk for a\n Robust Adversarial Defense","summary":" We present a new algorithm to learn a deep neural network model robust\nagainst adversarial attacks. Previous algorithms demonstrate that an adversarially\ntrained Bayesian Neural Network (BNN) provides improved robustness. We\nrecognize that the adversarial learning approach for approximating the multi-modal\nposterior distribution of a Bayesian model can lead to mode collapse;\nconsequently, the model's achievements in robustness and performance are\nsub-optimal. Instead, we first propose preventing mode collapse to better\napproximate the multi-modal posterior distribution. Second, based on the\nintuition that a robust model should ignore perturbations and only consider the\ninformative content of the input, we conceptualize and formulate an information\ngain objective to measure and force the information learned from both benign\nand adversarial training instances to be similar. Importantly, we prove and\ndemonstrate that minimizing the information gain objective allows the\nadversarial risk to approach the conventional empirical risk. We believe our\nefforts provide a step toward a basis for a principled method of adversarially\ntraining BNNs. Our model demonstrates significantly improved robustness--up to\n20%--compared with adversarial training and Adv-BNN under PGD attacks with\n0.035 distortion on both CIFAR-10 and STL-10 datasets.\n","authors":["Bao Gia Doan","Ehsan Abbasnejad","Javen Qinfeng Shi","Damith C. Ranasinghe"],"pdf_url":"https://arxiv.org/pdf/2212.02003v2.pdf","comment":"Published at ICML 2022. Code is available at\n https://github.com/baogiadoan/IG-BNN"},{"id":"http://arxiv.org/abs/2312.00364v1","updated":"2023-12-01T06:11:14Z","published":"2023-12-01T06:11:14Z","title":"Benchmarking Multi-Domain Active Learning on Image Classification","summary":" Active learning aims to enhance model performance by strategically labeling\ninformative data points. While extensively studied, its effectiveness on\nlarge-scale, real-world datasets remains underexplored. Existing research\nprimarily focuses on single-source data, ignoring the multi-domain nature of\nreal-world data. We introduce a multi-domain active learning benchmark to\nbridge this gap. Our benchmark demonstrates that traditional single-domain\nactive learning strategies are often less effective than random selection in\nmulti-domain scenarios. We also introduce CLIP-GeoYFCC, a novel large-scale\nimage dataset built around geographical domains, in contrast to existing\ngenre-based domain datasets. Analysis on our benchmark shows that all\nmulti-domain strategies exhibit significant tradeoffs, with no strategy\noutperforming across all datasets or all metrics, emphasizing the need for\nfuture research.\n","authors":["Jiayi Li","Rohan Taori","Tatsunori B. 
Hashimoto"],"pdf_url":"https://arxiv.org/pdf/2312.00364v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00362v1","updated":"2023-12-01T05:59:08Z","published":"2023-12-01T05:59:08Z","title":"Dancing with Images: Video Distillation via Static-Dynamic\n Disentanglement","summary":" Recently, dataset distillation has paved the way towards efficient machine\nlearning, especially for image datasets. However, the distillation for videos,\ncharacterized by an exclusive temporal dimension, remains an underexplored\ndomain. In this work, we provide the first systematic study of video\ndistillation and introduce a taxonomy to categorize temporal compression. Our\ninvestigation reveals that the temporal information is usually not well learned\nduring distillation, and the temporal dimension of synthetic data contributes\nlittle. The observations motivate our unified framework of disentangling the\ndynamic and static information in the videos. It first distills the videos into\nstill images as static memory and then compensates for the dynamic and motion\ninformation with a learnable dynamic memory block. Our method achieves\nstate-of-the-art performance on video datasets at different scales, with notably smaller\nstorage expenditure. Our code will be publicly available.\n","authors":["Ziyu Wang","Yue Xu","Cewu Lu","Yong-Lu Li"],"pdf_url":"https://arxiv.org/pdf/2312.00362v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00360v1","updated":"2023-12-01T05:50:44Z","published":"2023-12-01T05:50:44Z","title":"Efficient Multimodal Semantic Segmentation via Dual-Prompt Learning","summary":" Multimodal (e.g., RGB-Depth/RGB-Thermal) fusion has shown great potential for\nimproving semantic segmentation in complex scenes (e.g., indoor/low-light\nconditions). Existing approaches often fully fine-tune a dual-branch\nencoder-decoder framework with a complicated feature fusion strategy for\nachieving multimodal semantic segmentation, which is training-costly due to the\nmassive parameter updates in feature extraction and fusion. To address this\nissue, we propose a surprisingly simple yet effective dual-prompt learning\nnetwork (dubbed DPLNet) for training-efficient multimodal (e.g., RGB-D/T)\nsemantic segmentation. The core of DPLNet is to directly adapt a frozen\npre-trained RGB model to multimodal semantic segmentation, reducing parameter\nupdates. For this purpose, we present two prompt learning modules, comprising\nmultimodal prompt generator (MPG) and multimodal feature adapter (MFA). MPG\nworks to fuse the features from different modalities in a compact manner and is\ninserted from shallow to deep stages to generate the multi-level multimodal\nprompts that are injected into the frozen backbone, while MFA adapts the prompted\nmultimodal features in the frozen backbone for better multimodal semantic\nsegmentation. Since both the MPG and MFA are lightweight, only a few trainable\nparameters (3.88M, 4.4% of the pre-trained backbone parameters) are introduced\nfor multimodal feature fusion and learning. Using a simple decoder (3.27M\nparameters), DPLNet achieves new state-of-the-art performance or is on a par\nwith other complex approaches on four RGB-D/T semantic segmentation datasets\nwhile satisfying parameter efficiency. Moreover, we show that DPLNet is general\nand applicable to other multimodal tasks such as salient object detection and\nvideo semantic segmentation. Without special design, DPLNet outperforms many\ncomplicated models. 
Our code will be available at\ngithub.com/ShaohuaDong2021/DPLNet.\n","authors":["Shaohua Dong","Yunhe Feng","Qing Yang","Yan Huang","Dongfang Liu","Heng Fan"],"pdf_url":"https://arxiv.org/pdf/2312.00360v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18670v2","updated":"2023-12-01T05:37:12Z","published":"2023-05-30T01:00:31Z","title":"SAVE: Spectral-Shift-Aware Adaptation of Image Diffusion Models for\n Text-driven Video Editing","summary":" Text-to-Image (T2I) diffusion models have achieved remarkable success in\nsynthesizing high-quality images conditioned on text prompts. Recent methods\nhave tried to replicate the success by either training text-to-video (T2V)\nmodels on a very large number of text-video pairs or adapting T2I models on\ntext-video pairs independently. Although the latter is computationally less\nexpensive, it still takes a significant amount of time for per-video adaption.\nTo address this issue, we propose SAVE, a novel spectral-shift-aware adaptation\nframework, in which we fine-tune the spectral shift of the parameter space\ninstead of the parameters themselves. Specifically, we take the spectral\ndecomposition of the pre-trained T2I weights and only update the singular\nvalues while freezing the corresponding singular vectors. In addition, we\nintroduce a spectral shift regularizer aimed at placing tighter constraints on\nlarger singular values compared to smaller ones. This form of regularization\nenables the model to grasp finer details within the video that align with the\nprovided textual descriptions. We also offer theoretical justification for our\nproposed regularization technique. Since we are only dealing with spectral\nshifts, the proposed method reduces the adaptation time significantly (approx.\n10 times) and has fewer resource constraints for training. Such attributes\nposit SAVE to be more suitable for real-world applications, e.g. editing\nundesirable content during video streaming. We validate the effectiveness of\nSAVE with an extensive experimental evaluation under different settings, e.g.\nstyle transfer, object replacement, privacy preservation, etc.\n","authors":["Nazmul Karim","Umar Khalid","Mohsen Joneidi","Chen Chen","Nazanin Rahnavard"],"pdf_url":"https://arxiv.org/pdf/2305.18670v2.pdf","comment":"11 pages, 10 figures"},{"id":"http://arxiv.org/abs/2312.00358v1","updated":"2023-12-01T05:28:19Z","published":"2023-12-01T05:28:19Z","title":"Impact of Data Augmentation on QCNNs","summary":" In recent years, Classical Convolutional Neural Networks (CNNs) have been\napplied for image recognition successfully. Quantum Convolutional Neural\nNetworks (QCNNs) are proposed as a novel generalization to CNNs by using\nquantum mechanisms. The quantum mechanisms lead to an efficient training\nprocess in QCNNs by reducing the size of input from $N$ to $log_2N$. This paper\nimplements and compares both CNNs and QCNNs by testing losses and prediction\naccuracy on three commonly used datasets. The datasets include the MNIST\nhand-written digits, Fashion MNIST and cat/dog face images. Additionally, data\naugmentation (DA), a technique commonly used in CNNs to improve the performance\nof classification by generating similar images based on original inputs, is\nalso implemented in QCNNs. Surprisingly, the results showed that data\naugmentation didn't improve QCNNs performance. 
The reasons and logic behind\nthis result are discussed, hoping to expand our understanding of Quantum\nmachine learning theory.\n","authors":["Leting Zhouli","Peiyong Wang","Udaya Parampalli"],"pdf_url":"https://arxiv.org/pdf/2312.00358v1.pdf","comment":"14 pages, 9 figures"},{"id":"http://arxiv.org/abs/2312.00357v1","updated":"2023-12-01T05:27:29Z","published":"2023-12-01T05:27:29Z","title":"A Generalizable Deep Learning System for Cardiac MRI","summary":" Cardiac MRI allows for a comprehensive assessment of myocardial structure,\nfunction, and tissue characteristics. Here we describe a foundational vision\nsystem for cardiac MRI, capable of representing the breadth of human\ncardiovascular disease and health. Our deep learning model is trained via\nself-supervised contrastive learning, by which visual concepts in cine-sequence\ncardiac MRI scans are learned from the raw text of the accompanying radiology\nreports. We train and evaluate our model on data from four large academic\nclinical institutions in the United States. We additionally showcase the\nperformance of our models on the UK BioBank, and two additional publicly\navailable external datasets. We explore emergent zero-shot capabilities of our\nsystem, and demonstrate remarkable performance across a range of tasks;\nincluding the problem of left ventricular ejection fraction regression, and the\ndiagnosis of 35 different conditions such as cardiac amyloidosis and\nhypertrophic cardiomyopathy. We show that our deep learning system is capable\nof not only understanding the staggering complexity of human cardiovascular\ndisease, but can be directed towards clinical problems of interest yielding\nimpressive, clinical grade diagnostic accuracy with a fraction of the training\ndata typically required for such tasks.\n","authors":["Rohan Shad","Cyril Zakka","Dhamanpreet Kaur","Robyn Fong","Ross Warren Filice","John Mongan","Kimberly Kalianos","Nishith Khandwala","David Eng","Matthew Leipzig","Walter Witschey","Alejandro de Feria","Victor Ferrari","Euan Ashley","Michael A. Acker","Curtis Langlotz","William Hiesinger"],"pdf_url":"https://arxiv.org/pdf/2312.00357v1.pdf","comment":"21 page main manuscript, 4 figures. Supplementary Appendix and code\n will be made available on publication"},{"id":"http://arxiv.org/abs/2307.07643v3","updated":"2023-12-01T05:08:48Z","published":"2023-07-14T22:22:28Z","title":"AECIF-Net: An Attention-Enhanced Co-Interactive Fusion Network for\n Automated Structural Condition Assessment in Visual Inspection","summary":" Efficiently monitoring the condition of civil infrastructures necessitates\nautomating the structural condition assessment in visual inspection. This paper\nproposes an Attention-Enhanced Co-Interactive Fusion Network (AECIF-Net) for\nautomatic structural condition assessment in visual bridge inspection.\nAECIF-Net can simultaneously parse structural elements and segment surface\ndefects on the elements in inspection images. It integrates two task-specific\nrelearning subnets to extract task-specific features from an overall feature\nembedding. A co-interactive feature fusion module further captures the spatial\ncorrelation and facilitates information sharing between tasks. 
Experimental\nresults demonstrate that the proposed AECIF-Net outperforms the current\nstate-of-the-art approaches, achieving promising performance with 92.11% mIoU\nfor element segmentation and 87.16% mIoU for corrosion segmentation on the test\nset of the new benchmark dataset Steel Bridge Condition Inspection Visual\n(SBCIV). An ablation study verifies the merits of the designs for AECIF-Net,\nand a case study demonstrates its capability to automate structural condition\nassessment.\n","authors":["Chenyu Zhang","Zhaozheng Yin","Ruwen Qin"],"pdf_url":"https://arxiv.org/pdf/2307.07643v3.pdf","comment":"Submitted to Automation in Construction"},{"id":"http://arxiv.org/abs/2312.00351v1","updated":"2023-12-01T04:57:20Z","published":"2023-12-01T04:57:20Z","title":"Manipulating the Label Space for In-Context Classification","summary":" After pre-training by generating the next word conditional on previous words,\nthe Language Model (LM) acquires the ability of In-Context Learning (ICL), which\ncan learn a new task conditional on the context of the given in-context\nexamples (ICEs). Similarly, visually-conditioned Language Modelling is also\nused to train Vision-Language Models (VLMs) with ICL ability. However, such\nVLMs typically exhibit weaker classification abilities compared to contrastive\nlearning-based models like CLIP, since the Language Modelling objective does\nnot directly contrast whether an object is paired with a text. To improve\nICL for classification, using more ICEs to provide more knowledge is a\nstraightforward way. However, this may largely increase the selection time, and\nmore importantly, the inclusion of additional in-context images tends to extend\nthe length of the in-context sequence beyond the processing capacity of a VLM.\nTo alleviate these limitations, we propose to manipulate the label space of\neach ICE to increase its knowledge density, allowing for fewer ICEs to convey\nas much information as a larger set would. Specifically, we propose two\nstrategies, Label Distribution Enhancement and Visual Descriptions\nEnhancement, to improve in-context classification performance on diverse\ndatasets, including the classic ImageNet and more fine-grained datasets like\nCUB-200. For example, using our approach on ImageNet, we increase accuracy\nfrom 74.70\% in a 4-shot setting to 76.21\% with just 2 shots, surpassing CLIP\nby 0.67\%. On CUB-200, our method raises 1-shot accuracy from 48.86\% to\n69.05\%, 12.15\% higher than CLIP. The code is given in\nhttps://anonymous.4open.science/r/MLS_ICC.\n","authors":["Haokun Chen","Xu Yang","Yuhang Huang","Zihan Wu","Jing Wang","Xin Geng"],"pdf_url":"https://arxiv.org/pdf/2312.00351v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00348v1","updated":"2023-12-01T04:51:57Z","published":"2023-12-01T04:51:57Z","title":"Student Activity Recognition in Classroom Environments using Transfer\n Learning","summary":" The recent advances in artificial intelligence and deep learning facilitate\nautomation in various applications including home automation, smart\nsurveillance systems, and healthcare among others. Human Activity Recognition\nis one of its emerging applications, which can be implemented in a classroom\nenvironment to enhance safety, efficiency, and overall educational quality.\nThis paper proposes a system for detecting and recognizing the activities of\nstudents in a classroom environment. 
The dataset has been structured and\nrecorded by the authors since a standard dataset for this task was not\navailable at the time of this study. Transfer learning, a widely adopted method\nwithin the field of deep learning, has proven to be helpful in complex tasks\nlike image and video processing. Pretrained models including VGG-16, ResNet-50,\nInceptionV3, and Xception are used for feature extraction and classification\ntasks. Xception achieved an accuracy of 93%, on the novel classroom dataset,\noutperforming the other three models in consideration. The system proposed in\nthis study aims to introduce a safer and more productive learning environment\nfor students and educators.\n","authors":["Anagha Deshpande","Vedant Deshpande"],"pdf_url":"https://arxiv.org/pdf/2312.00348v1.pdf","comment":"6 pages, 12 figures, accepted at the IEEE International Conference on\n Computational Intelligence, Networks and Security (ICCINS) 2023"},{"id":"http://arxiv.org/abs/2312.00347v1","updated":"2023-12-01T04:51:01Z","published":"2023-12-01T04:51:01Z","title":"RTQ: Rethinking Video-language Understanding Based on Image-text Model","summary":" Recent advancements in video-language understanding have been established on\nthe foundation of image-text models, resulting in promising outcomes due to the\nshared knowledge between images and videos. However, video-language\nunderstanding presents unique challenges due to the inclusion of highly complex\nsemantic details, which result in information redundancy, temporal dependency,\nand scene complexity. Current techniques have only partially tackled these\nissues, and our quantitative analysis indicates that some of these methods are\ncomplementary. In light of this, we propose a novel framework called RTQ\n(Refine, Temporal model, and Query), which addresses these challenges\nsimultaneously. The approach involves refining redundant information within\nframes, modeling temporal relations among frames, and querying task-specific\ninformation from the videos. Remarkably, our model demonstrates outstanding\nperformance even in the absence of video-language pre-training, and the results\nare comparable with or superior to those achieved by state-of-the-art\npre-training methods.\n","authors":["Xiao Wang","Yaoyu Li","Tian Gan","Zheng Zhang","Jingjing Lv","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2312.00347v1.pdf","comment":"Accepted by ACM MM 2023 as Oral representation"},{"id":"http://arxiv.org/abs/2312.00343v1","updated":"2023-12-01T04:35:47Z","published":"2023-12-01T04:35:47Z","title":"OpenStereo: A Comprehensive Benchmark for Stereo Matching and Strong\n Baseline","summary":" Stereo matching, a pivotal technique in computer vision, plays a crucial role\nin robotics, autonomous navigation, and augmented reality. Despite the\ndevelopment of numerous impressive methods in recent years, replicating their\nresults and determining the most suitable architecture for practical\napplication remains challenging. Addressing this gap, our paper introduces a\ncomprehensive benchmark focusing on practical applicability rather than solely\non performance enhancement. Specifically, we develop a flexible and efficient\nstereo matching codebase, called OpenStereo. OpenStereo includes training and\ninference codes of more than 12 network models, making it, to our knowledge,\nthe most complete stereo matching toolbox available. 
Based on OpenStereo, we\nconducted experiments on the SceneFlow dataset and have achieved or surpassed\nthe performance metrics reported in the original paper. Additionally, we\nconduct an in-depth revisitation of recent developments in stereo matching\nthrough ablative experiments. These investigations inspired the creation of\nStereoBase, a simple yet strong baseline model. Our extensive comparative\nanalyses of StereoBase against numerous contemporary stereo matching methods on\nthe SceneFlow dataset demonstrate its remarkably strong performance. The source\ncode is available at https://github.com/XiandaGuo/OpenStereo.\n","authors":["Xianda Guo","Juntao Lu","Chenming Zhang","Yiqi Wang","Yiqun Duan","Tian Yang","Zheng Zhu","Long Chen"],"pdf_url":"https://arxiv.org/pdf/2312.00343v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00335v1","updated":"2023-12-01T04:07:12Z","published":"2023-12-01T04:07:12Z","title":"Learning Anatomically Consistent Embedding for Chest Radiography","summary":" Self-supervised learning (SSL) approaches have recently shown substantial\nsuccess in learning visual representations from unannotated images. Compared\nwith photographic images, medical images acquired with the same imaging\nprotocol exhibit high consistency in anatomy. To exploit this anatomical\nconsistency, this paper introduces a novel SSL approach, called PEAC (patch\nembedding of anatomical consistency), for medical image analysis. Specifically,\nin this paper, we propose to learn global and local consistencies via stable\ngrid-based matching, transfer pre-trained PEAC models to diverse downstream\ntasks, and extensively demonstrate that (1) PEAC achieves significantly better\nperformance than the existing state-of-the-art fully/self-supervised methods,\nand (2) PEAC captures the anatomical structure consistency across views of the\nsame patient and across patients of different genders, weights, and healthy\nstatuses, which enhances the interpretability of our method for medical image\nanalysis.\n","authors":["Ziyu Zhou","Haozhe Luo","Jiaxuan Pang","Xiaowei Ding","Michael Gotway","Jianming Liang"],"pdf_url":"https://arxiv.org/pdf/2312.00335v1.pdf","comment":"BMVC 2023, oral"},{"id":"http://arxiv.org/abs/2312.00330v1","updated":"2023-12-01T03:53:21Z","published":"2023-12-01T03:53:21Z","title":"StyleCrafter: Enhancing Stylized Text-to-Video Generation with Style\n Adapter","summary":" Text-to-video (T2V) models have shown remarkable capabilities in generating\ndiverse videos. However, they struggle to produce user-desired stylized videos\ndue to (i) text's inherent clumsiness in expressing specific styles and (ii)\nthe generally degraded style fidelity. To address these challenges, we\nintroduce StyleCrafter, a generic method that enhances pre-trained T2V models\nwith a style control adapter, enabling video generation in any style by\nproviding a reference image. Considering the scarcity of stylized video\ndatasets, we propose to first train a style control adapter using style-rich\nimage datasets, then transfer the learned stylization ability to video\ngeneration through a tailor-made finetuning paradigm. To promote content-style\ndisentanglement, we remove style descriptions from the text prompt and extract\nstyle information solely from the reference image using a decoupling learning\nstrategy. 
Additionally, we design a scale-adaptive fusion module to balance the\ninfluences of text-based content features and image-based style features, which\nhelps generalization across various text and style combinations. StyleCrafter\nefficiently generates high-quality stylized videos that align with the content\nof the texts and resemble the style of the reference images. Experiments\ndemonstrate that our approach is more flexible and efficient than existing\ncompetitors.\n","authors":["Gongye Liu","Menghan Xia","Yong Zhang","Haoxin Chen","Jinbo Xing","Xintao Wang","Yujiu Yang","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2312.00330v1.pdf","comment":"Project page: https://gongyeliu.github.io/StyleCrafter.github.io/ ;\n GitHub repository: https://github.com/GongyeLiu/StyleCrafter"},{"id":"http://arxiv.org/abs/2310.04677v2","updated":"2023-12-01T03:45:44Z","published":"2023-10-07T03:22:06Z","title":"AG-CRC: Anatomy-Guided Colorectal Cancer Segmentation in CT with\n Imperfect Anatomical Knowledge","summary":" When delineating lesions from medical images, a human expert can always keep\nin mind the anatomical structure behind the voxels. However, although\nhigh-quality (though not perfect) anatomical information can be retrieved from\ncomputed tomography (CT) scans with modern deep learning algorithms, it is\nstill an open problem how these automatically generated organ masks can assist\nin addressing challenging lesion segmentation tasks, such as the segmentation\nof colorectal cancer (CRC). In this paper, we develop a novel Anatomy-Guided\nsegmentation framework to exploit the auto-generated organ masks to aid CRC\nsegmentation from CT, namely AG-CRC. First, we obtain multi-organ segmentation\n(MOS) masks with existing MOS models (e.g., TotalSegmentor) and further derive\na more robust organ of interest (OOI) mask that may cover most of the\ncolon-rectum and CRC voxels. Then, we propose an anatomy-guided training patch\nsampling strategy by optimizing a heuristic gain function that considers both\nthe proximity of important regions (e.g., the tumor or organs of interest) and\nsample diversity. Third, we design a novel self-supervised learning scheme\ninspired by the topology of tubular organs like the colon to boost the model\nperformance further. Finally, we employ a masked loss scheme to guide the model\nto focus solely on the essential learning region. We extensively evaluate the\nproposed method on two CRC segmentation datasets, where substantial performance\nimprovement (5% to 9% in Dice) is achieved over current state-of-the-art\nmedical image segmentation models, and the ablation studies further evidence\nthe efficacy of every proposed component.\n","authors":["Rongzhao Zhang","Zhian Bai","Ruoying Yu","Wenrao Pang","Lingyun Wang","Lifeng Zhu","Xiaofan Zhang","Huan Zhang","Weiguo Hu"],"pdf_url":"https://arxiv.org/pdf/2310.04677v2.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2309.16496v2","updated":"2023-12-01T03:28:21Z","published":"2023-09-28T15:03:44Z","title":"CCEdit: Creative and Controllable Video Editing via Diffusion Models","summary":" In this paper, we present CCEdit, a versatile generative video editing\nframework based on diffusion models. Our approach employs a novel trident\nnetwork structure that separates structure and appearance control, ensuring\nprecise and creative editing capabilities. Utilizing the foundational\nControlNet architecture, we maintain the structural integrity of the video\nduring editing. 
The incorporation of an additional appearance branch enables\nusers to exert fine-grained control over the edited key frame. These two side\nbranches seamlessly integrate into the main branch, which is constructed upon\nexisting text-to-image (T2I) generation models, through learnable temporal\nlayers. The versatility of our framework is demonstrated through a diverse\nrange of choices in both structure representations and personalized T2I models,\nas well as the option to provide the edited key frame. To facilitate\ncomprehensive evaluation, we introduce the BalanceCC benchmark dataset,\ncomprising 100 videos and 4 target prompts for each video. Our extensive user\nstudies compare CCEdit with eight state-of-the-art video editing methods. The\noutcomes demonstrate CCEdit's substantial superiority over all other methods.\n","authors":["Ruoyu Feng","Wenming Weng","Yanhui Wang","Yuhui Yuan","Jianmin Bao","Chong Luo","Zhibo Chen","Baining Guo"],"pdf_url":"https://arxiv.org/pdf/2309.16496v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17338v2","updated":"2023-12-01T03:18:47Z","published":"2023-11-29T03:36:07Z","title":"VideoAssembler: Identity-Consistent Video Generation with Reference\n Entities using Diffusion Model","summary":" Identity-consistent video generation seeks to synthesize videos that are\nguided by both textual prompts and reference images of entities. Current\napproaches typically utilize cross-attention layers to integrate the appearance\nof the entity, which predominantly captures semantic attributes, resulting in\ncompromised fidelity of entities. Moreover, these methods necessitate iterative\nfine-tuning for each new entity encountered, thereby limiting their\napplicability. To address these challenges, we introduce VideoAssembler, a\nnovel end-to-end framework for identity-consistent video generation that can\nconduct inference directly when encountering new entities. VideoAssembler is\nadept at producing videos that are not only flexible with respect to the input\nreference entities but also responsive to textual conditions. Additionally, by\nmodulating the quantity of input images for the entity, VideoAssembler enables\nthe execution of tasks ranging from image-to-video generation to sophisticated\nvideo editing. VideoAssembler comprises two principal components: the Reference\nEntity Pyramid (REP) encoder and the Entity-Prompt Attention Fusion (EPAF)\nmodule. The REP encoder is designed to infuse comprehensive appearance details\ninto the denoising stages of the stable diffusion model. Concurrently, the EPAF\nmodule is utilized to integrate text-aligned features effectively. Furthermore,\nto mitigate the challenge of scarce data, we present a methodology for the\npreprocessing of training data. Our evaluation of the VideoAssembler framework\non the UCF-101, MSR-VTT, and DAVIS datasets indicates that it achieves good\nperformances in both quantitative and qualitative analyses (346.84 in FVD and\n48.01 in IS on UCF-101). 
Our project page is at\nhttps://gulucaptain.github.io/videoassembler/.\n","authors":["Haoyu Zhao","Tianyi Lu","Jiaxi Gu","Xing Zhang","Zuxuan Wu","Hang Xu","Yu-Gang Jiang"],"pdf_url":"https://arxiv.org/pdf/2311.17338v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00316v1","updated":"2023-12-01T03:16:10Z","published":"2023-12-01T03:16:10Z","title":"Improving Efficiency of DNN-based Relocalization Module for Autonomous\n Driving with Server-side Computing","summary":" In this work, we present a novel framework for camera relocation in\nautonomous vehicles, leveraging deep neural networks (DNN). While existing\nliterature offers various DNN-based camera relocation methods, their deployment\nis hindered by their high computational demands during inference. In contrast,\nour approach addresses this challenge through edge cloud collaboration.\nSpecifically, we strategically offload certain modules of the neural network to\nthe server and evaluate the inference time of data frames under different\nnetwork segmentation schemes to guide our offloading decisions. Our findings\nhighlight the vital role of server-side offloading in DNN-based camera\nrelocation for autonomous vehicles, and we also discuss the results of data\nfusion. Finally, we validate the effectiveness of our proposed framework\nthrough experimental evaluation.\n","authors":["Dengbo Li","Jieren Cheng","Boyi Liu"],"pdf_url":"https://arxiv.org/pdf/2312.00316v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00313v1","updated":"2023-12-01T03:12:04Z","published":"2023-12-01T03:12:04Z","title":"Improving Normalization with the James-Stein Estimator","summary":" Stein's paradox holds considerable sway in high-dimensional statistics,\nhighlighting that the sample mean, traditionally considered the de facto\nestimator, might not be the most efficacious in higher dimensions. To address\nthis, the James-Stein estimator proposes an enhancement by steering the sample\nmeans toward a more centralized mean vector. In this paper, first, we establish\nthat normalization layers in deep learning use inadmissible estimators for mean\nand variance. Next, we introduce a novel method to employ the James-Stein\nestimator to improve the estimation of mean and variance within normalization\nlayers. We evaluate our method on different computer vision tasks: image\nclassification, semantic segmentation, and 3D object classification. Through\nthese evaluations, it is evident that our improved normalization layers\nconsistently yield superior accuracy across all tasks without extra\ncomputational burden. Moreover, recognizing that a plethora of shrinkage\nestimators surpass the traditional estimator in performance, we study two other\nprominent shrinkage estimators: Ridge and LASSO. Additionally, we provide\nvisual representations to intuitively demonstrate the impact of shrinkage on\nthe estimated layer statistics. Finally, we study the effect of regularization\nand batch size on our modified batch normalization. 
The studies show that our\nmethod is less sensitive to batch size and regularization, improving accuracy\nunder various setups.\n","authors":["Seyedalireza Khoshsirat","Chandra Kambhamettu"],"pdf_url":"https://arxiv.org/pdf/2312.00313v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00312v1","updated":"2023-12-01T03:07:13Z","published":"2023-12-01T03:07:13Z","title":"Segment Anything Model-guided Collaborative Learning Network for\n Scribble-supervised Polyp Segmentation","summary":" Polyp segmentation plays a vital role in accurately locating polyps at an\nearly stage, which holds significant clinical importance for the prevention of\ncolorectal cancer. Various polyp segmentation methods have been developed using\nfully-supervised deep learning techniques. However, pixel-wise annotation for\npolyp images by physicians during the diagnosis is both time-consuming and\nexpensive. Moreover, visual foundation models such as the Segment Anything\nModel (SAM) have shown remarkable performance. Nevertheless, directly applying\nSAM to medical segmentation may not produce satisfactory results due to the\ninherent absence of medical knowledge. In this paper, we propose a novel\nSAM-guided Collaborative Learning Network (SAM-CLNet) for scribble-supervised\npolyp segmentation, enabling a collaborative learning process between our\nsegmentation network and SAM to boost the model performance. Specifically, we\nfirst propose a Cross-level Enhancement and Aggregation Network (CEA-Net) for\nweakly-supervised polyp segmentation. Within CEA-Net, we propose a Cross-level\nEnhancement Module (CEM) that integrates the adjacent features to enhance the\nrepresentation capabilities of different resolution features. Additionally, a\nFeature Aggregation Module (FAM) is employed to capture richer features across\nmultiple levels. Moreover, we present a box-augmentation strategy that combines\nthe segmentation maps generated by CEA-Net with scribble annotations to create\nmore precise prompts. These prompts are then fed into SAM, generating\nsegmentation SAM-guided masks, which can provide additional supervision to\ntrain CEA-Net effectively. Furthermore, we present an Image-level Filtering\nMechanism to filter out unreliable SAM-guided masks. Extensive experimental\nresults show that our SAM-CLNet outperforms state-of-the-art weakly-supervised\nsegmentation methods.\n","authors":["Yiming Zhao","Tao Zhou","Yunqi Gu","Yi Zhou","Yizhe Zhang","Ye Wu","Huazhu Fu"],"pdf_url":"https://arxiv.org/pdf/2312.00312v1.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2312.00311v1","updated":"2023-12-01T03:05:21Z","published":"2023-12-01T03:05:21Z","title":"3D Face Reconstruction with the Geometric Guidance of Facial Part\n Segmentation","summary":" 3D Morphable Models (3DMMs) provide promising 3D face reconstructions in\nvarious applications. However, existing methods struggle to reconstruct faces\nwith extreme expressions due to deficiencies in supervisory signals, such as\nsparse or inaccurate landmarks. Segmentation information contains effective\ngeometric contexts for face reconstruction. Certain attempts intuitively depend\non differentiable renderers to compare the rendered silhouettes of\nreconstruction with segmentation, which is prone to issues like local optima\nand gradient instability. 
In this paper, we fully utilize the facial part\nsegmentation geometry by introducing Part Re-projection Distance Loss (PRDL).\nSpecifically, PRDL transforms facial part segmentation into 2D points and\nre-projects the reconstruction onto the image plane. Subsequently, by\nintroducing grid anchors and computing different statistical distances from\nthese anchors to the point sets, PRDL establishes geometry descriptors to\noptimize the distribution of the point sets for face reconstruction. PRDL\nexhibits a clear gradient compared to the renderer-based methods and presents\nstate-of-the-art reconstruction performance in extensive quantitative and\nqualitative experiments. The project will be publicly available.\n","authors":["Zidu Wang","Xiangyu Zhu","Tianshuo Zhang","Baiqin Wang","Zhen Lei"],"pdf_url":"https://arxiv.org/pdf/2312.00311v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13670v4","updated":"2023-12-01T02:58:36Z","published":"2023-08-25T20:59:51Z","title":"Linear Oscillation: A Novel Activation Function for Vision Transformer","summary":" Activation functions are the linchpins of deep learning, profoundly\ninfluencing both the representational capacity and training dynamics of neural\nnetworks. They shape not only the nature of representations but also optimize\nconvergence rates and enhance generalization potential. Appreciating this\ncritical role, we present the Linear Oscillation (LoC) activation function,\ndefined as $f(x) = x \\times \\sin(\\alpha x + \\beta)$. Distinct from conventional\nactivation functions which primarily introduce non-linearity, LoC seamlessly\nblends linear trajectories with oscillatory deviations. The nomenclature\n\"Linear Oscillation\" is a nod to its unique attribute of infusing linear\nactivations with harmonious oscillations, capturing the essence of the\n\"Importance of Confusion\". This concept of \"controlled confusion\" within\nnetwork activations is posited to foster more robust learning, particularly in\ncontexts that necessitate discerning subtle patterns. Our empirical studies\nreveal that, when integrated into diverse neural architectures, the LoC\nactivation function consistently outperforms established counterparts like ReLU\nand Sigmoid. The stellar performance exhibited by the avant-garde Vision\nTransformer model using LoC further validates its efficacy. This study\nilluminates the remarkable benefits of the LoC over other prominent activation\nfunctions. It champions the notion that intermittently introducing deliberate\ncomplexity or \"confusion\" during training can spur more profound and nuanced\nlearning. This accentuates the pivotal role of judiciously selected activation\nfunctions in shaping the future of neural network training.\n","authors":["Juyoung Yun"],"pdf_url":"https://arxiv.org/pdf/2308.13670v4.pdf","comment":"Not Completed Paper"},{"id":"http://arxiv.org/abs/2312.00308v1","updated":"2023-12-01T02:58:27Z","published":"2023-12-01T02:58:27Z","title":"A knowledge-based data-driven (KBDD) framework for all-day\n identification of cloud types using satellite remote sensing","summary":" Cloud types, as a type of meteorological data, are of particular significance\nfor evaluating changes in rainfall, heatwaves, water resources, floods and\ndroughts, food security and vegetation cover, as well as land use. 
In order to\neffectively utilize high-resolution geostationary observations, a\nknowledge-based data-driven (KBDD) framework for all-day identification of\ncloud types based on spectral information from Himawari-8/9 satellite sensors\nis designed. And a novel, simple and efficient network, named CldNet, is\nproposed. Compared with widely used semantic segmentation networks, including\nSegNet, PSPNet, DeepLabV3+, UNet, and ResUnet, our proposed model CldNet with\nan accuracy of 80.89+-2.18% is state-of-the-art in identifying cloud types and\nhas increased by 32%, 46%, 22%, 2%, and 39%, respectively. With the assistance\nof auxiliary information (e.g., satellite zenith/azimuth angle, solar\nzenith/azimuth angle), the accuracy of CldNet-W using visible and near-infrared\nbands and CldNet-O not using visible and near-infrared bands on the test\ndataset is 82.23+-2.14% and 73.21+-2.02%, respectively. Meanwhile, the total\nparameters of CldNet are only 0.46M, making it easy for edge deployment. More\nimportantly, the trained CldNet without any fine-tuning can predict cloud types\nwith higher spatial resolution using satellite spectral data with spatial\nresolution 0.02{\\deg}*0.02{\\deg}, which indicates that CldNet possesses a\nstrong generalization ability. In aggregate, the KBDD framework using CldNet is\na highly effective cloud-type identification system capable of providing a\nhigh-fidelity, all-day, spatiotemporal cloud-type database for many climate\nassessment fields.\n","authors":["Longfeng Nie","Yuntian Chen","Mengge Du","Changqi Sun","Dongxiao Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.00308v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00306v1","updated":"2023-12-01T02:54:38Z","published":"2023-12-01T02:54:38Z","title":"RadioGalaxyNET: Dataset and Novel Computer Vision Algorithms for the\n Detection of Extended Radio Galaxies and Infrared Hosts","summary":" Creating radio galaxy catalogues from next-generation deep surveys requires\nautomated identification of associated components of extended sources and their\ncorresponding infrared hosts. In this paper, we introduce RadioGalaxyNET, a\nmultimodal dataset, and a suite of novel computer vision algorithms designed to\nautomate the detection and localization of multi-component extended radio\ngalaxies and their corresponding infrared hosts. The dataset comprises 4,155\ninstances of galaxies in 2,800 images with both radio and infrared channels.\nEach instance provides information about the extended radio galaxy class, its\ncorresponding bounding box encompassing all components, the pixel-level\nsegmentation mask, and the keypoint position of its corresponding infrared host\ngalaxy. RadioGalaxyNET is the first dataset to include images from the highly\nsensitive Australian Square Kilometre Array Pathfinder (ASKAP) radio telescope,\ncorresponding infrared images, and instance-level annotations for galaxy\ndetection. We benchmark several object detection algorithms on the dataset and\npropose a novel multimodal approach to simultaneously detect radio galaxies and\nthe positions of infrared hosts.\n","authors":["Nikhel Gupta","Zeeshan Hayder","Ray P. Norris","Minh Huynh","Lars Petersson"],"pdf_url":"https://arxiv.org/pdf/2312.00306v1.pdf","comment":"Accepted for publication in PASA. 
The paper has 17 pages, 6 figures,\n 5 tables"},{"id":"http://arxiv.org/abs/2312.00304v1","updated":"2023-12-01T02:47:00Z","published":"2023-12-01T02:47:00Z","title":"Developmental Pretraining (DPT) for Image Classification Networks","summary":" In the backdrop of increasing data requirements of Deep Neural Networks for\nobject recognition that is growing more untenable by the day, we present\nDevelopmental PreTraining (DPT) as a possible solution. DPT is designed as a\ncurriculum-based pre-training approach designed to rival traditional\npre-training techniques that are data-hungry. These training approaches also\nintroduce unnecessary features that could be misleading when the network is\nemployed in a downstream classification task where the data is sufficiently\ndifferent from the pre-training data and is scarce. We design the curriculum\nfor DPT by drawing inspiration from human infant visual development. DPT\nemploys a phased approach where carefully-selected primitive and universal\nfeatures like edges and shapes are taught to the network participating in our\npre-training regime. A model that underwent the DPT regime is tested against\nmodels with randomised weights to evaluate the viability of DPT.\n","authors":["Niranjan Rajesh","Debayan Gupta"],"pdf_url":"https://arxiv.org/pdf/2312.00304v1.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2307.01985v2","updated":"2023-12-01T02:40:58Z","published":"2023-07-05T02:13:25Z","title":"Task-Specific Alignment and Multiple Level Transformer for Few-Shot\n Action Recognition","summary":" In the research field of few-shot learning, the main difference between\nimage-based and video-based is the additional temporal dimension. In recent\nyears, some works have used the Transformer to deal with frames, then get the\nattention feature and the enhanced prototype, and the results are competitive.\nHowever, some video frames may relate little to the action, and only using\nsingle frame-level or segment-level features may not mine enough information.\nWe address these problems sequentially through an end-to-end method named\n\"Task-Specific Alignment and Multiple-level Transformer Network (TSA-MLT)\". The\nfirst module (TSA) aims at filtering the action-irrelevant frames for action\nduration alignment. Affine Transformation for frame sequence in the time\ndimension is used for linear sampling. The second module (MLT) focuses on the\nMultiple-level feature of the support prototype and query sample to mine more\ninformation for the alignment, which operates on different level features. We\nadopt a fusion loss according to a fusion distance that fuses the L2 sequence\ndistance, which focuses on temporal order alignment, and the Optimal Transport\ndistance, which focuses on measuring the gap between the appearance and\nsemantics of the videos. Extensive experiments show our method achieves\nstate-of-the-art results on the HMDB51 and UCF101 datasets and a competitive\nresult on the benchmark of Kinetics and something 2-something V2 datasets. 
Our\ncode is available at the URL: https://github.com/cofly2014/tsa-mlt.git\n","authors":["Fei Guo","Li Zhu","YiWang Wang","Jing Sun"],"pdf_url":"https://arxiv.org/pdf/2307.01985v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00299v1","updated":"2023-12-01T02:30:08Z","published":"2023-12-01T02:30:08Z","title":"QIENet: Quantitative irradiance estimation network using recurrent\n neural network based on satellite remote sensing data","summary":" Global horizontal irradiance (GHI) plays a vital role in estimating solar\nenergy resources, which are used to generate sustainable green energy. In order\nto estimate GHI with high spatial resolution, a quantitative irradiance\nestimation network, named QIENet, is proposed. Specifically, the temporal and\nspatial characteristics of remote sensing data of the satellite Himawari-8 are\nextracted and fused by recurrent neural network (RNN) and convolution\noperation, respectively. Not only remote sensing data, but also GHI-related\ntime information (hour, day, and month) and geographical information (altitude,\nlongitude, and latitude), are used as the inputs of QIENet. The satellite\nspectral channels B07 and B11 - B15 and time are recommended as model inputs\nfor QIENet according to the spatial distributions of annual solar energy.\nMeanwhile, QIENet is able to capture the impact of various clouds on hourly GHI\nestimates. More importantly, QIENet does not overestimate ground observations\nand can also reduce RMSE by 27.51%/18.00%, increase R2 by 20.17%/9.42%, and\nincrease r by 8.69%/3.54% compared with ERA5/NSRDB. Furthermore, QIENet is\ncapable of providing a high-fidelity hourly GHI database with spatial\nresolution 0.02{\\deg} * 0.02{\\deg}(approximately 2km * 2km) for many applied\nenergy fields.\n","authors":["Longfeng Nie","Yuntian Chen","Dongxiao Zhang","Xinyue Liu","Wentian Yuan"],"pdf_url":"https://arxiv.org/pdf/2312.00299v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17315v2","updated":"2023-12-01T02:25:33Z","published":"2023-11-29T02:10:31Z","title":"Explaining CLIP's performance disparities on data from blind/low vision\n users","summary":" Large multi-modal models (LMMs) hold the potential to usher in a new era of\nautomated visual assistance for people who are blind or low vision (BLV). Yet,\nthese models have not been systematically evaluated on data captured by BLV\nusers. We address this by empirically assessing CLIP, a widely-used LMM likely\nto underpin many assistive technologies. Testing 25 CLIP variants in a\nzero-shot classification task, we find that their accuracy is 15 percentage\npoints lower on average for images captured by BLV users than web-crawled\nimages. This disparity stems from CLIP's sensitivities to 1) image content\n(e.g. not recognizing disability objects as well as other objects); 2) image\nquality (e.g. not being robust to lighting variation); and 3) text content\n(e.g. not recognizing objects described by tactile adjectives as well as visual\nones). We delve deeper with a textual analysis of three common pre-training\ndatasets: LAION-400M, LAION-2B and DataComp-1B, showing that disability content\nis rarely mentioned. We then provide three examples that illustrate how the\nperformance disparities extend to three downstream models underpinned by CLIP:\nOWL-ViT, CLIPSeg and DALL-E2. 
We find that few-shot learning with as few as 5\nimages can mitigate CLIP's quality-of-service disparities for BLV users in some\nscenarios, which we discuss alongside a set of other possible mitigations.\n","authors":["Daniela Massiceti","Camilla Longden","Agnieszka Słowik","Samuel Wills","Martin Grayson","Cecily Morrison"],"pdf_url":"https://arxiv.org/pdf/2311.17315v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00269v1","updated":"2023-12-01T01:03:59Z","published":"2023-12-01T01:03:59Z","title":"Adaptability of Computer Vision at the Tactical Edge: Addressing\n Environmental Uncertainty","summary":" Computer Vision (CV) systems are increasingly being adopted into Command and\nControl (C2) systems to improve intelligence analysis on the battlefield, the\ntactical edge. CV systems leverage Artificial Intelligence (AI) algorithms to\nhelp visualize and interpret the environment, enhancing situational awareness.\nHowever, the adaptability of CV systems at the tactical edge remains\nchallenging due to rapidly changing environments and objects which can confuse\nthe deployed models. A CV model leveraged in this environment can become\nuncertain in its predictions, as the environment and the objects existing in\nthe environment begin to change. Additionally, mission objectives can rapidly\nchange leading to adjustments in technology, camera angles, and image\nresolutions. All of which can negatively affect the performance of and\npotentially introduce uncertainty into the system. When the training\nenvironment and/or technology differs from the deployment environment, CV\nmodels can perform unexpectedly. Unfortunately, most scenarios at the tactical\nedge do not incorporate Uncertainty Quantification (UQ) into their deployed C2\nand CV systems. This concept paper explores the idea of synchronizing robust\ndata operations and model fine-tuning driven by UQ all at the tactical edge.\nSpecifically, curating datasets and training child models based on the\nresiduals of predictions, using these child models to calculate prediction\nintervals (PI), and then using these PI to calibrate the deployed models. By\nincorporating UQ into the core operations surrounding C2 and CV systems at the\ntactical edge, we can help drive purposeful adaptability on the battlefield.\n","authors":["Hayden Moore"],"pdf_url":"https://arxiv.org/pdf/2312.00269v1.pdf","comment":"Accepted paper for the 28th annual International Command and Control\n Research and Technology Symposium (ICCRTS), Johns Hopkins Applied Physics\n Laboratory. Baltimore, MD. (2023)"},{"id":"http://arxiv.org/abs/2310.06282v3","updated":"2023-12-01T00:55:13Z","published":"2023-10-10T03:32:33Z","title":"MuseChat: A Conversational Music Recommendation System for Videos","summary":" Music recommendation for videos attracts growing interest in multi-modal\nresearch. However, existing systems focus primarily on content compatibility,\noften ignoring the users' preferences. Their inability to interact with users\nfor further refinements or to provide explanations leads to a less satisfying\nexperience. We address these issues with MuseChat, a first-of-its-kind\ndialogue-based recommendation system that personalizes music suggestions for\nvideos. Our system consists of two key functionalities with associated modules:\nrecommendation and reasoning. 
The recommendation module takes a video along\nwith optional information including previous suggested music and user's\npreference as inputs and retrieves an appropriate music matching the context.\nThe reasoning module, equipped with the power of Large Language Model\n(Vicuna-7B) and extended to multi-modal inputs, is able to provide reasonable\nexplanation for the recommended music. To evaluate the effectiveness of\nMuseChat, we build a large-scale dataset, conversational music recommendation\nfor videos, that simulates a two-turn interaction between a user and a\nrecommender based on accurate music track information. Experiment results show\nthat MuseChat achieves significant improvements over existing video-based music\nretrieval methods as well as offers strong interpretability and\ninteractability.\n","authors":["Zhikang Dong","Bin Chen","Xiulong Liu","Pawel Polak","Peng Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.06282v3.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2312.00774v1","updated":"2023-12-01T18:53:51Z","published":"2023-12-01T18:53:51Z","title":"Context Retrieval via Normalized Contextual Latent Interaction for\n Conversational Agent","summary":" Conversational agents leveraging AI, particularly deep learning, are emerging\nin both academic research and real-world applications. However, these\napplications still face challenges, including disrespecting knowledge and\nfacts, not personalizing to user preferences, and enormous demand for\ncomputational resources during training and inference. Recent research efforts\nhave been focused on addressing these challenges from various aspects,\nincluding supplementing various types of auxiliary information to the\nconversational agents. However, existing methods are still not able to\neffectively and efficiently exploit relevant information from these auxiliary\nsupplements to further unleash the power of the conversational agents and the\nlanguage models they use. In this paper, we present a novel method, PK-NCLI,\nthat is able to accurately and efficiently identify relevant auxiliary\ninformation to improve the quality of conversational responses by learning the\nrelevance among persona, chat history, and knowledge background through\nlow-level normalized contextual latent interaction. Our experimental results\nindicate that PK-NCLI outperforms the state-of-the-art method, PK-FoCus, by\n47.80%/30.61%/24.14% in terms of perplexity, knowledge grounding, and training\nefficiency, respectively, and maintained the same level of persona grounding\nperformance. We also provide a detailed analysis of how different factors,\nincluding language model choices and trade-offs on training weights, would\naffect the performance of PK-NCLI.\n","authors":["Junfeng Liu","Zhuocheng Mei","Kewen Peng","Ranga Raju Vatsavai"],"pdf_url":"https://arxiv.org/pdf/2312.00774v1.pdf","comment":"2023 IEEE International Conference on Data Mining Workshops (ICDMW)"},{"id":"http://arxiv.org/abs/2312.00699v1","updated":"2023-12-01T16:31:17Z","published":"2023-12-01T16:31:17Z","title":"Rethinking Detection Based Table Structure Recognition for Visually Rich\n Documents","summary":" Table Structure Recognition (TSR) aims at transforming unstructured table\nimages into structured formats, such as HTML sequences. One type of popular\nsolution is using detection models to detect components of a table, such as\ncolumns and rows, then applying a rule-based post-processing method to convert\ndetection results into HTML sequences. 
However, existing detection-based\nstudies often have the following limitations. First, these studies usually pay\nmore attention to improving the detection performance, which does not\nnecessarily lead to better performance regarding cell-level metrics, such as\nTEDS. Second, some solutions over-simplify the problem and can miss some\ncritical information. Lastly, even though some studies defined the problem to\ndetect more components to provide as much information as other types of\nsolutions, these studies ignore the fact this problem definition is a\nmulti-label detection because row, projected row header and column header can\nshare identical bounding boxes. Besides, there is often a performance gap\nbetween two-stage and transformer-based detection models regarding the\nstructure-only TEDS, even though they have similar performance regarding the\nCOCO metrics. Therefore, we revisit the limitations of existing detection-based\nsolutions, compare two-stage and transformer-based detection models, and\nidentify the key design aspects for the success of a two-stage detection model\nfor the TSR task, including the multi-class problem definition, the aspect\nratio for anchor box generation, and the feature generation of the backbone\nnetwork. We applied simple methods to improve these aspects of the Cascade\nR-CNN model, achieved state-of-the-art performance, and improved the baseline\nCascade R-CNN model by 19.32%, 11.56% and 14.77% regarding the structure-only\nTEDS on SciTSR, FinTabNet, and PubTables1M datasets.\n","authors":["Bin Xiao","Murat Simsek","Burak Kantarci","Ala Abu Alkheir"],"pdf_url":"https://arxiv.org/pdf/2312.00699v1.pdf","comment":"submitted to Pattern Recognition"},{"id":"http://arxiv.org/abs/2312.00512v1","updated":"2023-12-01T11:34:01Z","published":"2023-12-01T11:34:01Z","title":"Attack Detection Using Item Vector Shift in Matrix Factorisation\n Recommenders","summary":" This paper proposes a novel method for detecting shilling attacks in Matrix\nFactorization (MF)-based Recommender Systems (RS), in which attackers use false\nuser-item feedback to promote a specific item. Unlike existing methods that use\neither use supervised learning to distinguish between attack and genuine\nprofiles or analyse target item rating distributions to detect false ratings,\nour method uses an unsupervised technique to detect false ratings by examining\nshifts in item preference vectors that exploit rating deviations and user\ncharacteristics, making it a promising new direction. The experimental results\ndemonstrate the effectiveness of our approach in various attack scenarios,\nincluding those involving obfuscation techniques.\n","authors":["Sulthana Shams","Douglas Leith"],"pdf_url":"https://arxiv.org/pdf/2312.00512v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00372v1","updated":"2023-12-01T06:30:31Z","published":"2023-12-01T06:30:31Z","title":"Event-driven Real-time Retrieval in Web Search","summary":" Information retrieval in real-time search presents unique challenges distinct\nfrom those encountered in classical web search. These challenges are\nparticularly pronounced due to the rapid change of user search intent, which is\ninfluenced by the occurrence and evolution of breaking news events, such as\nearthquakes, elections, and wars. 
Previous dense retrieval methods, which\nprimarily focused on static semantic representation, lack the capacity to\ncapture immediate search intent, leading to inferior performance in retrieving\nthe most recent event-related documents in time-sensitive scenarios. To address\nthis issue, this paper expands the query with event information that represents\nreal-time search intent. The Event information is then integrated with the\nquery through a cross-attention mechanism, resulting in a time-context query\nrepresentation. We further enhance the model's capacity for event\nrepresentation through multi-task training. Since publicly available datasets\nsuch as MS-MARCO do not contain any event information on the query side and\nhave few time-sensitive queries, we design an automatic data collection and\nannotation pipeline to address this issue, which includes ModelZoo-based Coarse\nAnnotation and LLM-driven Fine Annotation processes. In addition, we share the\ntraining tricks such as two-stage training and hard negative sampling. Finally,\nwe conduct a set of offline experiments on a million-scale production dataset\nto evaluate our approach and deploy an A/B testing in a real online system to\nverify the performance. Extensive experimental results demonstrate that our\nproposed approach significantly outperforms existing state-of-the-art baseline\nmethods.\n","authors":["Nan Yang","Yannan Zhang","Xiaoling Bai","Hualong Deng","Tianhua Zhou","Jin Ma"],"pdf_url":"https://arxiv.org/pdf/2312.00372v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.16767v2","updated":"2023-12-01T04:29:49Z","published":"2023-03-23T07:55:31Z","title":"A Novel Patent Similarity Measurement Methodology: Semantic Distance and\n Technological Distance","summary":" Patent similarity analysis plays a crucial role in evaluating the risk of\npatent infringement. Nonetheless, this analysis is predominantly conducted\nmanually by legal experts, often resulting in a time-consuming process. Recent\nadvances in natural language processing technology offer a promising avenue for\nautomating this process. However, methods for measuring similarity between\npatents still rely on experts manually classifying patents. Due to the recent\ndevelopment of artificial intelligence technology, a lot of research is being\nconducted focusing on the semantic similarity of patents using natural language\nprocessing technology. However, it is difficult to accurately analyze patent\ndata, which are legal documents representing complex technologies, using\nexisting natural language processing technologies. To address these\nlimitations, we propose a hybrid methodology that takes into account\nbibliographic similarity, measures the similarity between patents by\nconsidering the semantic similarity of patents, the technical similarity\nbetween patents, and the bibliographic information of patents. Using natural\nlanguage processing techniques, we measure semantic similarity based on patent\ntext and calculate technical similarity through the degree of coexistence of\nInternational patent classification (IPC) codes. The similarity of\nbibliographic information of a patent is calculated using the special\ncharacteristics of the patent: citation information, inventor information, and\nassignee information. We propose a model that assigns reasonable weights to\neach similarity method considered. With the help of experts, we performed\nmanual similarity evaluations on 420 pairs and evaluated the performance of our\nmodel based on this data. 
We have empirically shown that our method outperforms\nrecent natural language processing techniques.\n","authors":["Yongmin Yoo","Cheonkam Jeong","Sanguk Gim","Junwon Lee","Zachary Schimke","Deaho Seo"],"pdf_url":"https://arxiv.org/pdf/2303.16767v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00336v1","updated":"2023-12-01T04:10:00Z","published":"2023-12-01T04:10:00Z","title":"Hypergraph Node Representation Learning with One-Stage Message Passing","summary":" Hypergraphs as an expressive and general structure have attracted\nconsiderable attention from various research domains. Most existing hypergraph\nnode representation learning techniques are based on graph neural networks, and\nthus adopt the two-stage message passing paradigm (i.e. node -> hyperedge ->\nnode). This paradigm only focuses on local information propagation and does not\neffectively take into account global information, resulting in less optimal\nrepresentations. Our theoretical analysis of representative two-stage message\npassing methods shows that, mathematically, they model different ways of local\nmessage passing through hyperedges, and can be unified into one-stage message\npassing (i.e. node -> node). However, they still only model local information.\nMotivated by this theoretical analysis, we propose a novel one-stage message\npassing paradigm to model both global and local information propagation for\nhypergraphs. We integrate this paradigm into HGraphormer, a Transformer-based\nframework for hypergraph node representation learning. HGraphormer injects the\nhypergraph structure information (local information) into Transformers (global\ninformation) by combining the attention matrix and hypergraph Laplacian.\nExtensive experiments demonstrate that HGraphormer outperforms recent\nhypergraph learning methods on five representative benchmark datasets on the\nsemi-supervised hypernode classification task, setting new state-of-the-art\nperformance, with accuracy improvements between 2.52% and 6.70%. Our code and\ndatasets are available.\n","authors":["Shilin Qu","Weiqing Wang","Yuan-Fang Li","Xin Zhou","Fajie Yuan"],"pdf_url":"https://arxiv.org/pdf/2312.00336v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2312.00326v1","updated":"2023-12-01T03:44:54Z","published":"2023-12-01T03:44:54Z","title":"Agent-OM: Leveraging Large Language Models for Ontology Matching","summary":" Ontology matching (OM) enables semantic interoperability between different\nontologies and resolves their conceptual heterogeneity by aligning related\nentities. OM systems currently have two prevailing design paradigms:\nconventional knowledge-based expert systems and newer machine learning-based\npredictive systems. While large language models (LLMs) and LLM-based agents\nhave become revolutionary in data engineering and have been applied creatively\nin various domains, their potential for OM remains underexplored. This study\nintroduces a novel agent-powered LLM-based design paradigm for OM systems. With\nthoughtful consideration of several specific challenges to leverage LLMs for\nOM, we propose a generic framework, namely Agent-OM, consisting of two Siamese\nagents for retrieval and matching, with a set of simple prompt-based OM tools.\nOur framework is implemented in a proof-of-concept system. 
Evaluations of three\nOntology Alignment Evaluation Initiative (OAEI) tracks over state-of-the-art OM\nsystems show that our system can achieve very close results to the best\nlong-standing performance on simple OM tasks and significantly improve the\nperformance on complex and few-shot OM tasks.\n","authors":["Zhangcheng Qiang","Weiqing Wang","Kerry Taylor"],"pdf_url":"https://arxiv.org/pdf/2312.00326v1.pdf","comment":"14 pages, 10 figures, 7 tables"},{"id":"http://arxiv.org/abs/2311.16334v2","updated":"2023-12-01T02:07:06Z","published":"2023-11-27T21:38:10Z","title":"Robust Basket Recommendation via Noise-tolerated Graph Contrastive\n Learning","summary":" The growth of e-commerce has seen a surge in popularity of platforms like\nAmazon, eBay, and Taobao. This has given rise to a unique shopping behavior\ninvolving baskets - sets of items purchased together. As a less studied\ninteraction mode in the community, the question of how should shopping basket\ncomplement personalized recommendation systems remains under-explored. While\nprevious attempts focused on jointly modeling user purchases and baskets, the\ndistinct semantic nature of these elements can introduce noise when directly\nintegrated. This noise negatively impacts the model's performance, further\nexacerbated by significant noise (e.g., a user is misled to click an item or\nrecognizes it as uninteresting after consuming it) within both user and basket\nbehaviors. In order to cope with the above difficulties, we propose a novel\nBasket recommendation framework via Noise-tolerated Contrastive Learning, named\nBNCL, to handle the noise existing in the cross-behavior integration and\nwithin-behavior modeling. First, we represent the basket-item interactions as\nthe hypergraph to model the complex basket behavior, where all items appearing\nin the same basket are treated as a single hyperedge. Second, cross-behavior\ncontrastive learning is designed to suppress the noise during the fusion of\ndiverse behaviors. Next, to further inhibit the within-behavior noise of the\nuser and basket interactions, we propose to exploit invariant properties of the\nrecommenders w.r.t augmentations through within-behavior contrastive learning.\nA novel consistency-aware augmentation approach is further designed to better\nidentify noisy interactions with the consideration of the above two types of\ninteractions. Our framework BNCL offers a generic training paradigm that is\napplicable to different backbones. Extensive experiments on three shopping\ntransaction datasets verify the effectiveness of our proposed method.\n","authors":["Xinrui He","Tianxin Wei","Jingrui He"],"pdf_url":"https://arxiv.org/pdf/2311.16334v2.pdf","comment":"CIKM 2023"},{"id":"http://arxiv.org/abs/2310.06282v3","updated":"2023-12-01T00:55:13Z","published":"2023-10-10T03:32:33Z","title":"MuseChat: A Conversational Music Recommendation System for Videos","summary":" Music recommendation for videos attracts growing interest in multi-modal\nresearch. However, existing systems focus primarily on content compatibility,\noften ignoring the users' preferences. Their inability to interact with users\nfor further refinements or to provide explanations leads to a less satisfying\nexperience. We address these issues with MuseChat, a first-of-its-kind\ndialogue-based recommendation system that personalizes music suggestions for\nvideos. Our system consists of two key functionalities with associated modules:\nrecommendation and reasoning. 
The recommendation module takes a video along\nwith optional information including previous suggested music and user's\npreference as inputs and retrieves an appropriate music matching the context.\nThe reasoning module, equipped with the power of Large Language Model\n(Vicuna-7B) and extended to multi-modal inputs, is able to provide reasonable\nexplanation for the recommended music. To evaluate the effectiveness of\nMuseChat, we build a large-scale dataset, conversational music recommendation\nfor videos, that simulates a two-turn interaction between a user and a\nrecommender based on accurate music track information. Experiment results show\nthat MuseChat achieves significant improvements over existing video-based music\nretrieval methods as well as offers strong interpretability and\ninteractability.\n","authors":["Zhikang Dong","Bin Chen","Xiulong Liu","Pawel Polak","Peng Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.06282v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00909v1","updated":"2023-12-01T20:13:08Z","published":"2023-12-01T20:13:08Z","title":"LLM-TAKE: Theme Aware Keyword Extraction Using Large Language Models","summary":" Keyword extraction is one of the core tasks in natural language processing.\nClassic extraction models are notorious for having a short attention span which\nmake it hard for them to conclude relational connections among the words and\nsentences that are far from each other. This, in turn, makes their usage\nprohibitive for generating keywords that are inferred from the context of the\nwhole text. In this paper, we explore using Large Language Models (LLMs) in\ngenerating keywords for items that are inferred from the items textual\nmetadata. Our modeling framework includes several stages to fine grain the\nresults by avoiding outputting keywords that are non informative or sensitive\nand reduce hallucinations common in LLM. We call our LLM-based framework\nTheme-Aware Keyword Extraction (LLM TAKE). We propose two variations of\nframework for generating extractive and abstractive themes for products in an E\ncommerce setting. We perform an extensive set of experiments on three real data\nsets and show that our modeling framework can enhance accuracy based and\ndiversity based metrics when compared with benchmark models.\n","authors":["Reza Yousefi Maragheh","Chenhao Fang","Charan Chand Irugu","Parth Parikh","Jason Cho","Jianpeng Xu","Saranyan Sukumar","Malay Patel","Evren Korpeoglu","Sushant Kumar","Kannan Achan"],"pdf_url":"https://arxiv.org/pdf/2312.00909v1.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2312.00784v1","updated":"2023-12-01T18:59:56Z","published":"2023-12-01T18:59:56Z","title":"Making Large Multimodal Models Understand Arbitrary Visual Prompts","summary":" While existing large vision-language multimodal models focus on whole image\nunderstanding, there is a prominent gap in achieving region-specific\ncomprehension. Current approaches that use textual coordinates or spatial\nencodings often fail to provide a user-friendly interface for visual prompting.\nTo address this challenge, we introduce a novel multimodal model capable of\ndecoding arbitrary visual prompts. This allows users to intuitively mark images\nand interact with the model using natural cues like a \"red bounding box\" or\n\"pointed arrow\". 
Our simple design directly overlays visual markers onto the\nRGB image, eliminating the need for complex region encodings, yet achieves\nstate-of-the-art performance on region-understanding tasks like Visual7W,\nPointQA, and Visual Commonsense Reasoning benchmark. Furthermore, we present\nViP-Bench, a comprehensive benchmark to assess the capability of models in\nunderstanding visual prompts across multiple dimensions, enabling future\nresearch in this domain. Code, data, and model are publicly available.\n","authors":["Mu Cai","Haotian Liu","Siva Karthik Mustikovela","Gregory P. Meyer","Yuning Chai","Dennis Park","Yong Jae Lee"],"pdf_url":"https://arxiv.org/pdf/2312.00784v1.pdf","comment":"Project page: https://vip-llava.github.io/"},{"id":"http://arxiv.org/abs/2011.04923v6","updated":"2023-12-01T18:58:00Z","published":"2020-11-10T06:06:02Z","title":"Topological properties of basins of attraction and expressiveness of\n width bounded neural networks","summary":" In Radhakrishnan et al. [2020], the authors empirically show that\nautoencoders trained with usual SGD methods shape out basins of attraction\naround their training data. We consider network functions of width not\nexceeding the input dimension and prove that in this situation basins of\nattraction are bounded and their complement cannot have bounded components. Our\nconditions in these results are met in several experiments of the latter work\nand we thus address a question posed therein. We also show that under some more\nrestrictive conditions the basins of attraction are path-connected. The\ntightness of the conditions in our results is demonstrated by means of several\nexamples. Finally, the arguments used to prove the above results allow us to\nderive a root cause why scalar-valued neural network functions that fulfill our\nbounded width condition are not dense in spaces of continuous functions.\n","authors":["Hans-Peter Beise","Steve Dias Da Cruz"],"pdf_url":"https://arxiv.org/pdf/2011.04923v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00775v1","updated":"2023-12-01T18:54:12Z","published":"2023-12-01T18:54:12Z","title":"Towards Generalizable Zero-Shot Manipulation via Translating Human\n Interaction Plans","summary":" We pursue the goal of developing robots that can interact zero-shot with\ngeneric unseen objects via a diverse repertoire of manipulation skills and show\nhow passive human videos can serve as a rich source of data for learning such\ngeneralist robots. Unlike typical robot learning approaches which directly\nlearn how a robot should act from interaction data, we adopt a factorized\napproach that can leverage large-scale human videos to learn how a human would\naccomplish a desired task (a human plan), followed by translating this plan to\nthe robots embodiment. Specifically, we learn a human plan predictor that,\ngiven a current image of a scene and a goal image, predicts the future hand and\nobject configurations. We combine this with a translation module that learns a\nplan-conditioned robot manipulation policy, and allows following humans plans\nfor generic manipulation tasks in a zero-shot manner with no deployment-time\ntraining. Importantly, while the plan predictor can leverage large-scale human\nvideos for learning, the translation module only requires a small amount of\nin-domain data, and can generalize to tasks not seen during training. 
We show\nthat our learned system can perform over 16 manipulation skills that generalize\nto 40 objects, encompassing 100 real-world tasks for table-top manipulation and\ndiverse in-the-wild manipulation. https://homangab.github.io/hopman/\n","authors":["Homanga Bharadhwaj","Abhinav Gupta","Vikash Kumar","Shubham Tulsiani"],"pdf_url":"https://arxiv.org/pdf/2312.00775v1.pdf","comment":"Preprint. Under Review"},{"id":"http://arxiv.org/abs/2312.00774v1","updated":"2023-12-01T18:53:51Z","published":"2023-12-01T18:53:51Z","title":"Context Retrieval via Normalized Contextual Latent Interaction for\n Conversational Agent","summary":" Conversational agents leveraging AI, particularly deep learning, are emerging\nin both academic research and real-world applications. However, these\napplications still face challenges, including disrespecting knowledge and\nfacts, not personalizing to user preferences, and enormous demand for\ncomputational resources during training and inference. Recent research efforts\nhave been focused on addressing these challenges from various aspects,\nincluding supplementing various types of auxiliary information to the\nconversational agents. However, existing methods are still not able to\neffectively and efficiently exploit relevant information from these auxiliary\nsupplements to further unleash the power of the conversational agents and the\nlanguage models they use. In this paper, we present a novel method, PK-NCLI,\nthat is able to accurately and efficiently identify relevant auxiliary\ninformation to improve the quality of conversational responses by learning the\nrelevance among persona, chat history, and knowledge background through\nlow-level normalized contextual latent interaction. Our experimental results\nindicate that PK-NCLI outperforms the state-of-the-art method, PK-FoCus, by\n47.80%/30.61%/24.14% in terms of perplexity, knowledge grounding, and training\nefficiency, respectively, and maintained the same level of persona grounding\nperformance. We also provide a detailed analysis of how different factors,\nincluding language model choices and trade-offs on training weights, would\naffect the performance of PK-NCLI.\n","authors":["Junfeng Liu","Zhuocheng Mei","Kewen Peng","Ranga Raju Vatsavai"],"pdf_url":"https://arxiv.org/pdf/2312.00774v1.pdf","comment":"2023 IEEE International Conference on Data Mining Workshops (ICDMW)"},{"id":"http://arxiv.org/abs/2309.16770v2","updated":"2023-12-01T18:45:12Z","published":"2023-09-28T18:07:01Z","title":"Persona-Coded Poly-Encoder: Persona-Guided Multi-Stream Conversational\n Sentence Scoring","summary":" Recent advances in machine learning and deep learning have led to the\nwidespread use of Conversational AI in many practical applications. However, it\nis still very challenging to leverage auxiliary information that can provide\nconversational context or personalized tuning to improve the quality of\nconversations. For example, there has only been limited research on using an\nindividuals persona information to improve conversation quality, and even\nstate-of-the-art conversational AI techniques are unable to effectively\nleverage signals from heterogeneous sources of auxiliary data, such as\nmulti-modal interaction data, demographics, SDOH data, etc. In this paper, we\npresent a novel Persona-Coded Poly-Encoder method that leverages persona\ninformation in a multi-stream encoding scheme to improve the quality of\nresponse generation for conversations. 
To show the efficacy of the proposed\nmethod, we evaluate our method on two different persona-based conversational\ndatasets, and compared against two state-of-the-art methods. Our experimental\nresults and analysis demonstrate that our method can improve conversation\nquality over the baseline method Poly-Encoder by 3.32% and 2.94% in terms of\nBLEU score and HR@1, respectively. More significantly, our method offers a path\nto better utilization of multi-modal data in conversational tasks. Lastly, our\nstudy outlines several challenges and future research directions for advancing\npersonalized conversational AI technology.\n","authors":["Junfeng Liu","Christopher Symons","Ranga Raju Vatsavai"],"pdf_url":"https://arxiv.org/pdf/2309.16770v2.pdf","comment":"The 35th IEEE International Conference on Tools with Artificial\n Intelligence (ICTAI)"},{"id":"http://arxiv.org/abs/2312.00765v1","updated":"2023-12-01T18:40:37Z","published":"2023-12-01T18:40:37Z","title":"Explaining Knock-on Effects of Bias Mitigation","summary":" In machine learning systems, bias mitigation approaches aim to make outcomes\nfairer across privileged and unprivileged groups. Bias mitigation methods work\nin different ways and have known \"waterfall\" effects, e.g., mitigating bias at\none place may manifest bias elsewhere. In this paper, we aim to characterise\nimpacted cohorts when mitigation interventions are applied. To do so, we treat\nintervention effects as a classification task and learn an explainable\nmeta-classifier to identify cohorts that have altered outcomes. We examine a\nrange of bias mitigation strategies that work at various stages of the model\nlife cycle. We empirically demonstrate that our meta-classifier is able to\nuncover impacted cohorts. Further, we show that all tested mitigation\nstrategies negatively impact a non-trivial fraction of cases, i.e., people who\nreceive unfavourable outcomes solely on account of mitigation efforts. This is\ndespite improvement in fairness metrics. We use these results as a basis to\nargue for more careful audits of static mitigation interventions that go beyond\naggregate metrics.\n","authors":["Svetoslav Nizhnichenkov","Rahul Nair","Elizabeth Daly","Brian Mac Namee"],"pdf_url":"https://arxiv.org/pdf/2312.00765v1.pdf","comment":"This paper was accepted at NeurIPS 2023 workshop"},{"id":"http://arxiv.org/abs/2308.13490v2","updated":"2023-12-01T18:38:03Z","published":"2023-08-25T17:04:35Z","title":"TpuGraphs: A Performance Prediction Dataset on Large Tensor\n Computational Graphs","summary":" Precise hardware performance models play a crucial role in code\noptimizations. They can assist compilers in making heuristic decisions or aid\nautotuners in identifying the optimal configuration for a given program. For\nexample, the autotuner for XLA, a machine learning compiler, discovered 10-20%\nspeedup on state-of-the-art models serving substantial production traffic at\nGoogle. Although there exist a few datasets for program performance prediction,\nthey target small sub-programs such as basic blocks or kernels. This paper\nintroduces TpuGraphs, a performance prediction dataset on full tensor programs,\nrepresented as computational graphs, running on Tensor Processing Units (TPUs).\nEach graph in the dataset represents the main computation of a machine learning\nworkload, e.g., a training epoch or an inference step. Each data sample\ncontains a computational graph, a compilation configuration, and the execution\ntime of the graph when compiled with the configuration. 
The graphs in the\ndataset are collected from open-source machine learning programs, featuring\npopular model architectures, e.g., ResNet, EfficientNet, Mask R-CNN, and\nTransformer. TpuGraphs provides 25x more graphs than the largest graph property\nprediction dataset (with comparable graph sizes), and 770x larger graphs on\naverage compared to existing performance prediction datasets on machine\nlearning programs. This graph-level prediction task on large graphs introduces\nnew challenges in learning, ranging from scalability, training efficiency, to\nmodel quality.\n","authors":["Phitchaya Mangpo Phothilimthana","Sami Abu-El-Haija","Kaidi Cao","Bahare Fatemi","Charith Mendis","Bryan Perozzi"],"pdf_url":"https://arxiv.org/pdf/2308.13490v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00763v1","updated":"2023-12-01T18:31:28Z","published":"2023-12-01T18:31:28Z","title":"Beyond ChatBots: ExploreLLM for Structured Thoughts and Personalized\n Model Responses","summary":" Large language model (LLM) powered chatbots are primarily text-based today,\nand impose a large interactional cognitive load, especially for exploratory or\nsensemaking tasks such as planning a trip or learning about a new city. Because\nthe interaction is textual, users have little scaffolding in the way of\nstructure, informational \"scent\", or ability to specify high-level preferences\nor goals. We introduce ExploreLLM that allows users to structure thoughts, help\nexplore different options, navigate through the choices and recommendations,\nand to more easily steer models to generate more personalized responses. We\nconduct a user study and show that users find it helpful to use ExploreLLM for\nexploratory or planning tasks, because it provides a useful schema-like\nstructure to the task, and guides users in planning. The study also suggests\nthat users can more easily personalize responses with high-level preferences\nwith ExploreLLM. Together, ExploreLLM points to a future where users interact\nwith LLMs beyond the form of chatbots, and instead designed to support complex\nuser tasks with a tighter integration between natural language and graphical\nuser interfaces.\n","authors":["Xiao Ma","Swaroop Mishra","Ariel Liu","Sophie Su","Jilin Chen","Chinmay Kulkarni","Heng-Tze Cheng","Quoc Le","Ed Chi"],"pdf_url":"https://arxiv.org/pdf/2312.00763v1.pdf","comment":"19 pages, 11 figures"},{"id":"http://arxiv.org/abs/2312.00761v1","updated":"2023-12-01T18:29:08Z","published":"2023-12-01T18:29:08Z","title":"Deep Unlearning: Fast and Efficient Training-free Approach to Controlled\n Forgetting","summary":" Machine unlearning has emerged as a prominent and challenging area of\ninterest, driven in large part by the rising regulatory demands for industries\nto delete user data upon request and the heightened awareness of privacy.\nExisting approaches either retrain models from scratch or use several\nfinetuning steps for every deletion request, often constrained by computational\nresource limitations and restricted access to the original training data. In\nthis work, we introduce a novel class unlearning algorithm designed to\nstrategically eliminate an entire class or a group of classes from the learned\nmodel. To that end, our algorithm first estimates the Retain Space and the\nForget Space, representing the feature or activation spaces for samples from\nclasses to be retained and unlearned, respectively. 
To obtain these spaces, we\npropose a novel singular value decomposition-based technique that requires\nlayer wise collection of network activations from a few forward passes through\nthe network. We then compute the shared information between these spaces and\nremove it from the forget space to isolate class-discriminatory feature space\nfor unlearning. Finally, we project the model weights in the orthogonal\ndirection of the class-discriminatory space to obtain the unlearned model. We\ndemonstrate our algorithm's efficacy on ImageNet using a Vision Transformer\nwith only $\\sim$1.5% drop in retain accuracy compared to the original model\nwhile maintaining under 1% accuracy on the unlearned class samples. Further,\nour algorithm consistently performs well when subject to Membership Inference\nAttacks showing 7.8% improvement on average across a variety of image\nclassification datasets and network architectures, as compared to other\nbaselines while being $\\sim$6x more computationally efficient.\n","authors":["Sangamesh Kodge","Gobinda Saha","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2312.00761v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.03136v3","updated":"2023-12-01T18:09:00Z","published":"2023-05-04T20:33:05Z","title":"Contrastive losses as generalized models of global epistasis","summary":" Fitness functions map large combinatorial spaces of biological sequences to\nproperties of interest. Inferring these multimodal functions from experimental\ndata is a central task in modern protein engineering. Global epistasis models\nare an effective and physically-grounded class of models for estimating fitness\nfunctions from observed data. These models assume that a sparse latent function\nis transformed by a monotonic nonlinearity to emit measurable fitness. Here we\ndemonstrate that minimizing contrastive loss functions, such as the\nBradley-Terry loss, is a simple and flexible technique for extracting the\nsparse latent function implied by global epistasis. We argue by way of a\nfitness-epistasis uncertainty principle that the nonlinearities in global\nepistasis models can produce observed fitness functions that do not admit\nsparse representations, and thus may be inefficient to learn from observations\nwhen using a Mean Squared Error (MSE) loss (a common practice). We show that\ncontrastive losses are able to accurately estimate a ranking function from\nlimited data even in regimes where MSE is ineffective. We validate the\npractical utility of this insight by showing contrastive loss functions result\nin consistently improved performance on benchmark tasks.\n","authors":["David H. Brookes","Jakub Otwinowski","Sam Sinai"],"pdf_url":"https://arxiv.org/pdf/2305.03136v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00752v1","updated":"2023-12-01T18:01:34Z","published":"2023-12-01T18:01:34Z","title":"Mamba: Linear-Time Sequence Modeling with Selective State Spaces","summary":" Foundation models, now powering most of the exciting applications in deep\nlearning, are almost universally based on the Transformer architecture and its\ncore attention module. Many subquadratic-time architectures such as linear\nattention, gated convolution and recurrent models, and structured state space\nmodels (SSMs) have been developed to address Transformers' computational\ninefficiency on long sequences, but they have not performed as well as\nattention on important modalities such as language. 
We identify that a key\nweakness of such models is their inability to perform content-based reasoning,\nand make several improvements. First, simply letting the SSM parameters be\nfunctions of the input addresses their weakness with discrete modalities,\nallowing the model to selectively propagate or forget information along the\nsequence length dimension depending on the current token. Second, even though\nthis change prevents the use of efficient convolutions, we design a\nhardware-aware parallel algorithm in recurrent mode. We integrate these\nselective SSMs into a simplified end-to-end neural network architecture without\nattention or even MLP blocks (Mamba). Mamba enjoys fast inference (5$\\times$\nhigher throughput than Transformers) and linear scaling in sequence length, and\nits performance improves on real data up to million-length sequences. As a\ngeneral sequence model backbone, Mamba achieves state-of-the-art performance\nacross several modalities such as language, audio, and genomics. On language\nmodeling, our Mamba-3B model outperforms Transformers of the same size and\nmatches Transformers twice its size, both in pretraining and downstream\nevaluation.\n","authors":["Albert Gu","Tri Dao"],"pdf_url":"https://arxiv.org/pdf/2312.00752v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01213v2","updated":"2023-12-01T17:51:46Z","published":"2023-06-02T00:28:48Z","title":"Learning Causally Disentangled Representations via the Principle of\n Independent Causal Mechanisms","summary":" Learning disentangled causal representations is a challenging problem that\nhas gained significant attention recently due to its implications for\nextracting meaningful information for downstream tasks. In this work, we define\na new notion of causal disentanglement from the perspective of independent\ncausal mechanisms. We propose ICM-VAE, a framework for learning causally\ndisentangled representations supervised by causally related observed labels. We\nmodel causal mechanisms using learnable flow-based diffeomorphic functions to\nmap noise variables to latent causal variables. Further, to promote the\ndisentanglement of causal factors, we propose a causal disentanglement prior\nthat utilizes the known causal structure to encourage learning a causally\nfactorized distribution in the latent space. Under relatively mild conditions,\nwe provide theoretical results showing the identifiability of causal factors\nand mechanisms up to permutation and elementwise reparameterization. We\nempirically demonstrate that our framework induces highly disentangled causal\nfactors, improves interventional robustness, and is compatible with\ncounterfactual generation.\n","authors":["Aneesh Komanduri","Yongkai Wu","Feng Chen","Xintao Wu"],"pdf_url":"https://arxiv.org/pdf/2306.01213v2.pdf","comment":"Accepted to the NeurIPS 2023 Workshop on Causal Representation\n Learning"},{"id":"http://arxiv.org/abs/2310.05869v3","updated":"2023-12-01T17:43:06Z","published":"2023-10-09T17:05:25Z","title":"HyperAttention: Long-context Attention in Near-Linear Time","summary":" We present an approximate attention mechanism named HyperAttention to address\nthe computational challenges posed by the growing complexity of long contexts\nused in Large Language Models (LLMs). Recent work suggests that in the\nworst-case scenario, quadratic time is necessary unless the entries of the\nattention matrix are bounded or the matrix has low stable rank. 
We introduce\ntwo parameters which measure: (1) the max column norm in the normalized\nattention matrix, and (2) the ratio of row norms in the unnormalized attention\nmatrix after detecting and removing large entries. We use these fine-grained\nparameters to capture the hardness of the problem. Despite previous lower\nbounds, we are able to achieve a linear time sampling algorithm even when the\nmatrix has unbounded entries or a large stable rank, provided the above\nparameters are small. HyperAttention features a modular design that easily\naccommodates integration of other fast low-level implementations, particularly\nFlashAttention. Empirically, employing Locality Sensitive Hashing (LSH) to\nidentify large entries, HyperAttention outperforms existing methods, giving\nsignificant speed improvements compared to state-of-the-art solutions like\nFlashAttention. We validate the empirical performance of HyperAttention on a\nvariety of different long-context length datasets. For example, HyperAttention\nmakes the inference time of ChatGLM2 50\\% faster on 32k context length while\nperplexity increases from 5.6 to 6.3. On larger context length, e.g., 131k,\nwith causal masking, HyperAttention offers 5-fold speedup on a single attention\nlayer.\n","authors":["Insu Han","Rajesh Jayaram","Amin Karbasi","Vahab Mirrokni","David P. Woodruff","Amir Zandieh"],"pdf_url":"https://arxiv.org/pdf/2310.05869v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.01356v2","updated":"2023-12-01T17:40:35Z","published":"2023-11-02T16:03:26Z","title":"Upper and lower bounds for the Lipschitz constant of random neural\n networks","summary":" Empirical studies have widely demonstrated that neural networks are highly\nsensitive to small, adversarial perturbations of the input. The worst-case\nrobustness against these so-called adversarial examples can be quantified by\nthe Lipschitz constant of the neural network. In this paper, we study upper and\nlower bounds for the Lipschitz constant of random ReLU neural networks.\nSpecifically, we assume that the weights and biases follow a generalization of\nthe He initialization, where general symmetric distributions for the biases are\npermitted. For shallow neural networks, we characterize the Lipschitz constant\nup to an absolute numerical constant. For deep networks with fixed depth and\nsufficiently large width, our established bounds differ by a factor that is\nlogarithmic in the width.\n","authors":["Paul Geuchen","Thomas Heindl","Dominik Stöger","Felix Voigtlaender"],"pdf_url":"https://arxiv.org/pdf/2311.01356v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00742v1","updated":"2023-12-01T17:25:10Z","published":"2023-12-01T17:25:10Z","title":"Scalable Meta-Learning with Gaussian Processes","summary":" Meta-learning is a powerful approach that exploits historical data to quickly\nsolve new tasks from the same distribution. In the low-data regime, methods\nbased on the closed-form posterior of Gaussian processes (GP) together with\nBayesian optimization have achieved high performance. However, these methods\nare either computationally expensive or introduce assumptions that hinder a\nprincipled propagation of uncertainty between task models. This may disrupt the\nbalance between exploration and exploitation during optimization. In this\npaper, we develop ScaML-GP, a modular GP model for meta-learning that is\nscalable in the number of tasks. 
Our core contribution is a carefully designed\nmulti-task kernel that enables hierarchical training and task scalability.\nConditioning ScaML-GP on the meta-data exposes its modular nature yielding a\ntest-task prior that combines the posteriors of meta-task GPs. In synthetic and\nreal-world meta-learning experiments, we demonstrate that ScaML-GP can learn\nefficiently both with few and many meta-tasks.\n","authors":["Petru Tighineanu","Lukas Grossberger","Paul Baireuther","Kathrin Skubch","Stefan Falkner","Julia Vinogradska","Felix Berkenkamp"],"pdf_url":"https://arxiv.org/pdf/2312.00742v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.07099v2","updated":"2023-12-01T17:12:35Z","published":"2023-01-15T04:37:51Z","title":"Adaptive Deep Neural Network Inference Optimization with EENet","summary":" Well-trained deep neural networks (DNNs) treat all test samples equally\nduring prediction. Adaptive DNN inference with early exiting leverages the\nobservation that some test examples can be easier to predict than others. This\npaper presents EENet, a novel early-exiting scheduling framework for multi-exit\nDNN models. Instead of having every sample go through all DNN layers during\nprediction, EENet learns an early exit scheduler, which can intelligently\nterminate the inference earlier for certain predictions, which the model has\nhigh confidence of early exit. As opposed to previous early-exiting solutions\nwith heuristics-based methods, our EENet framework optimizes an early-exiting\npolicy to maximize model accuracy while satisfying the given per-sample average\ninference budget. Extensive experiments are conducted on four computer vision\ndatasets (CIFAR-10, CIFAR-100, ImageNet, Cityscapes) and two NLP datasets\n(SST-2, AgNews). The results demonstrate that the adaptive inference by EENet\ncan outperform the representative existing early exit techniques. We also\nperform a detailed visualization analysis of the comparison results to\ninterpret the benefits of EENet.\n","authors":["Fatih Ilhan","Ka-Ho Chow","Sihao Hu","Tiansheng Huang","Selim Tekin","Wenqi Wei","Yanzhao Wu","Myungjin Lee","Ramana Kompella","Hugo Latapie","Gaowen Liu","Ling Liu"],"pdf_url":"https://arxiv.org/pdf/2301.07099v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00727v1","updated":"2023-12-01T17:01:37Z","published":"2023-12-01T17:01:37Z","title":"Safe Reinforcement Learning in Tensor Reproducing Kernel Hilbert Space","summary":" This paper delves into the problem of safe reinforcement learning (RL) in a\npartially observable environment with the aim of achieving safe-reachability\nobjectives. In traditional partially observable Markov decision processes\n(POMDP), ensuring safety typically involves estimating the belief in latent\nstates. However, accurately estimating an optimal Bayesian filter in POMDP to\ninfer latent states from observations in a continuous state space poses a\nsignificant challenge, largely due to the intractable likelihood. To tackle\nthis issue, we propose a stochastic model-based approach that guarantees RL\nsafety almost surely in the face of unknown system dynamics and partial\nobservation environments. We leveraged the Predictive State Representation\n(PSR) and Reproducing Kernel Hilbert Space (RKHS) to represent future\nmulti-step observations analytically, and the results in this context are\nprovable. Furthermore, we derived essential operators from the kernel Bayes'\nrule, enabling the recursive estimation of future observations using various\noperators. 
Under the assumption of \\textit{undercompleness}, a polynomial\nsample complexity is established for the RL algorithm for the infinite size of\nobservation and action spaces, ensuring an $\\epsilon-$suboptimal safe policy\nguarantee.\n","authors":["Xiaoyuan Cheng","Boli Chen","Liz Varga","Yukun Hu"],"pdf_url":"https://arxiv.org/pdf/2312.00727v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07918v3","updated":"2023-12-01T17:00:04Z","published":"2023-10-11T22:17:37Z","title":"Contextualized Policy Recovery: Modeling and Interpreting Medical\n Decisions with Adaptive Imitation Learning","summary":" Interpretable policy learning seeks to estimate intelligible decision\npolicies from observed actions; however, existing models fall short by forcing\na tradeoff between accuracy and interpretability. This tradeoff limits\ndata-driven interpretations of human decision-making process. e.g. to audit\nmedical decisions for biases and suboptimal practices, we require models of\ndecision processes which provide concise descriptions of complex behaviors.\nFundamentally, existing approaches are burdened by this tradeoff because they\nrepresent the underlying decision process as a universal policy, when in fact\nhuman decisions are dynamic and can change drastically with contextual\ninformation. Thus, we propose Contextualized Policy Recovery (CPR), which\nre-frames the problem of modeling complex decision processes as a multi-task\nlearning problem in which complex decision policies are comprised of\ncontext-specific policies. CPR models each context-specific policy as a linear\nobservation-to-action mapping, and generates new decision models\n$\\textit{on-demand}$ as contexts are updated with new observations. CPR is\ncompatible with fully offline and partially observable decision environments,\nand can be tailored to incorporate any recurrent black-box model or\ninterpretable decision model. We assess CPR through studies on simulated and\nreal data, achieving state-of-the-art performance on the canonical tasks of\npredicting antibiotic prescription in intensive care units ($+22\\%$ AUROC vs.\nprevious SOTA) and predicting MRI prescription for Alzheimer's patients\n($+7.7\\%$ AUROC vs. previous SOTA). With this improvement in predictive\nperformance, CPR closes the accuracy gap between interpretable and black-box\nmethods for policy learning, allowing high-resolution exploration and analysis\nof context-specific decision models.\n","authors":["Jannik Deuschel","Caleb N. Ellington","Benjamin J. Lengerich","Yingtao Luo","Pascal Friederich","Eric P. Xing"],"pdf_url":"https://arxiv.org/pdf/2310.07918v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00718v1","updated":"2023-12-01T16:53:15Z","published":"2023-12-01T16:53:15Z","title":"Removing Biases from Molecular Representations via Information\n Maximization","summary":" High-throughput drug screening -- using cell imaging or gene expression\nmeasurements as readouts of drug effect -- is a critical tool in biotechnology\nto assess and understand the relationship between the chemical structure and\nbiological activity of a drug. Since large-scale screens have to be divided\ninto multiple experiments, a key difficulty is dealing with batch effects,\nwhich can introduce systematic errors and non-biological associations in the\ndata. We propose InfoCORE, an Information maximization approach for COnfounder\nREmoval, to effectively deal with batch effects and obtain refined molecular\nrepresentations. 
InfoCORE establishes a variational lower bound on the\nconditional mutual information of the latent representations given a batch\nidentifier. It adaptively reweighs samples to equalize their implied batch\ndistribution. Extensive experiments on drug screening data reveal InfoCORE's\nsuperior performance in a multitude of tasks including molecular property\nprediction and molecule-phenotype retrieval. Additionally, we show results for\nhow InfoCORE offers a versatile framework and resolves general distribution\nshifts and issues of data fairness by minimizing correlation with spurious\nfeatures or removing sensitive attributes. The code is available at\nhttps://github.com/uhlerlab/InfoCORE.\n","authors":["Chenyu Wang","Sharut Gupta","Caroline Uhler","Tommi Jaakkola"],"pdf_url":"https://arxiv.org/pdf/2312.00718v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18341v2","updated":"2023-12-01T16:46:10Z","published":"2023-11-30T08:22:08Z","title":"Learning Robust Precipitation Forecaster by Temporal Frame Interpolation","summary":" Recent advances in deep learning have significantly elevated weather\nprediction models. However, these models often falter in real-world scenarios\ndue to their sensitivity to spatial-temporal shifts. This issue is particularly\nacute in weather forecasting, where models are prone to overfit to local and\ntemporal variations, especially when tasked with fine-grained predictions. In\nthis paper, we address these challenges by developing a robust precipitation\nforecasting model that demonstrates resilience against such spatial-temporal\ndiscrepancies. We introduce Temporal Frame Interpolation (TFI), a novel\ntechnique that enhances the training dataset by generating synthetic samples\nthrough interpolating adjacent frames from satellite imagery and ground radar\ndata, thus improving the model's robustness against frame noise. Moreover, we\nincorporate a unique Multi-Level Dice (ML-Dice) loss function, leveraging the\nordinal nature of rainfall intensities to improve the model's performance. Our\napproach has led to significant improvements in forecasting precision,\nculminating in our model securing \\textit{1st place} in the transfer learning\nleaderboard of the \\textit{Weather4cast'23} competition. This achievement not\nonly underscores the effectiveness of our methodologies but also establishes a\nnew standard for deep learning applications in weather forecasting. Our code\nand weights have been public on \\url{https://github.com/Secilia-Cxy/UNetTFI}.\n","authors":["Lu Han","Xu-Yang Chen","Han-Jia Ye","De-Chuan Zhan"],"pdf_url":"https://arxiv.org/pdf/2311.18341v2.pdf","comment":"Previous version has text overlap with last year's paper\n arXiv:2212.02968 since the competition's datasets does not change. We restate\n the dataset description to avoid it. We also polish the overall writing"},{"id":"http://arxiv.org/abs/2312.00710v1","updated":"2023-12-01T16:42:57Z","published":"2023-12-01T16:42:57Z","title":"SpaCE: The Spatial Confounding Environment","summary":" Spatial confounding poses a significant challenge in scientific studies\ninvolving spatial data, where unobserved spatial variables can influence both\ntreatment and outcome, possibly leading to spurious associations. To address\nthis problem, we introduce SpaCE: The Spatial Confounding Environment, the\nfirst toolkit to provide realistic benchmark datasets and tools for\nsystematically evaluating causal inference methods designed to alleviate\nspatial confounding. 
Each dataset includes training data, true counterfactuals,\na spatial graph with coordinates, and smoothness and confounding scores\ncharacterizing the effect of a missing spatial confounder. It also includes\nrealistic semi-synthetic outcomes and counterfactuals, generated using\nstate-of-the-art machine learning ensembles, following best practices for\ncausal inference benchmarks. The datasets cover real treatment and covariates\nfrom diverse domains, including climate, health and social sciences. SpaCE\nfacilitates an automated end-to-end pipeline, simplifying data loading,\nexperimental setup, and evaluating machine learning and causal inference\nmodels. The SpaCE project provides several dozens of datasets of diverse sizes\nand spatial complexity. It is publicly available as a Python package,\nencouraging community feedback and contributions.\n","authors":["Mauricio Tec","Ana Trisovic","Michelle Audirac","Sophie Woodward","Naeem Khoshnevis","Francesca Dominici"],"pdf_url":"https://arxiv.org/pdf/2312.00710v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.01923v3","updated":"2023-12-01T16:37:36Z","published":"2023-03-03T13:48:35Z","title":"Bayesian CART models for insurance claims frequency","summary":" Accuracy and interpretability of a (non-life) insurance pricing model are\nessential qualities to ensure fair and transparent premiums for policy-holders,\nthat reflect their risk. In recent years, the classification and regression\ntrees (CARTs) and their ensembles have gained popularity in the actuarial\nliterature, since they offer good prediction performance and are relatively\neasily interpretable. In this paper, we introduce Bayesian CART models for\ninsurance pricing, with a particular focus on claims frequency modelling.\nAdditionally to the common Poisson and negative binomial (NB) distributions\nused for claims frequency, we implement Bayesian CART for the zero-inflated\nPoisson (ZIP) distribution to address the difficulty arising from the\nimbalanced insurance claims data. To this end, we introduce a general MCMC\nalgorithm using data augmentation methods for posterior tree exploration. We\nalso introduce the deviance information criterion (DIC) for the tree model\nselection. The proposed models are able to identify trees which can better\nclassify the policy-holders into risk groups. Some simulations and real\ninsurance data will be discussed to illustrate the applicability of these\nmodels.\n","authors":["Yaojun Zhang","Lanpeng Ji","Georgios Aivaliotis","Charles Taylor"],"pdf_url":"https://arxiv.org/pdf/2303.01923v3.pdf","comment":"46 pages"},{"id":"http://arxiv.org/abs/2312.00700v1","updated":"2023-12-01T16:33:57Z","published":"2023-12-01T16:33:57Z","title":"GIFT: Generative Interpretable Fine-Tuning Transformers","summary":" We present GIFT (Generative Interpretable Fine-tuning Transformers) for\nfine-tuning pretrained (often large) Transformer models at downstream tasks in\na parameter-efficient way with built-in interpretability. Our GIFT is a deep\nparameter-residual learning method, which addresses two problems in fine-tuning\na pretrained Transformer model: Where to apply the parameter-efficient\nfine-tuning (PEFT) to be extremely lightweight yet sufficiently expressive, and\nHow to learn the PEFT to better exploit the knowledge of the pretrained model\nin a direct way? For the former, we select the final projection (linear) layer\nin the multi-head self-attention of a Transformer model, and verify its\neffectiveness. 
For the latter, in contrast to the prior art that directly\nintroduce new model parameters (often in low-rank approximation form) to be\nlearned in fine-tuning with downstream data, we propose a method for learning\nto generate the fine-tuning parameters. Our GIFT is a hyper-Transformer which\ntake as input the pretrained parameters of the projection layer to generate its\nfine-tuning parameters using a proposed Parameter-to-Cluster Attention (PaCa).\nThe PaCa results in a simple clustering-based forward explainer that plays the\nrole of semantic segmentation in testing. In experiments, our proposed GIFT is\ntested on the VTAB benchmark and the fine-grained visual classification (FGVC)\nbenchmark. It obtains significantly better performance than the prior art. Our\ncode is available at https://github.com/savadikarc/gift\n","authors":["Chinmay Savadikar","Xi Song","Tianfu Wu"],"pdf_url":"https://arxiv.org/pdf/2312.00700v1.pdf","comment":"18 pages, 12 figures"},{"id":"http://arxiv.org/abs/2312.00694v1","updated":"2023-12-01T16:27:48Z","published":"2023-12-01T16:27:48Z","title":"Object Detector Differences when using Synthetic and Real Training Data","summary":" To train well-performing generalizing neural networks, sufficiently large and\ndiverse datasets are needed. Collecting data while adhering to privacy\nlegislation becomes increasingly difficult and annotating these large datasets\nis both a resource-heavy and time-consuming task. An approach to overcome these\ndifficulties is to use synthetic data since it is inherently scalable and can\nbe automatically annotated. However, how training on synthetic data affects the\nlayers of a neural network is still unclear. In this paper, we train the YOLOv3\nobject detector on real and synthetic images from city environments. We perform\na similarity analysis using Centered Kernel Alignment (CKA) to explore the\neffects of training on synthetic data on a layer-wise basis. The analysis\ncaptures the architecture of the detector while showing both different and\nsimilar patterns between different models. With this similarity analysis we\nwant to give insights on how training synthetic data affects each layer and to\ngive a better understanding of the inner workings of complex neural networks.\nThe results show that the largest similarity between a detector trained on real\ndata and a detector trained on synthetic data was in the early layers, and the\nlargest difference was in the head part. The results also show that no major\ndifference in performance or similarity could be seen between frozen and\nunfrozen backbone.\n","authors":["Martin Georg Ljungqvist","Otto Nordander","Markus Skans","Arvid Mildner","Tony Liu","Pierre Nugues"],"pdf_url":"https://arxiv.org/pdf/2312.00694v1.pdf","comment":"27 pages. The Version of Record of this article is published in\n Springer Nature Computer Science 2023, and is available online at\n https://doi.org/10.1007/s42979-023-01704-5"},{"id":"http://arxiv.org/abs/2312.00688v1","updated":"2023-12-01T16:11:38Z","published":"2023-12-01T16:11:38Z","title":"Towards Transparency in Coreference Resolution: A Quantum-Inspired\n Approach","summary":" Guided by grammatical structure, words compose to form sentences, and guided\nby discourse structure, sentences compose to form dialogues and documents. The\ncompositional aspect of sentence and discourse units is often overlooked by\nmachine learning algorithms. 
A recent initiative called Quantum Natural\nLanguage Processing (QNLP) learns word meanings as points in a Hilbert space\nand acts on them via a translation of grammatical structure into Parametrised\nQuantum Circuits (PQCs). Previous work extended the QNLP translation to\ndiscourse structure using points in a closure of Hilbert spaces. In this paper,\nwe evaluate this translation on a Winograd-style pronoun resolution task. We\ntrain a Variational Quantum Classifier (VQC) for binary classification and\nimplement an end-to-end pronoun resolution system. The simulations executed on\nIBMQ software converged with an F1 score of 87.20%. The model outperformed two\nout of three classical coreference resolution systems and neared\nstate-of-the-art SpanBERT. A mixed quantum-classical model yet improved these\nresults with an F1 score increase of around 6%.\n","authors":["Hadi Wazni","Mehrnoosh Sadrzadeh"],"pdf_url":"https://arxiv.org/pdf/2312.00688v1.pdf","comment":"CRAC 2023, the Sixth Workshop on Computational Models of Reference,\n Anaphora and Coreference, EMNLP 2023"},{"id":"http://arxiv.org/abs/2305.07618v3","updated":"2023-12-01T16:00:14Z","published":"2023-05-12T17:17:01Z","title":"Uncertainty Estimation and Out-of-Distribution Detection for Deep\n Learning-Based Image Reconstruction using the Local Lipschitz","summary":" Accurate image reconstruction is at the heart of diagnostics in medical\nimaging. Supervised deep learning-based approaches have been investigated for\nsolving inverse problems including image reconstruction. However, these trained\nmodels encounter unseen data distributions that are widely shifted from\ntraining data during deployment. Therefore, it is essential to assess whether a\ngiven input falls within the training data distribution for diagnostic\npurposes. Uncertainty estimation approaches exist but focus on providing an\nuncertainty map to radiologists, rather than assessing the training\ndistribution fit. In this work, we propose a method based on the local\nLipschitz-based metric to distinguish out-of-distribution images from\nin-distribution with an area under the curve of 99.94%. Empirically, we\ndemonstrate a very strong relationship between the local Lipschitz value and\nmean absolute error (MAE), supported by a high Spearman's rank correlation\ncoefficient of 0.8475, which determines the uncertainty estimation threshold\nfor optimal model performance. Through the identification of false positives,\nthe local Lipschitz and MAE relationship was used to guide data augmentation\nand reduce model uncertainty. Our study was validated using the AUTOMAP\narchitecture for sensor-to-image Magnetic Resonance Imaging (MRI)\nreconstruction. We compare our proposed approach with baseline methods:\nMonte-Carlo dropout and deep ensembles, and further analysis included MRI\ndenoising and Computed Tomography (CT) sparse-to-full view reconstruction using\nUNET architectures. We show that our approach is applicable to various\narchitectures and learned functions, especially in the realm of medical image\nreconstruction, where preserving the diagnostic accuracy of reconstructed\nimages remains paramount.\n","authors":["Danyal F. Bhutto","Bo Zhu","Jeremiah Z. Liu","Neha Koonjoo","Hongwei B. Li","Bruce R. Rosen","Matthew S. 
Rosen"],"pdf_url":"https://arxiv.org/pdf/2305.07618v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.12224v2","updated":"2023-12-01T15:56:21Z","published":"2023-05-20T16:23:50Z","title":"On the Trade-off of Intra-/Inter-class Diversity for Supervised\n Pre-training","summary":" Pre-training datasets are critical for building state-of-the-art machine\nlearning models, motivating rigorous study on their impact on downstream tasks.\nIn this work, we study the impact of the trade-off between the intra-class\ndiversity (the number of samples per class) and the inter-class diversity (the\nnumber of classes) of a supervised pre-training dataset. Empirically, we found\nthat with the size of the pre-training dataset fixed, the best downstream\nperformance comes with a balance on the intra-/inter-class diversity. To\nunderstand the underlying mechanism, we show theoretically that the downstream\nperformance depends monotonically on both types of diversity. Notably, our\ntheory reveals that the optimal class-to-sample ratio (#classes / #samples per\nclass) is invariant to the size of the pre-training dataset, which motivates an\napplication of predicting the optimal number of pre-training classes. We\ndemonstrate the effectiveness of this application by an improvement of around 2\npoints on the downstream tasks when using ImageNet as the pre-training dataset.\n","authors":["Jieyu Zhang","Bohan Wang","Zhengyu Hu","Pang Wei Koh","Alexander Ratner"],"pdf_url":"https://arxiv.org/pdf/2305.12224v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2307.00309v2","updated":"2023-12-01T15:51:55Z","published":"2023-07-01T11:46:36Z","title":"Adversarial Attacks and Defenses on 3D Point Cloud Classification: A\n Survey","summary":" Deep learning has successfully solved a wide range of tasks in 2D vision as a\ndominant AI technique. Recently, deep learning on 3D point clouds is becoming\nincreasingly popular for addressing various tasks in this field. Despite\nremarkable achievements, deep learning algorithms are vulnerable to adversarial\nattacks. These attacks are imperceptible to the human eye but can easily fool\ndeep neural networks in the testing and deployment stage. To encourage future\nresearch, this survey summarizes the current progress on adversarial attack and\ndefense techniques on point cloud classification.This paper first introduces\nthe principles and characteristics of adversarial attacks and summarizes and\nanalyzes adversarial example generation methods in recent years. Additionally,\nit provides an overview of defense strategies, organized into data-focused and\nmodel-focused methods. Finally, it presents several current challenges and\npotential future research directions in this domain.\n","authors":["Hanieh Naderi","Ivan V. Bajić"],"pdf_url":"https://arxiv.org/pdf/2307.00309v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00671v1","updated":"2023-12-01T15:50:20Z","published":"2023-12-01T15:50:20Z","title":"CellMixer: Annotation-free Semantic Cell Segmentation of Heterogeneous\n Cell Populations","summary":" In recent years, several unsupervised cell segmentation methods have been\npresented, trying to omit the requirement of laborious pixel-level annotations\nfor the training of a cell segmentation model. Most if not all of these methods\nhandle the instance segmentation task by focusing on the detection of different\ncell instances ignoring their type. 
While such models prove adequate for\ncertain tasks, like cell counting, other applications require the\nidentification of each cell's type. In this paper, we present CellMixer, an\ninnovative annotation-free approach for the semantic segmentation of\nheterogeneous cell populations. Our augmentation-based method enables the\ntraining of a segmentation model from image-level labels of homogeneous cell\npopulations. Our results show that CellMixer can achieve competitive\nsegmentation performance across multiple cell types and imaging modalities,\ndemonstrating the method's scalability and potential for broader applications\nin medical imaging, cellular biology, and diagnostics.\n","authors":["Mehdi Naouar","Gabriel Kalweit","Anusha Klett","Yannick Vogt","Paula Silvestrini","Diana Laura Infante Ramirez","Roland Mertelsmann","Joschka Boedecker","Maria Kalweit"],"pdf_url":"https://arxiv.org/pdf/2312.00671v1.pdf","comment":"Medical Imaging Meets NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.00662v1","updated":"2023-12-01T15:40:30Z","published":"2023-12-01T15:40:30Z","title":"Nonparametric Variational Regularisation of Pretrained Transformers","summary":" The current paradigm of large-scale pre-training and fine-tuning Transformer\nlarge language models has lead to significant improvements across the board in\nnatural language processing. However, such large models are susceptible to\noverfitting to their training data, and as a result the models perform poorly\nwhen the domain changes. Also, due to the model's scale, the cost of\nfine-tuning the model to the new domain is large. Nonparametric Variational\nInformation Bottleneck (NVIB) has been proposed as a regulariser for training\ncross-attention in Transformers, potentially addressing the overfitting\nproblem. We extend the NVIB framework to replace all types of attention\nfunctions in Transformers, and show that existing pretrained Transformers can\nbe reinterpreted as Nonparametric Variational (NV) models using a proposed\nidentity initialisation. We then show that changing the initialisation\nintroduces a novel, information-theoretic post-training regularisation in the\nattention mechanism, which improves out-of-domain generalisation without any\ntraining. This success supports the hypothesis that pretrained Transformers are\nimplicitly NV Bayesian models.\n","authors":["Fabio Fehr","James Henderson"],"pdf_url":"https://arxiv.org/pdf/2312.00662v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00660v1","updated":"2023-12-01T15:39:24Z","published":"2023-12-01T15:39:24Z","title":"Resource-constrained knowledge diffusion processes inspired by human\n peer learning","summary":" We consider a setting where a population of artificial learners is given, and\nthe objective is to optimize aggregate measures of performance, under\nconstraints on training resources. The problem is motivated by the study of\npeer learning in human educational systems. In this context, we study natural\nknowledge diffusion processes in networks of interacting artificial learners.\nBy `natural', we mean processes that reflect human peer learning where the\nstudents' internal state and learning process is mostly opaque, and the main\ndegree of freedom lies in the formation of peer learning groups by a\ncoordinator who can potentially evaluate the learners before assigning them to\npeer groups. 
Among else, we empirically show that such processes indeed make\neffective use of the training resources, and enable the design of modular\nneural models that have the capacity to generalize without being prone to\noverfitting noisy labels.\n","authors":["Ehsan Beikihassan","Amy K. Hoover","Ioannis Koutis","Ali Parviz","Niloofar Aghaieabiane"],"pdf_url":"https://arxiv.org/pdf/2312.00660v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18765v2","updated":"2023-12-01T15:38:31Z","published":"2023-11-30T18:05:52Z","title":"MLLMs-Augmented Visual-Language Representation Learning","summary":" Visual-language pre-training (VLP) has achieved remarkable success in\nmulti-modal tasks, largely attributed to the availability of large-scale\nimage-text datasets. In this work, we demonstrate that multi-modal large\nlanguage models (MLLMs) can enhance visual-language representation learning by\nimproving data quality. Our approach is simple, utilizing MLLMs to extend\nmultiple captions for each image. To prevent the bias introduced by MLLMs'\nhallucinations and intrinsic caption styles, we propose \"text shearing\" to\nmaintain the same length for extended captions as that of the original\ncaptions. In image-text retrieval, our method consistently obtains 5.6 ~ 35.0%\nand 16.8 ~ 46.1% improvement on R@1 under the fine-tuning and zero-shot\nsettings, respectively. Notably, we obtain zero-shot results that are\ncomparable to fine-tuning on target datasets, which encourages more exploration\nof the versatile use of MLLMs.\n","authors":["Yanqing Liu","Kai Wang","Wenqi Shao","Ping Luo","Yu Qiao","Mike Zheng Shou","Kaipeng Zhang","Yang You"],"pdf_url":"https://arxiv.org/pdf/2311.18765v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.03687v2","updated":"2023-12-01T15:37:07Z","published":"2023-11-07T03:25:56Z","title":"Dissecting the Runtime Performance of the Training, Fine-tuning, and\n Inference of Large Language Models","summary":" Large Language Models (LLMs) have seen great advance in both academia and\nindustry, and their popularity results in numerous open-source frameworks and\ntechniques in accelerating LLM pre-training, fine-tuning, and inference.\nTraining and deploying LLMs are expensive as it requires considerable computing\nresources and memory, hence many efficient approaches have been developed for\nimproving system pipelines as well as operators. However, the runtime\nperformance can vary significantly across hardware and software stacks, which\nmakes it difficult to choose the best configuration. In this work, we aim to\nbenchmark the performance from both macro and micro perspectives. First, we\nbenchmark the end-to-end performance of pre-training, fine-tuning, and serving\nLLMs in different sizes , i.e., 7, 13, and 70 billion parameters (7B, 13B, and\n70B) on three 8-GPU platforms with and without individual optimization\ntechniques, including ZeRO, quantization, recomputation, FlashAttention. Then,\nwe dive deeper to provide a detailed runtime analysis of the sub-modules,\nincluding computing and communication operators in LLMs. For end users, our\nbenchmark and findings help better understand different optimization\ntechniques, training and inference frameworks, together with hardware platforms\nin choosing configurations for deploying LLMs. 
For researchers, our in-depth\nmodule-wise analyses discover potential opportunities for future work to\nfurther optimize the runtime performance of LLMs.\n","authors":["Longteng Zhang","Xiang Liu","Zeyu Li","Xinglin Pan","Peijie Dong","Ruibo Fan","Rui Guo","Xin Wang","Qiong Luo","Shaohuai Shi","Xiaowen Chu"],"pdf_url":"https://arxiv.org/pdf/2311.03687v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00656v1","updated":"2023-12-01T15:30:54Z","published":"2023-12-01T15:30:54Z","title":"Simple Transferability Estimation for Regression Tasks","summary":" We consider transferability estimation, the problem of estimating how well\ndeep learning models transfer from a source to a target task. We focus on\nregression tasks, which received little previous attention, and propose two\nsimple and computationally efficient approaches that estimate transferability\nbased on the negative regularized mean squared error of a linear regression\nmodel. We prove novel theoretical results connecting our approaches to the\nactual transferability of the optimal target models obtained from the transfer\nlearning process. Despite their simplicity, our approaches significantly\noutperform existing state-of-the-art regression transferability estimators in\nboth accuracy and efficiency. On two large-scale keypoint regression\nbenchmarks, our approaches yield 12% to 36% better results on average while\nbeing at least 27% faster than previous state-of-the-art methods.\n","authors":["Cuong N. Nguyen","Phong Tran","Lam Si Tung Ho","Vu Dinh","Anh T. Tran","Tal Hassner","Cuong V. Nguyen"],"pdf_url":"https://arxiv.org/pdf/2312.00656v1.pdf","comment":"23 pages, This paper published at The 39th Conference on Uncertainty\n in Artificial Intelligence (UAI2023)"},{"id":"http://arxiv.org/abs/2312.00655v1","updated":"2023-12-01T15:30:43Z","published":"2023-12-01T15:30:43Z","title":"Machine Learning for Health symposium 2023 -- Findings track","summary":" A collection of the accepted Findings papers that were presented at the 3rd\nMachine Learning for Health symposium (ML4H 2023), which was held on December\n10, 2023, in New Orleans, Louisiana, USA. ML4H 2023 invited high-quality\nsubmissions on relevant problems in a variety of health-related disciplines\nincluding healthcare, biomedicine, and public health. Two submission tracks\nwere offered: the archival Proceedings track, and the non-archival Findings\ntrack. Proceedings were targeted at mature work with strong technical\nsophistication and a high impact to health. The Findings track looked for new\nideas that could spark insightful discussion, serve as valuable resources for\nthe community, or could enable new collaborations. Submissions to the\nProceedings track, if not accepted, were automatically considered for the\nFindings track. All the manuscripts submitted to ML4H Symposium underwent a\ndouble-blind peer-review process.\n","authors":["Stefan Hegselmann","Antonio Parziale","Divya Shanmugam","Shengpu Tang","Mercy Nyamewaa Asiedu","Serina Chang","Thomas Hartvigsen","Harvineet Singh"],"pdf_url":"https://arxiv.org/pdf/2312.00655v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.13080v2","updated":"2023-12-01T15:18:18Z","published":"2023-02-25T13:58:37Z","title":"Does a Neural Network Really Encode Symbolic Concepts?","summary":" Recently, a series of studies have tried to extract interactions between\ninput variables modeled by a DNN and define such interactions as concepts\nencoded by the DNN. 
However, strictly speaking, there still lacks a solid\nguarantee whether such interactions indeed represent meaningful concepts.\nTherefore, in this paper, we examine the trustworthiness of interaction\nconcepts from four perspectives. Extensive empirical studies have verified that\na well-trained DNN usually encodes sparse, transferable, and discriminative\nconcepts, which is partially aligned with human intuition.\n","authors":["Mingjie Li","Quanshi Zhang"],"pdf_url":"https://arxiv.org/pdf/2302.13080v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00645v1","updated":"2023-12-01T15:16:00Z","published":"2023-12-01T15:16:00Z","title":"Hashmarks: Privacy-Preserving Benchmarks for High-Stakes AI Evaluation","summary":" There is a growing need to gain insight into language model capabilities that\nrelate to sensitive topics, such as bioterrorism or cyberwarfare. However,\ntraditional open source benchmarks are not fit for the task, due to the\nassociated practice of publishing the correct answers in human-readable form.\nAt the same time, enforcing mandatory closed-quarters evaluations might stifle\ndevelopment and erode trust. In this context, we propose hashmarking, a\nprotocol for evaluating language models in the open without having to disclose\nthe correct answers. In its simplest form, a hashmark is a benchmark whose\nreference solutions have been cryptographically hashed prior to publication.\nFollowing an overview of the proposed evaluation protocol, we go on to assess\nits resilience against traditional attack vectors (e.g. rainbow table attacks),\nas well as against failure modes unique to increasingly capable generative\nmodels.\n","authors":["Paul Bricman"],"pdf_url":"https://arxiv.org/pdf/2312.00645v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.10043v3","updated":"2023-12-01T15:06:19Z","published":"2022-06-20T23:18:59Z","title":"Classification Utility, Fairness, and Compactness via Tunable\n Information Bottleneck and Rényi Measures","summary":" Designing machine learning algorithms that are accurate yet fair, not\ndiscriminating based on any sensitive attribute, is of paramount importance for\nsociety to accept AI for critical applications. In this article, we propose a\nnovel fair representation learning method termed the R\\'enyi Fair Information\nBottleneck Method (RFIB) which incorporates constraints for utility, fairness,\nand compactness (compression) of representation, and apply it to image and\ntabular data classification. A key attribute of our approach is that we\nconsider - in contrast to most prior work - both demographic parity and\nequalized odds as fairness constraints, allowing for a more nuanced\nsatisfaction of both criteria. Leveraging a variational approach, we show that\nour objectives yield a loss function involving classical Information Bottleneck\n(IB) measures and establish an upper bound in terms of two R\\'enyi measures of\norder $\\alpha$ on the mutual information IB term measuring compactness between\nthe input and its encoded embedding. We study the influence of the $\\alpha$\nparameter as well as two other tunable IB parameters on achieving\nutility/fairness trade-off goals, and show that the $\\alpha$ parameter gives an\nadditional degree of freedom that can be used to control the compactness of the\nrepresentation. 
Experimenting on three different image datasets (EyePACS,\nCelebA, and FairFace) and two tabular datasets (Adult and COMPAS), using both\nbinary and categorical sensitive attributes, we show that on various utility,\nfairness, and compound utility/fairness metrics RFIB outperforms current\nstate-of-the-art approaches.\n","authors":["Adam Gronowski","William Paul","Fady Alajaji","Bahman Gharesifard","Philippe Burlina"],"pdf_url":"https://arxiv.org/pdf/2206.10043v3.pdf","comment":"To appear in the IEEE Transactions on Information Forensics &\n Security"},{"id":"http://arxiv.org/abs/2312.00640v1","updated":"2023-12-01T15:00:59Z","published":"2023-12-01T15:00:59Z","title":"One to beat them all: \"RYU'' -- a unifying framework for the\n construction of safe balls","summary":" In this paper, we put forth a novel framework (named ``RYU'') for the\nconstruction of ``safe'' balls, i.e. regions that provably contain the dual\nsolution of a target optimization problem. We concentrate on the standard setup\nwhere the cost function is the sum of two terms: a closed, proper, convex\nLipschitz-smooth function and a closed, proper, convex function. The RYU\nframework is shown to generalize or improve upon all the results proposed in\nthe last decade for the considered family of optimization problems.\n","authors":["Thu-Le Tran","Clément Elvira","Hong-Phuong Dang","Cédric Herzet"],"pdf_url":"https://arxiv.org/pdf/2312.00640v1.pdf","comment":"19 pages, 1 table"},{"id":"http://arxiv.org/abs/2312.00639v1","updated":"2023-12-01T14:59:43Z","published":"2023-12-01T14:59:43Z","title":"EvE: Exploiting Generative Priors for Radiance Field Enrichment","summary":" Modeling large-scale scenes from unconstrained image collections in-the-wild\nhas proven to be a major challenge in computer vision. Existing methods\ntackling in-the-wild neural rendering operate in a closed-world setting, where\nknowledge is limited to a scene's captured images within a training set. We\npropose EvE, which is, to the best of our knowledge, the first method\nleveraging generative priors to improve in-the-wild scene modeling. We employ\npre-trained generative networks to enrich K-Planes representations with\nextrinsic knowledge. To this end, we define an alternating training procedure\nto conduct optimization guidance of K-Planes trained on the training set. We\ncarry out extensive experiments and verify the merit of our method on synthetic\ndata as well as real tourism photo collections. EvE enhances rendered scenes\nwith richer details and outperforms the state of the art on the task of novel\nview synthesis in-the-wild. Our project page can be found at\nhttps://eve-nvs.github.io .\n","authors":["Karim Kassab","Antoine Schnepf","Jean-Yves Franceschi","Laurent Caraffa","Jeremie Mary","Valérie Gouet-Brunet"],"pdf_url":"https://arxiv.org/pdf/2312.00639v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00626v1","updated":"2023-12-01T14:42:37Z","published":"2023-12-01T14:42:37Z","title":"Forecasting Trends in Food Security: a Reservoir Computing Approach","summary":" Early warning systems are an essential tool for effective humanitarian\naction. Advance warnings on impending disasters facilitate timely and targeted\nresponse which help save lives, livelihoods, and scarce financial resources. In\nthis work we present a new quantitative methodology to forecast levels of food\nconsumption for 60 consecutive days, at the sub-national level, in four\ncountries: Mali, Nigeria, Syria, and Yemen. 
The methodology is built on\npublicly available data from the World Food Programme's integrated global\nhunger monitoring system which collects, processes, and displays daily updates\non key food security metrics, conflict, weather events, and other drivers of\nfood insecurity across 90 countries (https://hungermap.wfp.org/). In this\nstudy, we assessed the performance of various models including ARIMA, XGBoost,\nLSTMs, CNNs, and Reservoir Computing (RC), by comparing their Root Mean Squared\nError (RMSE) metrics. This comprehensive analysis spanned classical\nstatistical, machine learning, and deep learning approaches. Our findings\nhighlight Reservoir Computing as a particularly well-suited model in the field\nof food security given both its notable resistance to over-fitting on limited\ndata samples and its efficient training capabilities. The methodology we\nintroduce establishes the groundwork for a global, data-driven early warning\nsystem designed to anticipate and detect food insecurity.\n","authors":["Joschka Herteux","Christoph Räth","Amine Baha","Giulia Martini","Duccio Piovani"],"pdf_url":"https://arxiv.org/pdf/2312.00626v1.pdf","comment":"22 pages, 11 figures"},{"id":"http://arxiv.org/abs/2312.00622v1","updated":"2023-12-01T14:39:11Z","published":"2023-12-01T14:39:11Z","title":"Practical Path-based Bayesian Optimization","summary":" There has been a surge in interest in data-driven experimental design with\napplications to chemical engineering and drug manufacturing. Bayesian\noptimization (BO) has proven to be adaptable to such cases, since we can model\nthe reactions of interest as expensive black-box functions. Sometimes, the cost\nof this black-box functions can be separated into two parts: (a) the cost of\nthe experiment itself, and (b) the cost of changing the input parameters. In\nthis short paper, we extend the SnAKe algorithm to deal with both types of\ncosts simultaneously. We further propose extensions to the case of a maximum\nallowable input change, as well as to the multi-objective setting.\n","authors":["Jose Pablo Folch","James Odgers","Shiqiang Zhang","Robert M Lee","Behrang Shafei","David Walz","Calvin Tsay","Mark van der Wilk","Ruth Misener"],"pdf_url":"https://arxiv.org/pdf/2312.00622v1.pdf","comment":"6 main pages, 12 with references and appendix. 4 figures, 2 tables.\n To appear in NeurIPS 2023 Workshop on Adaptive Experimental Design and Active\n Learning in the Real World"},{"id":"http://arxiv.org/abs/2312.00616v1","updated":"2023-12-01T14:28:37Z","published":"2023-12-01T14:28:37Z","title":"Investigating a domain adaptation approach for integrating different\n measurement instruments in a longitudinal clinical registry","summary":" In a longitudinal clinical registry, different measurement instruments might\nhave been used for assessing individuals at different time points. To combine\nthem, we investigate deep learning techniques for obtaining a joint latent\nrepresentation, to which the items of different measurement instruments are\nmapped. This corresponds to domain adaptation, an established concept in\ncomputer science for image data. Using the proposed approach as an example, we\nevaluate the potential of domain adaptation in a longitudinal cohort setting\nwith a rather small number of time points, motivated by an application with\ndifferent motor function measurement instruments in a registry of spinal\nmuscular atrophy (SMA) patients. 
There, we model trajectories in the latent\nrepresentation by ordinary differential equations (ODEs), where person-specific\nODE parameters are inferred from baseline characteristics. The goodness of fit\nand complexity of the ODE solutions then allows to judge the measurement\ninstrument mappings. We subsequently explore how alignment can be improved by\nincorporating corresponding penalty terms into model fitting. To systematically\ninvestigate the effect of differences between measurement instruments, we\nconsider several scenarios based on modified SMA data, including scenarios\nwhere a mapping should be feasible in principle and scenarios where no perfect\nmapping is available. While misalignment increases in more complex scenarios,\nsome structure is still recovered, even if the availability of measurement\ninstruments depends on patient state. A reasonable mapping is feasible also in\nthe more complex real SMA dataset. These results indicate that domain\nadaptation might be more generally useful in statistical modeling for\nlongitudinal registry data.\n","authors":["Maren Hackenberg","Michelle Pfaffenlehner","Max Behrens","Astrid Pechmann","Janbernd Kirschner","Harald Binder"],"pdf_url":"https://arxiv.org/pdf/2312.00616v1.pdf","comment":"18 pages, 4 figures"},{"id":"http://arxiv.org/abs/2304.06767v4","updated":"2023-12-01T14:28:06Z","published":"2023-04-13T18:22:40Z","title":"RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment","summary":" Generative foundation models are susceptible to implicit biases that can\narise from extensive unsupervised training data. Such biases can produce\nsuboptimal samples, skewed outcomes, and unfairness, with potentially serious\nconsequences. Consequently, aligning these models with human ethics and\npreferences is an essential step toward ensuring their responsible and\neffective deployment in real-world applications. Prior research has primarily\nemployed Reinforcement Learning from Human Feedback (RLHF) to address this\nproblem, where generative models are fine-tuned with RL algorithms guided by a\nhuman-feedback-informed reward model. However, the inefficiencies and\ninstabilities associated with RL algorithms frequently present substantial\nobstacles to the successful alignment, necessitating the development of a more\nrobust and streamlined approach. To this end, we introduce a new framework,\nReward rAnked FineTuning (RAFT), designed to align generative models\neffectively. Utilizing a reward model and a sufficient number of samples, our\napproach selects the high-quality samples, discarding those that exhibit\nundesired behavior, and subsequently enhancing the model by fine-tuning on\nthese filtered samples. 
Our studies show that RAFT can effectively improve the\nmodel performance in both reward learning and other automated metrics in both\nlarge language models and diffusion models.\n","authors":["Hanze Dong","Wei Xiong","Deepanshu Goyal","Yihan Zhang","Winnie Chow","Rui Pan","Shizhe Diao","Jipeng Zhang","Kashun Shum","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2304.06767v4.pdf","comment":"29 pages, 12 figures, Published in Transactions on Machine Learning\n Research (TMLR)"},{"id":"http://arxiv.org/abs/2306.17670v3","updated":"2023-12-01T14:23:16Z","published":"2023-06-30T14:01:53Z","title":"Learning Delays in Spiking Neural Networks using Dilated Convolutions\n with Learnable Spacings","summary":" Spiking Neural Networks (SNNs) are a promising research direction for\nbuilding power-efficient information processing systems, especially for\ntemporal tasks such as speech recognition. In SNNs, delays refer to the time\nneeded for one spike to travel from one neuron to another. These delays matter\nbecause they influence the spike arrival times, and it is well-known that\nspiking neurons respond more strongly to coincident input spikes. More\nformally, it has been shown theoretically that plastic delays greatly increase\nthe expressivity in SNNs. Yet, efficient algorithms to learn these delays have\nbeen lacking. Here, we propose a new discrete-time algorithm that addresses\nthis issue in deep feedforward SNNs using backpropagation, in an offline\nmanner. To simulate delays between consecutive layers, we use 1D convolutions\nacross time. The kernels contain only a few non-zero weights - one per synapse\n- whose positions correspond to the delays. These positions are learned\ntogether with the weights using the recently proposed Dilated Convolution with\nLearnable Spacings (DCLS). We evaluated our method on three datasets: the\nSpiking Heidelberg Dataset (SHD), the Spiking Speech Commands (SSC) and its\nnon-spiking version Google Speech Commands v0.02 (GSC) benchmarks, which\nrequire detecting temporal patterns. We used feedforward SNNs with two or three\nhidden fully connected layers, and vanilla leaky integrate-and-fire neurons. We\nshowed that fixed random delays help and that learning them helps even more.\nFurthermore, our method outperformed the state-of-the-art in the three datasets\nwithout using recurrent connections and with substantially fewer parameters.\nOur work demonstrates the potential of delay learning in developing accurate\nand precise models for temporal data processing. Our code is based on PyTorch /\nSpikingJelly and available at: https://github.com/Thvnvtos/SNN-delays\n","authors":["Ilyass Hammouamri","Ismail Khalfaoui-Hassani","Timothée Masquelier"],"pdf_url":"https://arxiv.org/pdf/2306.17670v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2007.03155v2","updated":"2023-12-01T14:19:31Z","published":"2020-07-07T01:24:22Z","title":"Decentralized policy learning with partial observation and mechanical\n constraints for multiperson modeling","summary":" Extracting the rules of real-world multi-agent behaviors is a current\nchallenge in various scientific and engineering fields. Biological agents\nindependently have limited observation and mechanical constraints; however,\nmost of the conventional data-driven models ignore such assumptions, resulting\nin lack of biological plausibility and model interpretability for behavioral\nanalyses. 
Here we propose sequential generative models with partial observation\nand mechanical constraints in a decentralized manner, which can model agents'\ncognition and body dynamics, and predict biologically plausible behaviors. We\nformulate this as a decentralized multi-agent imitation-learning problem,\nleveraging binary partial observation and decentralized policy models based on\nhierarchical variational recurrent neural networks with physical and\nbiomechanical penalties. Using real-world basketball and soccer datasets, we\nshow the effectiveness of our method in terms of the constraint violations,\nlong-term trajectory prediction, and partial observation. Our approach can be\nused as a multi-agent simulator to generate realistic trajectories using\nreal-world data.\n","authors":["Keisuke Fujii","Naoya Takeishi","Yoshinobu Kawahara","Kazuya Takeda"],"pdf_url":"https://arxiv.org/pdf/2007.03155v2.pdf","comment":"17 pages with 7 figures and 4 tables, accepted in Neural Networks"},{"id":"http://arxiv.org/abs/2312.00600v1","updated":"2023-12-01T14:06:28Z","published":"2023-12-01T14:06:28Z","title":"Improving Plasticity in Online Continual Learning via Collaborative\n Learning","summary":" Online Continual Learning (CL) solves the problem of learning the\never-emerging new classification tasks from a continuous data stream. Unlike\nits offline counterpart, in online CL, the training data can only be seen once.\nMost existing online CL research regards catastrophic forgetting (i.e., model\nstability) as almost the only challenge. In this paper, we argue that the\nmodel's capability to acquire new knowledge (i.e., model plasticity) is another\nchallenge in online CL. While replay-based strategies have been shown to be\neffective in alleviating catastrophic forgetting, there is a notable gap in\nresearch attention toward improving model plasticity. To this end, we propose\nCollaborative Continual Learning (CCL), a collaborative learning based strategy\nto improve the model's capability in acquiring new concepts. Additionally, we\nintroduce Distillation Chain (DC), a novel collaborative learning scheme to\nboost the training of the models. We adapted CCL-DC to existing representative\nonline CL works. Extensive experiments demonstrate that even if the learners\nare well-trained with state-of-the-art online CL methods, our strategy can\nstill improve model plasticity dramatically, and thereby improve the overall\nperformance by a large margin.\n","authors":["Maorong Wang","Nicolas Michel","Ling Xiao","Toshihiko Yamasaki"],"pdf_url":"https://arxiv.org/pdf/2312.00600v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2303.06419v3","updated":"2023-12-01T14:03:15Z","published":"2023-03-11T14:57:52Z","title":"Use Perturbations when Learning from Explanations","summary":" Machine learning from explanations (MLX) is an approach to learning that uses\nhuman-provided explanations of relevant or irrelevant features for each input\nto ensure that model predictions are right for the right reasons. Existing MLX\napproaches rely on local model interpretation methods and require strong model\nsmoothing to align model and human explanations, leading to sub-optimal\nperformance. We recast MLX as a robustness problem, where human explanations\nspecify a lower dimensional manifold from which perturbations can be drawn, and\nshow both theoretically and empirically how this approach alleviates the need\nfor strong model smoothing. 
We consider various approaches to achieving\nrobustness, leading to improved performance over prior MLX methods. Finally, we\nshow how to combine robustness with an earlier MLX method, yielding\nstate-of-the-art results on both synthetic and real-world benchmarks.\n","authors":["Juyeon Heo","Vihari Piratla","Matthew Wicker","Adrian Weller"],"pdf_url":"https://arxiv.org/pdf/2303.06419v3.pdf","comment":"NeurIPS 2023; https://github.com/vihari/robust_mlx"},{"id":"http://arxiv.org/abs/2312.00592v1","updated":"2023-12-01T13:56:28Z","published":"2023-12-01T13:56:28Z","title":"Tracking Object Positions in Reinforcement Learning: A Metric for\n Keypoint Detection (extended version)","summary":" Reinforcement learning (RL) for robot control typically requires a detailed\nrepresentation of the environment state, including information about\ntask-relevant objects not directly measurable. Keypoint detectors, such as\nspatial autoencoders (SAEs), are a common approach to extracting a\nlow-dimensional representation from high-dimensional image data. SAEs aim at\nspatial features such as object positions, which are often useful\nrepresentations in robotic RL. However, whether an SAE is actually able to\ntrack objects in the scene and thus yields a spatial state representation well\nsuited for RL tasks has rarely been examined due to a lack of established\nmetrics. In this paper, we propose to assess the performance of an SAE instance\nby measuring how well keypoints track ground truth objects in images. We\npresent a computationally lightweight metric and use it to evaluate common\nbaseline SAE architectures on image data from a simulated robot task. We find\nthat common SAEs differ substantially in their spatial extraction capability.\nFurthermore, we validate that SAEs that perform well in our metric achieve\nsuperior performance when used in downstream RL. Thus, our metric is an\neffective and lightweight indicator of RL performance before executing\nexpensive RL training. Building on these insights, we identify three key\nmodifications of SAE architectures to improve tracking performance. We make our\ncode available at anonymous.4open.science/r/sae-rl.\n","authors":["Emma Cramer","Jonas Reiher","Sebastian Trimpe"],"pdf_url":"https://arxiv.org/pdf/2312.00592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.11956v4","updated":"2023-12-01T13:53:37Z","published":"2022-05-24T10:36:05Z","title":"Bandwidth Selection for Gaussian Kernel Ridge Regression via Jacobian\n Control","summary":" Most machine learning methods require tuning of hyper-parameters. For kernel\nridge regression with the Gaussian kernel, the hyper-parameter is the\nbandwidth. The bandwidth specifies the length scale of the kernel and has to be\ncarefully selected to obtain a model with good generalization. The default\nmethods for bandwidth selection, cross-validation and marginal likelihood\nmaximization, often yield good results, albeit at high computational costs.\nInspired by Jacobian regularization, we formulate an approximate expression for\nhow the derivatives of the functions inferred by kernel ridge regression with\nthe Gaussian kernel depend on the kernel bandwidth. We use this expression to\npropose a closed-form, computationally feather-light, bandwidth selection\nheuristic, based on controlling the Jacobian. In addition, the Jacobian\nexpression illuminates how the bandwidth selection is a trade-off between the\nsmoothness of the inferred function and the conditioning of the training data\nkernel matrix. 
We show on real and synthetic data that compared to\ncross-validation and marginal likelihood maximization, our method is on pair in\nterms of model performance, but up to six orders of magnitude faster.\n","authors":["Oskar Allerbo","Rebecka Jörnsten"],"pdf_url":"https://arxiv.org/pdf/2205.11956v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11046v2","updated":"2023-12-01T13:52:32Z","published":"2023-07-20T17:28:01Z","title":"A Definition of Continual Reinforcement Learning","summary":" In a standard view of the reinforcement learning problem, an agent's goal is\nto efficiently identify a policy that maximizes long-term reward. However, this\nperspective is based on a restricted view of learning as finding a solution,\nrather than treating learning as endless adaptation. In contrast, continual\nreinforcement learning refers to the setting in which the best agents never\nstop learning. Despite the importance of continual reinforcement learning, the\ncommunity lacks a simple definition of the problem that highlights its\ncommitments and makes its primary concepts precise and clear. To this end, this\npaper is dedicated to carefully defining the continual reinforcement learning\nproblem. We formalize the notion of agents that \"never stop learning\" through a\nnew mathematical language for analyzing and cataloging agents. Using this new\nlanguage, we define a continual learning agent as one that can be understood as\ncarrying out an implicit search process indefinitely, and continual\nreinforcement learning as the setting in which the best agents are all\ncontinual learning agents. We provide two motivating examples, illustrating\nthat traditional views of multi-task reinforcement learning and continual\nsupervised learning are special cases of our definition. Collectively, these\ndefinitions and perspectives formalize many intuitive concepts at the heart of\nlearning, and open new research pathways surrounding continual learning agents.\n","authors":["David Abel","André Barreto","Benjamin Van Roy","Doina Precup","Hado van Hasselt","Satinder Singh"],"pdf_url":"https://arxiv.org/pdf/2307.11046v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2305.17886v2","updated":"2023-12-01T13:51:24Z","published":"2023-05-29T05:14:36Z","title":"Action valuation of on- and off-ball soccer players based on multi-agent\n deep reinforcement learning","summary":" Analysis of invasive sports such as soccer is challenging because the game\nsituation changes continuously in time and space, and multiple agents\nindividually recognize the game situation and make decisions. Previous studies\nusing deep reinforcement learning have often considered teams as a single agent\nand valued the teams and players who hold the ball in each discrete event. Then\nit was challenging to value the actions of multiple players, including players\nfar from the ball, in a spatiotemporally continuous state space. In this paper,\nwe propose a method of valuing possible actions for on- and off-ball soccer\nplayers in a single holistic framework based on multi-agent deep reinforcement\nlearning. We consider a discrete action space in a continuous state space that\nmimics that of Google research football and leverages supervised learning for\nactions in reinforcement learning. In the experiment, we analyzed the\nrelationships with conventional indicators, season goals, and game ratings by\nexperts, and showed the effectiveness of the proposed method. 
Our approach can\nassess how multiple players move continuously throughout the game, which is\ndifficult to be discretized or labeled but vital for teamwork, scouting, and\nfan engagement.\n","authors":["Hiroshi Nakahara","Kazushi Tsutsui","Kazuya Takeda","Keisuke Fujii"],"pdf_url":"https://arxiv.org/pdf/2305.17886v2.pdf","comment":"12 pages, 4 figures, published in IEEE Access. The latest version is\n at https://ieeexplore.ieee.org/document/10328596"},{"id":"http://arxiv.org/abs/2312.00586v1","updated":"2023-12-01T13:50:55Z","published":"2023-12-01T13:50:55Z","title":"Explainable Fraud Detection with Deep Symbolic Classification","summary":" There is a growing demand for explainable, transparent, and data-driven\nmodels within the domain of fraud detection. Decisions made by fraud detection\nmodels need to be explainable in the event of a customer dispute. Additionally,\nthe decision-making process in the model must be transparent to win the trust\nof regulators and business stakeholders. At the same time, fraud detection\nsolutions can benefit from data due to the noisy, dynamic nature of fraud and\nthe availability of large historical data sets. Finally, fraud detection is\nnotorious for its class imbalance: there are typically several orders of\nmagnitude more legitimate transactions than fraudulent ones. In this paper, we\npresent Deep Symbolic Classification (DSC), an extension of the Deep Symbolic\nRegression framework to classification problems. DSC casts classification as a\nsearch problem in the space of all analytic functions composed of a vocabulary\nof variables, constants, and operations and optimizes for an arbitrary\nevaluation metric directly. The search is guided by a deep neural network\ntrained with reinforcement learning. Because the functions are mathematical\nexpressions that are in closed-form and concise, the model is inherently\nexplainable both at the level of a single classification decision and the\nmodel's decision process. Furthermore, the class imbalance problem is\nsuccessfully addressed by optimizing for metrics that are robust to class\nimbalance such as the F1 score. This eliminates the need for oversampling and\nundersampling techniques that plague traditional approaches. Finally, the model\nallows to explicitly balance between the prediction accuracy and the\nexplainability. An evaluation on the PaySim data set demonstrates competitive\npredictive performance with state-of-the-art models, while surpassing them in\nterms of explainability. This establishes DSC as a promising model for fraud\ndetection systems.\n","authors":["Samantha Visbeek","Erman Acar","Floris den Hengst"],"pdf_url":"https://arxiv.org/pdf/2312.00586v1.pdf","comment":"12 pages, 3 figures, To be published in the 3rd International\n Workshop on Explainable AI in Finance of the 4th ACM International Conference\n on AI in Finance (ICAIF, https://ai-finance.org/)"},{"id":"http://arxiv.org/abs/2312.00585v1","updated":"2023-12-01T13:50:15Z","published":"2023-12-01T13:50:15Z","title":"Adaptive Parameter-Free Robust Learning using Latent Bernoulli Variables","summary":" We present an efficient parameter-free approach for statistical learning from\ncorrupted training sets. 
We identify corrupted and non-corrupted samples using\nlatent Bernoulli variables, and therefore formulate the robust learning problem\nas maximization of the likelihood where latent variables are marginalized out.\nThe resulting optimization problem is solved via variational inference using an\nefficient Expectation-Maximization based method. The proposed approach improves\nover the state-of-the-art by automatically inferring the corruption level and\nidentifying outliers, while adding minimal computational overhead. We\ndemonstrate our robust learning method on a wide variety of machine learning\ntasks including online learning and deep learning where it exhibits ability to\nadapt to different levels of noise and attain high prediction accuracy.\n","authors":["Aleksandr Karakulev","Dave Zachariah","Prashant Singh"],"pdf_url":"https://arxiv.org/pdf/2312.00585v1.pdf","comment":"14 pages, 14 figures, and 2 tables"},{"id":"http://arxiv.org/abs/2312.00581v1","updated":"2023-12-01T13:45:42Z","published":"2023-12-01T13:45:42Z","title":"Pathway to a fully data-driven geotechnics: lessons from materials\n informatics","summary":" This paper elucidates the challenges and opportunities inherent in\nintegrating data-driven methodologies into geotechnics, drawing inspiration\nfrom the success of materials informatics. Highlighting the intricacies of soil\ncomplexity, heterogeneity, and the lack of comprehensive data, the discussion\nunderscores the pressing need for community-driven database initiatives and\nopen science movements. By leveraging the transformative power of deep\nlearning, particularly in feature extraction from high-dimensional data and the\npotential of transfer learning, we envision a paradigm shift towards a more\ncollaborative and innovative geotechnics field. The paper concludes with a\nforward-looking stance, emphasizing the revolutionary potential brought about\nby advanced computational tools like large language models in reshaping\ngeotechnics informatics.\n","authors":["Stephen Wu","Yu Otake","Yosuke Higo","Ikumasa Yoshida"],"pdf_url":"https://arxiv.org/pdf/2312.00581v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04962v4","updated":"2023-12-01T13:35:01Z","published":"2023-07-11T01:52:08Z","title":"Intrinsically motivated graph exploration using network theories of\n human curiosity","summary":" Intrinsically motivated exploration has proven useful for reinforcement\nlearning, even without additional extrinsic rewards. When the environment is\nnaturally represented as a graph, how to guide exploration best remains an open\nquestion. In this work, we propose a novel approach for exploring\ngraph-structured data motivated by two theories of human curiosity: the\ninformation gap theory and the compression progress theory. The theories view\ncuriosity as an intrinsic motivation to optimize for topological features of\nsubgraphs induced by nodes visited in the environment. We use these proposed\nfeatures as rewards for graph neural-network-based reinforcement learning. On\nmultiple classes of synthetically generated graphs, we find that trained agents\ngeneralize to longer exploratory walks and larger environments than are seen\nduring training. Our method computes more efficiently than the greedy\nevaluation of the relevant topological properties. The proposed intrinsic\nmotivations bear particular relevance for recommender systems. 
We demonstrate\nthat next-node recommendations considering curiosity are more predictive of\nhuman choices than PageRank centrality in several real-world graph\nenvironments.\n","authors":["Shubhankar P. Patankar","Mathieu Ouellet","Juan Cervino","Alejandro Ribeiro","Kieran A. Murphy","Dani S. Bassett"],"pdf_url":"https://arxiv.org/pdf/2307.04962v4.pdf","comment":"15 pages, 5 figures in main text, and 18 pages, 9 figures in\n supplement"},{"id":"http://arxiv.org/abs/2302.11259v2","updated":"2023-12-01T13:22:56Z","published":"2023-02-22T10:12:07Z","title":"Transfer Learning Enhanced Full Waveform Inversion","summary":" We propose a way to favorably employ neural networks in the field of\nnon-destructive testing using Full Waveform Inversion (FWI). The presented\nmethodology discretizes the unknown material distribution in the domain with a\nneural network within an adjoint optimization. To further increase efficiency\nof the FWI, pretrained neural networks are used to provide a good starting\npoint for the inversion. This reduces the number of iterations in the Full\nWaveform Inversion for specific, yet generalizable settings.\n","authors":["Stefan Kollmannsberger","Divya Singh","Leon Herrmann"],"pdf_url":"https://arxiv.org/pdf/2302.11259v2.pdf","comment":"7 pages, 5 figures"},{"id":"http://arxiv.org/abs/2302.01170v2","updated":"2023-12-01T13:21:51Z","published":"2023-02-02T15:48:39Z","title":"Timewarp: Transferable Acceleration of Molecular Dynamics by Learning\n Time-Coarsened Dynamics","summary":" Molecular dynamics (MD) simulation is a widely used technique to simulate\nmolecular systems, most commonly at the all-atom resolution where equations of\nmotion are integrated with timesteps on the order of femtoseconds\n($1\\textrm{fs}=10^{-15}\\textrm{s}$). MD is often used to compute equilibrium\nproperties, which requires sampling from an equilibrium distribution such as\nthe Boltzmann distribution. However, many important processes, such as binding\nand folding, occur over timescales of milliseconds or beyond, and cannot be\nefficiently sampled with conventional MD. Furthermore, new MD simulations need\nto be performed for each molecular system studied. We present Timewarp, an\nenhanced sampling method which uses a normalising flow as a proposal\ndistribution in a Markov chain Monte Carlo method targeting the Boltzmann\ndistribution. The flow is trained offline on MD trajectories and learns to make\nlarge steps in time, simulating the molecular dynamics of $10^{5} -\n10^{6}\\:\\textrm{fs}$. Crucially, Timewarp is transferable between molecular\nsystems: once trained, we show that it generalises to unseen small peptides\n(2-4 amino acids) at all-atom resolution, exploring their metastable states and\nproviding wall-clock acceleration of sampling compared to standard MD. Our\nmethod constitutes an important step towards general, transferable algorithms\nfor accelerating MD.\n","authors":["Leon Klein","Andrew Y. K. 
Foong","Tor Erlend Fjelde","Bruno Mlodozeniec","Marc Brockschmidt","Sebastian Nowozin","Frank Noé","Ryota Tomioka"],"pdf_url":"https://arxiv.org/pdf/2302.01170v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00561v1","updated":"2023-12-01T13:16:39Z","published":"2023-12-01T13:16:39Z","title":"Interior Point Constrained Reinforcement Learning with Global\n Convergence Guarantees","summary":" We consider discounted infinite horizon constrained Markov decision processes\n(CMDPs) where the goal is to find an optimal policy that maximizes the expected\ncumulative reward subject to expected cumulative constraints. Motivated by the\napplication of CMDPs in online learning of safety-critical systems, we focus on\ndeveloping an algorithm that ensures constraint satisfaction during learning.\nTo this end, we develop a zeroth-order interior point approach based on the log\nbarrier function of the CMDP. Under the commonly assumed conditions of Fisher\nnon-degeneracy and bounded transfer error of the policy parameterization, we\nestablish the theoretical properties of the algorithm. In particular, in\ncontrast to existing CMDP approaches that ensure policy feasibility only upon\nconvergence, our algorithm guarantees feasibility of the policies during the\nlearning process and converges to the optimal policy with a sample complexity\nof $O(\\varepsilon^{-6})$. In comparison to the state-of-the-art policy\ngradient-based algorithm, C-NPG-PDA, our algorithm requires an additional\n$O(\\varepsilon^{-2})$ samples to ensure policy feasibility during learning with\nsame Fisher-non-degenerate parameterization.\n","authors":["Tingting Ni","Maryam Kamgarpour"],"pdf_url":"https://arxiv.org/pdf/2312.00561v1.pdf","comment":"34 pages, no figures"},{"id":"http://arxiv.org/abs/2010.04055v2","updated":"2023-12-01T12:49:42Z","published":"2020-10-08T15:19:22Z","title":"A Unified Approach to Interpreting and Boosting Adversarial\n Transferability","summary":" In this paper, we use the interaction inside adversarial perturbations to\nexplain and boost the adversarial transferability. We discover and prove the\nnegative correlation between the adversarial transferability and the\ninteraction inside adversarial perturbations. The negative correlation is\nfurther verified through different DNNs with various inputs. Moreover, this\nnegative correlation can be regarded as a unified perspective to understand\ncurrent transferability-boosting methods. To this end, we prove that some\nclassic methods of enhancing the transferability essentially decease\ninteractions inside adversarial perturbations. Based on this, we propose to\ndirectly penalize interactions during the attacking process, which\nsignificantly improves the adversarial transferability.\n","authors":["Xin Wang","Jie Ren","Shuyun Lin","Xiangming Zhu","Yisen Wang","Quanshi Zhang"],"pdf_url":"https://arxiv.org/pdf/2010.04055v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00548v1","updated":"2023-12-01T12:48:41Z","published":"2023-12-01T12:48:41Z","title":"Domain Adaptive Imitation Learning with Visual Observation","summary":" In this paper, we consider domain-adaptive imitation learning with visual\nobservation, where an agent in a target domain learns to perform a task by\nobserving expert demonstrations in a source domain. 
Domain adaptive imitation\nlearning arises in practical scenarios where a robot, receiving visual sensory\ndata, needs to mimic movements by visually observing other robots from\ndifferent angles or observing robots of different shapes. To overcome the\ndomain shift in cross-domain imitation learning with visual observation, we\npropose a novel framework for extracting domain-independent behavioral features\nfrom input observations that can be used to train the learner, based on dual\nfeature extraction and image reconstruction. Empirical results demonstrate that\nour approach outperforms previous algorithms for imitation learning from visual\nobservation with domain shift.\n","authors":["Sungho Choi","Seungyul Han","Woojun Kim","Jongseong Chae","Whiyoung Jung","Youngchul Sung"],"pdf_url":"https://arxiv.org/pdf/2312.00548v1.pdf","comment":"Accepted to NeurIPS 2023"},{"id":"http://arxiv.org/abs/2006.15920v2","updated":"2023-12-01T12:45:49Z","published":"2020-06-29T10:24:27Z","title":"Interpreting and Disentangling Feature Components of Various Complexity\n from DNNs","summary":" This paper aims to define, quantify, and analyze the feature complexity that\nis learned by a DNN. We propose a generic definition for the feature\ncomplexity. Given the feature of a certain layer in the DNN, our method\ndisentangles feature components of different complexity orders from the\nfeature. We further design a set of metrics to evaluate the reliability, the\neffectiveness, and the significance of over-fitting of these feature\ncomponents. Furthermore, we successfully discover a close relationship between\nthe feature complexity and the performance of DNNs. As a generic mathematical\ntool, the feature complexity and the proposed metrics can also be used to\nanalyze the success of network compression and knowledge distillation.\n","authors":["Jie Ren","Mingjie Li","Zexu Liu","Quanshi Zhang"],"pdf_url":"https://arxiv.org/pdf/2006.15920v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.09020v2","updated":"2023-12-01T12:39:01Z","published":"2022-10-17T12:42:29Z","title":"Defects of Convolutional Decoder Networks in Frequency Representation","summary":" In this paper, we prove the representation defects of a cascaded\nconvolutional decoder network, considering the capacity of representing\ndifferent frequency components of an input sample. We conduct the discrete\nFourier transform on each channel of the feature map in an intermediate layer\nof the decoder network. Then, we extend the 2D circular convolution theorem to\nrepresent the forward and backward propagations through convolutional layers in\nthe frequency domain. Based on this, we prove three defects in representing\nfeature spectrums. First, we prove that the convolution operation, the\nzero-padding operation, and a set of other settings all make a convolutional\ndecoder network more likely to weaken high-frequency components. Second, we\nprove that the upsampling operation generates a feature spectrum, in which\nstrong signals repetitively appear at certain frequencies. 
Third, we prove that\nif the frequency components in the input sample and frequency components in the\ntarget output for regression have a small shift, then the decoder usually\ncannot be effectively learned.\n","authors":["Ling Tang","Wen Shen","Zhanpeng Zhou","Yuefeng Chen","Quanshi Zhang"],"pdf_url":"https://arxiv.org/pdf/2210.09020v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00540v1","updated":"2023-12-01T12:35:18Z","published":"2023-12-01T12:35:18Z","title":"Target-agnostic Source-free Domain Adaptation for Regression Tasks","summary":" Unsupervised domain adaptation (UDA) seeks to bridge the domain gap between\nthe target and source using unlabeled target data. Source-free UDA removes the\nrequirement for labeled source data at the target to preserve data privacy and\nstorage. However, work on source-free UDA assumes knowledge of domain gap\ndistribution, and hence is limited to either target-aware or classification\ntask. To overcome it, we propose TASFAR, a novel target-agnostic source-free\ndomain adaptation approach for regression tasks. Using prediction confidence,\nTASFAR estimates a label density map as the target label distribution, which is\nthen used to calibrate the source model on the target domain. We have conducted\nextensive experiments on four regression tasks with various domain gaps,\nnamely, pedestrian dead reckoning for different users, image-based people\ncounting in different scenes, housing-price prediction at different districts,\nand taxi-trip duration prediction from different departure points. TASFAR is\nshown to substantially outperform the state-of-the-art source-free UDA\napproaches by averagely reducing 22% errors for the four tasks and achieve\nnotably comparable accuracy as source-based UDA without using source data.\n","authors":["Tianlang He","Zhiqiu Xia","Jierun Chen","Haoliang Li","S. -H. Gary Chan"],"pdf_url":"https://arxiv.org/pdf/2312.00540v1.pdf","comment":"Accepted by ICDE 2024"},{"id":"http://arxiv.org/abs/2302.13095v2","updated":"2023-12-01T12:33:20Z","published":"2023-02-25T14:56:35Z","title":"Bayesian Neural Networks Avoid Encoding Complex and\n Perturbation-Sensitive Concepts","summary":" In this paper, we focus on mean-field variational Bayesian Neural Networks\n(BNNs) and explore the representation capacity of such BNNs by investigating\nwhich types of concepts are less likely to be encoded by the BNN. It has been\nobserved and studied that a relatively small set of interactive concepts\nusually emerge in the knowledge representation of a sufficiently-trained neural\nnetwork, and such concepts can faithfully explain the network output. Based on\nthis, our study proves that compared to standard deep neural networks (DNNs),\nit is less likely for BNNs to encode complex concepts. Experiments verify our\ntheoretical proofs. 
Note that the tendency to encode less complex concepts does\nnot necessarily imply weak representation power, considering that complex\nconcepts exhibit low generalization power and high adversarial vulnerability.\nThe code is available at https://github.com/sjtu-xai-lab/BNN-concepts.\n","authors":["Qihan Ren","Huiqi Deng","Yunuo Chen","Siyu Lou","Quanshi Zhang"],"pdf_url":"https://arxiv.org/pdf/2302.13095v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00538v1","updated":"2023-12-01T12:27:11Z","published":"2023-12-01T12:27:11Z","title":"A Preconditioned Interior Point Method for Support Vector Machines Using\n an ANOVA-Decomposition and NFFT-Based Matrix-Vector Products","summary":" In this paper we consider the numerical solution to the soft-margin support\nvector machine optimization problem. This problem is typically solved using the\nSMO algorithm, given the high computational complexity of traditional\noptimization algorithms when dealing with large-scale kernel matrices. In this\nwork, we propose employing an NFFT-accelerated matrix-vector product using an\nANOVA decomposition for the feature space that is used within an interior point\nmethod for the overall optimization problem. As this method requires the\nsolution of a linear system of saddle point form we suggest a preconditioning\napproach that is based on low-rank approximations of the kernel matrix together\nwith a Krylov subspace solver. We compare the accuracy of the ANOVA-based\nkernel with the default LIBSVM implementation. We investigate the performance\nof the different preconditioners as well as the accuracy of the ANOVA kernel on\nseveral large-scale datasets.\n","authors":["Theresa Wagner","John W. Pearson","Martin Stoll"],"pdf_url":"https://arxiv.org/pdf/2312.00538v1.pdf","comment":"Official Code https://github.com/wagnertheresa/NFFTSVMipm"},{"id":"http://arxiv.org/abs/2304.01811v2","updated":"2023-12-01T12:27:10Z","published":"2023-04-04T14:08:42Z","title":"HarsanyiNet: Computing Accurate Shapley Values in a Single Forward\n Propagation","summary":" The Shapley value is widely regarded as a trustworthy attribution metric.\nHowever, when people use Shapley values to explain the attribution of input\nvariables of a deep neural network (DNN), it usually requires a very high\ncomputational cost to approximate relatively accurate Shapley values in\nreal-world applications. Therefore, we propose a novel network architecture,\nthe HarsanyiNet, which makes inferences on the input sample and simultaneously\ncomputes the exact Shapley values of the input variables in a single forward\npropagation. The HarsanyiNet is designed on the theoretical foundation that the\nShapley value can be reformulated as the redistribution of Harsanyi\ninteractions encoded by the network.\n","authors":["Lu Chen","Siyu Lou","Keyan Zhang","Jin Huang","Quanshi Zhang"],"pdf_url":"https://arxiv.org/pdf/2304.01811v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00535v1","updated":"2023-12-01T12:15:49Z","published":"2023-12-01T12:15:49Z","title":"RIS-Based On-the-Air Semantic Communications -- a Diffractional Deep\n Neural Network Approach","summary":" Semantic communication has gained significant attention recently due to its\nadvantages in achieving higher transmission efficiency by focusing on semantic\ninformation instead of bit-level information. 
However, current AI-based\nsemantic communication methods require digital hardware for implementation.\nWith the rapid advancement on reconfigurable intelligence surfaces (RISs), a\nnew approach called on-the-air diffractional deep neural networks (D$^2$NN) can\nbe utilized to enable semantic communications on the wave domain. This paper\nproposes a new paradigm of RIS-based on-the-air semantic communications, where\nthe computational process occurs inherently as wireless signals pass through\nRISs. We present the system model and discuss the data and control flows of\nthis scheme, followed by a performance analysis using image transmission as an\nexample. In comparison to traditional hardware-based approaches, RIS-based\nsemantic communications offer appealing features, such as light-speed\ncomputation, low computational power requirements, and the ability to handle\nmultiple tasks simultaneously.\n","authors":["Shuyi Chen","Yingzhe Hui","Yifan Qin","Yueyi Yuan","Weixiao Meng","Xuewen Luo","Hsiao-Hwa Chen"],"pdf_url":"https://arxiv.org/pdf/2312.00535v1.pdf","comment":"17 pages, 5 figures, accepted by IEEE WCM"},{"id":"http://arxiv.org/abs/2305.15925v3","updated":"2023-12-01T11:54:20Z","published":"2023-05-25T10:53:18Z","title":"On the Identifiability of Switching Dynamical Systems","summary":" In the realm of interpretability and out-of-distribution generalisation, the\nidentifiability of latent variable models has emerged as a captivating field of\ninquiry. In this work, we delve into the identifiability of Switching Dynamical\nSystems, taking an initial stride toward extending identifiability analysis to\nsequential latent variable models. We first prove the identifiability of Markov\nSwitching Models, which commonly serve as the prior distribution for the\ncontinuous latent variables in Switching Dynamical Systems. We present\nidentification conditions for first-order Markov dependency structures, whose\ntransition distribution is parametrised via non-linear Gaussians. We then\nestablish the identifiability of the latent variables and non-linear mappings\nin Switching Dynamical Systems up to affine transformations, by leveraging\nidentifiability analysis techniques from identifiable deep latent variable\nmodels. We finally develop estimation algorithms for identifiable Switching\nDynamical Systems. Throughout empirical studies, we demonstrate the\npracticality of identifiable Switching Dynamical Systems for segmenting\nhigh-dimensional time series such as videos, and showcase the use of\nidentifiable Markov Switching Models for regime-dependent causal discovery in\nclimate data.\n","authors":["Carles Balsells-Rodas","Yixin Wang","Yingzhen Li"],"pdf_url":"https://arxiv.org/pdf/2305.15925v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.06108v4","updated":"2023-12-01T11:52:21Z","published":"2022-11-11T10:24:42Z","title":"RaLiBEV: Radar and LiDAR BEV Fusion Learning for Anchor Box Free Object\n Detection Systems","summary":" In autonomous driving, LiDAR and radar play important roles in the perception\nof the surrounding environment. LiDAR provides accurate 3D spatial sensing\ninformation but cannot work in adverse weather like fog. On the other hand, the\nradar signal can be diffracted when encountering raindrops or mist particles\nthanks to its wavelength, but it suffers from large noise. Recent\nstate-of-the-art works reveal that fusion of radar and LiDAR can lead to robust\ndetection in adverse weather. 
The existing works adopt convolutional neural\nnetwork architecture to extract features from each sensor data, then align and\naggregate the two branch features to predict object detection results. However,\nthese methods have low accuracy of bounding box estimations due to a simple\ndesign of label assignment and fusion strategies. In this paper, we propose a\nbird's-eye view fusion learning-based anchor box-free object detection system,\nwhich fuses the feature derived from the radar range-azimuth heatmap and the\nLiDAR point cloud to estimate possible objects. Different label assignment\nstrategies have been designed to facilitate the consistency between the\nclassification of foreground or background anchor points and the corresponding\nbounding box regressions. Furthermore, the performance of the proposed object\ndetector is further enhanced by employing a novel interactive transformer\nmodule. The superior performance of the methods proposed in this paper has been\ndemonstrated using the recently published Oxford Radar RobotCar dataset. Our\nsystem's average precision significantly outperforms the state-of-the-art\nmethod by 13.1% and 19.0% at IoU of 0.8 under 'Clear+Foggy' training conditions\nfor 'Clear' and 'Foggy' testing, respectively.\n","authors":["Yanlong Yang","Jianan Liu","Tao Huang","Qing-Long Han","Gang Ma","Bing Zhu"],"pdf_url":"https://arxiv.org/pdf/2211.06108v4.pdf","comment":"14 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.00516v1","updated":"2023-12-01T11:43:49Z","published":"2023-12-01T11:43:49Z","title":"Spatio-Temporal-Decoupled Masked Pre-training for Traffic Forecasting","summary":" Accurate forecasting of multivariate traffic flow time series remains\nchallenging due to substantial spatio-temporal heterogeneity and complex\nlong-range correlative patterns. To address this, we propose\nSpatio-Temporal-Decoupled Masked Pre-training (STD-MAE), a novel framework that\nemploys masked autoencoders to learn and encode complex spatio-temporal\ndependencies via pre-training. Specifically, we use two decoupled masked\nautoencoders to reconstruct the traffic data along spatial and temporal axes\nusing a self-supervised pre-training approach. These mask reconstruction\nmechanisms capture the long-range correlations in space and time separately.\nThe learned hidden representations are then used to augment the downstream\nspatio-temporal traffic predictor. A series of quantitative and qualitative\nevaluations on four widely-used traffic benchmarks (PEMS03, PEMS04, PEMS07, and\nPEMS08) are conducted to verify the state-of-the-art performance, with STD-MAE\nexplicitly enhancing the downstream spatio-temporal models' ability to capture\nlong-range intricate spatial and temporal patterns. Codes are available at\nhttps://github.com/Jimmy-7664/STD_MAE.\n","authors":["Haotian Gao","Renhe Jiang","Zheng Dong","Jinliang Deng","Xuan Song"],"pdf_url":"https://arxiv.org/pdf/2312.00516v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.07312v3","updated":"2023-12-01T11:38:20Z","published":"2023-10-11T08:57:59Z","title":"Diffusion Models for Wireless Communications","summary":" Innovative foundation models, such as GPT-4 and stable diffusion models, have\nmade a paradigm shift in the realm of artificial intelligence (AI) towards\ngenerative AI-based systems. AI and machine learning (AI/ML) algorithms are\nenvisioned to be pervasively incorporated into the future wireless\ncommunications systems. 
In this article, we outline the applications of\ndiffusion models in wireless communication systems, which are a new family of\nprobabilistic generative models that have showcased state-of-the-art\nperformance. The key idea is to decompose data generation process over\n\"denoising\" steps, gradually generating samples out of noise. Based on two case\nstudies presented, we show how diffusion models can be employed for the\ndevelopment of resilient AI-native communication systems. Specifically, we\npropose denoising diffusion probabilistic models (DDPM) for a wireless\ncommunication scheme with non-ideal transceivers, where 30% improvement is\nachieved in terms of bit error rate. In the other example, DDPM is employed at\nthe transmitter to shape the constellation symbols, highlighting a robust\nout-of-distribution performance.\n","authors":["Mehdi Letafati","Samad Ali","Matti Latva-aho"],"pdf_url":"https://arxiv.org/pdf/2310.07312v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00509v1","updated":"2023-12-01T11:30:51Z","published":"2023-12-01T11:30:51Z","title":"Bayesian causal discovery from unknown general interventions","summary":" We consider the problem of learning causal Directed Acyclic Graphs (DAGs)\nusing combinations of observational and interventional experimental data.\nCurrent methods tailored to this setting assume that interventions either\ndestroy parent-child relations of the intervened (target) nodes or only alter\nsuch relations without modifying the parent sets, even when the intervention\ntargets are unknown. We relax this assumption by proposing a Bayesian method\nfor causal discovery from general interventions, which allow for modifications\nof the parent sets of the unknown targets. Even in this framework, DAGs and\ngeneral interventions may be identifiable only up to some equivalence classes.\nWe provide graphical characterizations of such interventional Markov\nequivalence and devise compatible priors for Bayesian inference that guarantee\nscore equivalence of indistinguishable structures. We then develop a Markov\nChain Monte Carlo (MCMC) scheme to approximate the posterior distribution over\nDAGs, intervention targets and induced parent sets. Finally, we evaluate the\nproposed methodology on both simulated and real protein expression data.\n","authors":["Alessandro Mascaro","Federico Castelletti"],"pdf_url":"https://arxiv.org/pdf/2312.00509v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00507v1","updated":"2023-12-01T11:22:10Z","published":"2023-12-01T11:22:10Z","title":"VEXIR2Vec: An Architecture-Neutral Embedding Framework for Binary\n Similarity","summary":" We propose VEXIR2Vec, a code embedding framework for finding similar\nfunctions in binaries. Our representations rely on VEX IR, the intermediate\nrepresentation used by binary analysis tools like Valgrind and angr. Our\nproposed embeddings encode both syntactic and semantic information to represent\na function, and is both application and architecture independent. We also\npropose POV, a custom Peephole Optimization engine that normalizes the VEX IR\nfor effective similarity analysis. We design several optimizations like\ncopy/constant propagation, constant folding, common subexpression elimination\nand load-store elimination in POV.\n We evaluate our framework on two experiments -- diffing and searching --\ninvolving binaries targeting different architectures, compiled using different\ncompilers and versions, optimization sequences, and obfuscations. 
We show\nresults on several standard projects and on real-world vulnerabilities. Our\nresults show that VEXIR2Vec achieves superior precision and recall values\ncompared to the state-of-the-art works. Our framework is highly scalable and is\nbuilt as a multi-threaded, parallel library by only using open-source tools.\nVEXIR2Vec achieves about $3.2 \\times$ speedup on the closest competitor, and\norders-of-magnitude speedup on other tools.\n","authors":["S. VenkataKeerthy","Yashas Andaluri","Sayan Dey","Soumya Banerjee","Ramakrishna Upadrasta"],"pdf_url":"https://arxiv.org/pdf/2312.00507v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2104.13669v4","updated":"2023-12-01T11:10:59Z","published":"2021-04-28T09:47:21Z","title":"Optimal Stopping via Randomized Neural Networks","summary":" This paper presents the benefits of using randomized neural networks instead\nof standard basis functions or deep neural networks to approximate the\nsolutions of optimal stopping problems. The key idea is to use neural networks,\nwhere the parameters of the hidden layers are generated randomly and only the\nlast layer is trained, in order to approximate the continuation value. Our\napproaches are applicable to high dimensional problems where the existing\napproaches become increasingly impractical. In addition, since our approaches\ncan be optimized using simple linear regression, they are easy to implement and\ntheoretical guarantees can be provided. We test our approaches for American\noption pricing on Black--Scholes, Heston and rough Heston models and for\noptimally stopping a fractional Brownian motion. In all cases, our algorithms\noutperform the state-of-the-art and other relevant machine learning approaches\nin terms of computation time while achieving comparable results. Moreover, we\nshow that they can also be used to efficiently compute Greeks of American\noptions.\n","authors":["Calypso Herrera","Florian Krach","Pierre Ruyssen","Josef Teichmann"],"pdf_url":"https://arxiv.org/pdf/2104.13669v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00502v1","updated":"2023-12-01T11:06:00Z","published":"2023-12-01T11:06:00Z","title":"On the Out-Of-Distribution Robustness of Self-Supervised Representation\n Learning for Phonocardiogram Signals","summary":" Objective: Despite the recent increase in research activity, deep-learning\nmodels have not yet been widely accepted in medicine. The shortage of\nhigh-quality annotated data often hinders the development of robust and\ngeneralizable models, which do not suffer from degraded effectiveness when\npresented with newly-collected, out-of-distribution (OOD) datasets. Methods:\nContrastive Self-Supervised Learning (SSL) offers a potential solution to the\nscarcity of labeled data as it takes advantage of unlabeled data to increase\nmodel effectiveness and robustness. In this research, we propose applying\ncontrastive SSL for detecting abnormalities in phonocardiogram (PCG) samples by\nlearning a generalized representation of the signal. Specifically, we perform\nan extensive comparative evaluation of a wide range of audio-based\naugmentations and evaluate trained classifiers on multiple datasets across\ndifferent downstream tasks. Results: We experimentally demonstrate that,\ndepending on its training distribution, the effectiveness of a fully-supervised\nmodel can degrade up to 32% when evaluated on unseen data, while SSL models\nonly lose up to 10% or even improve in some cases. 
Conclusions: Contrastive SSL\npretraining can assist in providing robust classifiers which can generalize to\nunseen, OOD data, without relying on time- and labor-intensive annotation\nprocesses by medical experts. Furthermore, the proposed extensive evaluation\nprotocol sheds light on the most promising and appropriate augmentations for\nrobust PCG signal processing. Significance: We provide researchers and\npractitioners with a roadmap towards producing robust models for PCG\nclassification, in addition to an open-source codebase for developing novel\napproaches.\n","authors":["Aristotelis Ballas","Vasileios Papapanagiotou","Christos Diou"],"pdf_url":"https://arxiv.org/pdf/2312.00502v1.pdf","comment":"PREPRINT Manuscript under review"},{"id":"http://arxiv.org/abs/2312.00486v1","updated":"2023-12-01T10:34:22Z","published":"2023-12-01T10:34:22Z","title":"REDUCR: Robust Data Downsampling Using Class Priority Reweighting","summary":" Modern machine learning models are becoming increasingly expensive to train\nfor real-world image and text classification tasks, where massive web-scale\ndata is collected in a streaming fashion. To reduce the training cost, online\nbatch selection techniques have been developed to choose the most informative\ndatapoints. However, these techniques can suffer from poor worst-class\ngeneralization performance due to class imbalance and distributional shifts.\nThis work introduces REDUCR, a robust and efficient data downsampling method\nthat uses class priority reweighting. REDUCR reduces the training data while\npreserving worst-class generalization performance. REDUCR assigns priority\nweights to datapoints in a class-aware manner using an online learning\nalgorithm. We demonstrate the data efficiency and robust performance of REDUCR\non vision and text classification tasks. On web-scraped datasets with\nimbalanced class distributions, REDUCR significantly improves worst-class test\naccuracy (and average accuracy), surpassing state-of-the-art methods by around\n15%.\n","authors":["William Bankes","George Hughes","Ilija Bogunovic","Zi Wang"],"pdf_url":"https://arxiv.org/pdf/2312.00486v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2312.00485v1","updated":"2023-12-01T10:34:03Z","published":"2023-12-01T10:34:03Z","title":"Backbone-based Dynamic Graph Spatio-Temporal Network for Epidemic\n Forecasting","summary":" Accurate epidemic forecasting is a critical task in controlling disease\ntransmission. Many deep learning-based models focus only on static or dynamic\ngraphs when constructing spatial information, ignoring their relationship.\nAdditionally, these models often rely on recurrent structures, which can lead\nto error accumulation and computational time consumption. To address the\naforementioned problems, we propose a novel model called Backbone-based Dynamic\nGraph Spatio-Temporal Network (BDGSTN). Intuitively, the continuous and smooth\nchanges in graph structure, make adjacent graph structures share a basic\npattern. To capture this property, we use adaptive methods to generate static\nbackbone graphs containing the primary information and temporal models to\ngenerate dynamic temporal graphs of epidemic data, fusing them to generate a\nbackbone-based dynamic graph. To overcome potential limitations associated with\nrecurrent structures, we introduce a linear model DLinear to handle temporal\ndependencies and combine it with dynamic graph convolution for epidemic\nforecasting. 
Extensive experiments on two datasets demonstrate that BDGSTN\noutperforms baseline models and ablation comparison further verifies the\neffectiveness of model components. Furthermore, we analyze and measure the\nsignificance of backbone and temporal graphs by using information metrics from\ndifferent aspects. Finally, we compare model parameter volume and training time\nto confirm the superior complexity and efficiency of BDGSTN.\n","authors":["Junkai Mao","Yuexing Han","Gouhei Tanaka","Bing Wang"],"pdf_url":"https://arxiv.org/pdf/2312.00485v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13494v5","updated":"2023-12-01T10:33:36Z","published":"2023-07-25T13:42:22Z","title":"Duet: efficient and scalable hybriD neUral rElation undersTanding","summary":" Learned cardinality estimation methods have achieved high precision compared\nto traditional methods. Among learned methods, query-driven approaches have\nfaced the workload drift problem for a long time. Although both data-driven and\nhybrid methods are proposed to avoid this problem, most of them suffer from\nhigh training and estimation costs, limited scalability, instability, and\nlong-tail distribution problems on high-dimensional tables, which seriously\naffects the practical application of learned cardinality estimators. In this\npaper, we prove that most of these problems are directly caused by the widely\nused progressive sampling. We solve this problem by introducing predicate\ninformation into the autoregressive model and propose Duet, a stable,\nefficient, and scalable hybrid method to estimate cardinality directly without\nsampling or any non-differentiable process, which can not only reduce the\ninference complexity from $O(n)$ to $O(1)$ compared to Naru and UAE but also\nachieve higher accuracy on high cardinality and high-dimensional tables.\nExperimental results show that Duet can achieve all the design goals above and\nbe much more practical. Besides, Duet even has a lower inference cost on CPU\nthan that of most learned methods on GPU.\n","authors":["Kaixin Zhang","Hongzhi Wang","Yabin Lu","Ziqi Li","Chang Shu","Yu Yan","Donghua Yang"],"pdf_url":"https://arxiv.org/pdf/2307.13494v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00484v1","updated":"2023-12-01T10:33:16Z","published":"2023-12-01T10:33:16Z","title":"MultiView Independent Component Analysis with Delays","summary":" Linear Independent Component Analysis (ICA) is a blind source separation\ntechnique that has been used in various domains to identify independent latent\nsources from observed signals. In order to obtain a higher signal-to-noise\nratio, the presence of multiple views of the same sources can be used. In this\nwork, we present MultiView Independent Component Analysis with Delays (MVICAD).\nThis algorithm builds on the MultiView ICA model by allowing sources to be\ndelayed versions of some shared sources: sources are shared across views up to\nsome unknown latencies that are view- and source-specific. Using simulations,\nwe demonstrate that MVICAD leads to better unmixing of the sources. 
Moreover,\nas ICA is often used in neuroscience, we show that latencies are age-related\nwhen applied to Cam-CAN, a large-scale magnetoencephalography (MEG) dataset.\nThese results demonstrate that the MVICAD model can reveal rich effects on\nneural signals without human supervision.\n","authors":["Ambroise Heurtebise","Pierre Ablin","Alexandre Gramfort"],"pdf_url":"https://arxiv.org/pdf/2312.00484v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.04885v3","updated":"2023-12-01T10:31:32Z","published":"2023-09-09T22:27:38Z","title":"Symplectic Structure-Aware Hamiltonian (Graph) Embeddings","summary":" In traditional Graph Neural Networks (GNNs), the assumption of a fixed\nembedding manifold often limits their adaptability to diverse graph geometries.\nRecently, Hamiltonian system-inspired GNNs have been proposed to address the\ndynamic nature of such embeddings by incorporating physical laws into node\nfeature updates. We present Symplectic Structure-Aware Hamiltonian GNN\n(SAH-GNN), a novel approach that generalizes Hamiltonian dynamics for more\nflexible node feature updates. Unlike existing Hamiltonian approaches, SAH-GNN\nemploys Riemannian optimization on the symplectic Stiefel manifold to\nadaptively learn the underlying symplectic structure, circumventing the\nlimitations of existing Hamiltonian GNNs that rely on a pre-defined form of\nstandard symplectic structure. This innovation allows SAH-GNN to automatically\nadapt to various graph datasets without extensive hyperparameter tuning.\nMoreover, it conserves energy during training, meaning the implicit Hamiltonian\nsystem is physically meaningful. Finally, we empirically validate SAH-GNN's\nsuperiority and adaptability in node classification tasks across multiple types\nof graph datasets.\n","authors":["Jiaxu Liu","Xinping Yi","Tianle Zhang","Xiaowei Huang"],"pdf_url":"https://arxiv.org/pdf/2309.04885v3.pdf","comment":"5 pages main content with 5 pages appendix"},{"id":"http://arxiv.org/abs/2312.00477v1","updated":"2023-12-01T10:18:50Z","published":"2023-12-01T10:18:50Z","title":"Interpretable Meta-Learning of Physical Systems","summary":" Machine learning methods can be a valuable aid in the scientific process, but\nthey need to face challenging settings where data come from inhomogeneous\nexperimental conditions. Recent meta-learning methods have made significant\nprogress in multi-task learning, but they rely on black-box neural networks,\nresulting in high computational costs and limited interpretability. Leveraging\nthe structure of the learning problem, we argue that multi-environment\ngeneralization can be achieved using a simpler learning model, with an affine\nstructure with respect to the learning task. Crucially, we prove that this\narchitecture can identify the physical parameters of the system, enabling\ninterpretable learning. We demonstrate the competitive generalization\nperformance and the low computational cost of our method by comparing it to\nstate-of-the-art algorithms on physical systems, ranging from toy models to\ncomplex, non-analytical systems. 
The interpretability of our method is\nillustrated with original applications to physical-parameter-induced adaptation\nand to adaptive control.\n","authors":["Matthieu Blanke","Marc Lelarge"],"pdf_url":"https://arxiv.org/pdf/2312.00477v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00471v1","updated":"2023-12-01T10:10:18Z","published":"2023-12-01T10:10:18Z","title":"A Bayesian approach for prompt optimization in pre-trained language\n models","summary":" A prompt is a sequence of symbols or tokens, selected from a vocabulary\naccording to some rule, which is prepended/concatenated to a textual query. A\nkey problem is how to select the sequence of tokens: in this paper we formulate\nit as a combinatorial optimization problem. The high dimensionality of the\ntoken space compounded by the length of the prompt sequence requires a very\nefficient solution. In this paper we propose a Bayesian optimization method,\nexecuted in a continuous embedding of the combinatorial space. In this paper\nwe focus on hard prompt tuning (HPT) which directly searches for discrete\ntokens to be added to the text input without requiring access to the large\nlanguage model (LLM) and can be used also when the LLM is available only as a\nblack-box. This is critically important if LLMs are made available in the Model\nas a Service (MaaS) manner as in GPT-4. The current manuscript is focused on\nthe optimization of discrete prompts for classification tasks. The discrete\nprompts give rise to difficult combinatorial optimization problems which easily\nbecome intractable given the dimension of the token space in realistic\napplications. The optimization method considered in this paper is Bayesian\noptimization (BO) which has become the dominant approach in black-box\noptimization for its sample efficiency along with its modular structure and\nversatility. In this paper we use BoTorch, a library for Bayesian optimization\nresearch built on top of PyTorch. Albeit preliminary and obtained using a\n'vanilla' version of BO, the experiments on RoBERTa on six benchmarks show a\ngood performance across a variety of tasks and enable an analysis of the\ntradeoff between size of the search space, accuracy and wall clock time.\n","authors":["Antonio Sabbatella","Andrea Ponti","Antonio Candelieri","Ilaria Giordani","Francesco Archetti"],"pdf_url":"https://arxiv.org/pdf/2312.00471v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.15578v3","updated":"2023-12-01T11:38:20Z","published":"2023-10-11T08:57:59Z","title":"VMAF Re-implementation on PyTorch: Some Experimental Results","summary":" Based on the standard VMAF implementation we propose an implementation of\nVMAF using the PyTorch framework. For this implementation, comparisons with the\nstandard (libvmaf) show a discrepancy $\\lesssim 10^{-2}$ in VMAF units. We\ninvestigate gradient computation when using VMAF as an objective function and\ndemonstrate that training using this function does not result in ill-behaving\ngradients. The implementation is then used to train a preprocessing filter. It\nis demonstrated that its performance is superior to the unsharp masking filter.\nThe resulting filter is also easy to implement and can be applied in\nvideo processing tasks for video compression improvement. 
This is confirmed by\nthe results of numerical experiments.\n","authors":["Kirill Aistov","Maxim Koroteev"],"pdf_url":"https://arxiv.org/pdf/2310.15578v3.pdf","comment":"6 pages, 3 figures"},{"id":"http://arxiv.org/abs/2312.00456v1","updated":"2023-12-01T09:41:40Z","published":"2023-12-01T09:41:40Z","title":"Auto-encoding GPS data to reveal individual and collective behaviour","summary":" We propose an innovative and generic methodology to analyse individual and\ncollective behaviour through individual trajectory data. The work is motivated\nby the analysis of GPS trajectories of fishing vessels collected from\nregulatory tracking data in the context of marine biodiversity conservation and\necosystem-based fisheries management. We build a low-dimensional latent\nrepresentation of trajectories using convolutional neural networks as\nnon-linear mapping. This is done by training a conditional variational\nauto-encoder taking into account covariates. The posterior distributions of the\nlatent representations can be linked to the characteristics of the actual\ntrajectories. The latent distributions of the trajectories are compared with\nthe Bhattacharyya coefficient, which is well-suited for comparing\ndistributions. Using this coefficient, we analyse the variation of the\nindividual behaviour of each vessel during time. For collective behaviour\nanalysis, we build proximity graphs and use an extension of the stochastic\nblock model for multiple networks. This model results in a clustering of the\nindividuals based on their set of trajectories. The application to French\nfishing vessels enables us to obtain groups of vessels whose individual and\ncollective behaviours exhibit spatio-temporal patterns over the period\n2014-2018.\n","authors":["Saint-Clair Chabert-Liddell","Nicolas Bez","Pierre Gloaguen","Sophie Donnet","Stéphanie Mahévas"],"pdf_url":"https://arxiv.org/pdf/2312.00456v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00455v1","updated":"2023-12-01T09:40:27Z","published":"2023-12-01T09:40:27Z","title":"Meta-Diversity Search in Complex Systems, A Recipe for Artificial\n Open-Endedness ?","summary":" Can we build an artificial system that would be able to generate endless\nsurprises if ran \"forever\" in Minecraft? While there is not a single path\ntoward solving that grand challenge, this article presents what we believe to\nbe some working ingredients for the endless generation of novel increasingly\ncomplex artifacts in Minecraft. Our framework for an open-ended system includes\ntwo components: a complex system used to recursively grow and complexify\nartifacts over time, and a discovery algorithm that leverages the concept of\nmeta-diversity search. Since complex systems have shown to enable the emergence\nof considerable complexity from set of simple rules, we believe them to be\ngreat candidates to generate all sort of artifacts in Minecraft. Yet, the space\nof possible artifacts that can be generated by these systems is often unknown,\nchallenging to characterize and explore. Therefore automating the long-term\ndiscovery of novel and increasingly complex artifacts in these systems is an\nexciting research field. To approach these challenges, we formulate the problem\nof meta-diversity search where an artificial \"discovery assistant\"\nincrementally learns a diverse set of representations to characterize behaviors\nand searches to discover diverse patterns within each of them. 
A successful\ndiscovery assistant should continuously seek for novel sources of diversities\nwhile being able to quickly specialize the search toward a new unknown type of\ndiversity. To implement those ideas in the Minecraft environment, we simulate\nan artificial \"chemistry\" system based on Lenia continuous cellular automaton\nfor generating artifacts, as well as an artificial \"discovery assistant\"\n(called Holmes) for the artifact-discovery process. Holmes incrementally learns\na hierarchy of modular representations to characterize divergent sources of\ndiversity and uses a goal-based intrinsically-motivated exploration as the\ndiversity search strategy.\n","authors":["Mayalen Etcheverry","Bert Wang-Chak Chan","Clément Moulin-Frier","Pierre-Yves Oudeyer"],"pdf_url":"https://arxiv.org/pdf/2312.00455v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00454v1","updated":"2023-12-01T09:34:28Z","published":"2023-12-01T09:34:28Z","title":"An Encoding Framework for Binarized Images using HyperDimensional\n Computing","summary":" Hyperdimensional Computing (HDC) is a brain-inspired and light-weight machine\nlearning method. It has received significant attention in the literature as a\ncandidate to be applied in the wearable internet of things, near-sensor\nartificial intelligence applications and on-device processing. HDC is\ncomputationally less complex than traditional deep learning algorithms and\ntypically achieves moderate to good classification performance. A key aspect\nthat determines the performance of HDC is the encoding of the input data to the\nhyperdimensional (HD) space. This article proposes a novel light-weight\napproach relying only on native HD arithmetic vector operations to encode\nbinarized images that preserves similarity of patterns at nearby locations by\nusing point of interest selection and local linear mapping. The method reaches\nan accuracy of 97.35% on the test set for the MNIST data set and 84.12% for the\nFashion-MNIST data set. These results outperform other studies using baseline\nHDC with different encoding approaches and are on par with more complex hybrid\nHDC models. The proposed encoding approach also demonstrates a higher\nrobustness to noise and blur compared to the baseline encoding.\n","authors":["Laura Smets","Werner Van Leekwijck","Ing Jyh Tsang","Steven Latré"],"pdf_url":"https://arxiv.org/pdf/2312.00454v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.06777v2","updated":"2023-12-01T09:33:21Z","published":"2023-05-11T12:59:21Z","title":"Generating high-quality 3DMPCs by adaptive data acquisition and\n NeREF-based radiometric calibration with UGV plant phenotyping system","summary":" Fusion of 3D and MS imaging data has a great potential for high-throughput\nplant phenotyping of structural and biochemical as well as physiological traits\nsimultaneously, which is important for decision support in agriculture and for\ncrop breeders in selecting the best genotypes. However, lacking of 3D data\nintegrity of various plant canopy structures and low-quality of MS images\ncaused by the complex illumination effects make a great challenge, especially\nat the proximal imaging scale. Therefore, this study proposed a novel approach\nfor adaptive data acquisition and radiometric calibration to generate\nhigh-quality 3DMPCs of plants. An efficient NBV planning method based on an UGV\nplant phenotyping system with a multi-sensor-equipped robotic arm was proposed\nto achieve adaptive data acquisition. 
The NeREF was employed to predict the DN\nvalues of the hemispherical reference for radiometric calibration. For NBV\nplanning, the average total time for single plant at a joint speed of 1.55\nrad/s was about 62.8 s, with an average reduction of 18.0% compared to the\nunplanned. The integrity of the whole-plant data was improved by an average of\n23.6% compared to the fixed viewpoints alone. Compared with the ASD\nmeasurements, the RMSE of the reflectance spectra obtained from 3DMPCs at\ndifferent regions of interest was 0.08 with an average decrease of 58.93%\ncompared to the results obtained from the single-frame of MS images without 3D\nradiometric calibration. The 3D-calibrated plant 3DMPCs improved the predictive\naccuracy of PLSR for chlorophyll content, with an average increase of 0.07 in\nR2 and an average decrease of 21.25% in RMSE. Our approach introduced a fresh\nperspective on generating high-quality 3DMPCs of plants under the natural light\ncondition, enabling more precise analysis of plant morphological and\nphysiological parameters.\n","authors":["Pengyao Xie","Zhihong Ma","Ruiming Du","Xin Yang","Haiyan Cen"],"pdf_url":"https://arxiv.org/pdf/2305.06777v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00434v1","updated":"2023-12-01T09:06:06Z","published":"2023-12-01T09:06:06Z","title":"PEFTDebias : Capturing debiasing information using PEFTs","summary":" The increasing use of foundation models highlights the urgent need to address\nand eliminate implicit biases present in them that arise during pretraining. In\nthis paper, we introduce PEFTDebias, a novel approach that employs\nparameter-efficient fine-tuning (PEFT) to mitigate the biases within foundation\nmodels. PEFTDebias consists of two main phases: an upstream phase for acquiring\ndebiasing parameters along a specific bias axis, and a downstream phase where\nthese parameters are incorporated into the model and frozen during the\nfine-tuning process. By evaluating on four datasets across two bias axes namely\ngender and race, we find that downstream biases can be effectively reduced with\nPEFTs. In addition, we show that these parameters possess axis-specific\ndebiasing characteristics, enabling their effective transferability in\nmitigating biases in various downstream tasks. To ensure reproducibility, we\nrelease the code to do our experiments.\n","authors":["Sumit Agarwal","Aditya Srikanth Veerubhotla","Srijan Bansal"],"pdf_url":"https://arxiv.org/pdf/2312.00434v1.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2312.00427v1","updated":"2023-12-01T08:50:42Z","published":"2023-12-01T08:50:42Z","title":"From Mutual Information to Expected Dynamics: New Generalization Bounds\n for Heavy-Tailed SGD","summary":" Understanding the generalization abilities of modern machine learning\nalgorithms has been a major research topic over the past decades. In recent\nyears, the learning dynamics of Stochastic Gradient Descent (SGD) have been\nrelated to heavy-tailed dynamics. This has been successfully applied to\ngeneralization theory by exploiting the fractal properties of those dynamics.\nHowever, the derived bounds depend on mutual information (decoupling) terms\nthat are beyond the reach of computability. In this work, we prove\ngeneralization bounds over the trajectory of a class of heavy-tailed dynamics,\nwithout those mutual information terms. 
Instead, we introduce a geometric\ndecoupling term by comparing the learning dynamics (depending on the empirical\nrisk) with an expected one (depending on the population risk). We further\nupper-bound this geometric term, by using techniques from the heavy-tailed and\nthe fractal literature, making it fully computable. Moreover, as an attempt to\ntighten the bounds, we propose a PAC-Bayesian setting based on perturbed\ndynamics, in which the same geometric term plays a crucial role and can still\nbe bounded using the techniques described above.\n","authors":["Benjamin Dupuis","Paul Viallard"],"pdf_url":"https://arxiv.org/pdf/2312.00427v1.pdf","comment":"Accepted in the NeurIPS 2023 Workshop Heavy Tails in Machine Learning"},{"id":"http://arxiv.org/abs/2311.15222v2","updated":"2023-12-01T08:43:14Z","published":"2023-11-26T07:23:37Z","title":"Decision Tree Psychological Risk Assessment in Currency Trading","summary":" This research paper focuses on the integration of Artificial Intelligence\n(AI) into the currency trading landscape, positing the development of\npersonalized AI models, essentially functioning as intelligent personal\nassistants tailored to the idiosyncrasies of individual traders. The paper\nposits that AI models are capable of identifying nuanced patterns within the\ntrader's historical data, facilitating a more accurate and insightful\nassessment of psychological risk dynamics in currency trading. The PRI is a\ndynamic metric that experiences fluctuations in response to market conditions\nthat foster psychological fragility among traders. By employing sophisticated\ntechniques, a classifying decision tree is crafted, enabling clearer\ndecision-making boundaries within the tree structure. By incorporating the\nuser's chronological trade entries, the model becomes adept at identifying\ncritical junctures when psychological risks are heightened. The real-time\nnature of the calculations enhances the model's utility as a proactive tool,\noffering timely alerts to traders about impending moments of psychological\nrisks. The implications of this research extend beyond the confines of currency\ntrading, reaching into the realms of other industries where the judicious\napplication of personalized modeling emerges as an efficient and strategic\napproach. This paper positions itself at the intersection of cutting-edge\ntechnology and the intricate nuances of human psychology, offering a\ntransformative paradigm for decision making support in dynamic and\nhigh-pressure environments.\n","authors":["Jai Pal"],"pdf_url":"https://arxiv.org/pdf/2311.15222v2.pdf","comment":"8 pages, 3 figures, 7 listings"},{"id":"http://arxiv.org/abs/1911.05467v3","updated":"2023-12-01T08:37:27Z","published":"2019-11-07T06:30:47Z","title":"ChebNet: Efficient and Stable Constructions of Deep Neural Networks with\n Rectified Power Units via Chebyshev Approximations","summary":" In a previous study [B. Li, S. Tang and H. Yu, Commun. Comput. Phy.\n27(2):379-411, 2020], it is shown that deep neural networks built with\nrectified power units (RePU) as activation functions can give better\napproximation for sufficient smooth functions than those built with rectified\nlinear units, by converting polynomial approximations using power series into\ndeep neural networks with optimal complexity and no approximation error.\nHowever, in practice, power series approximations are not easy to obtain due to\nthe associated stability issue. 
In this paper, we propose a new and more stable\nway to construct RePU deep neural networks based on Chebyshev polynomial\napproximations. By using a hierarchical structure of Chebyshev polynomial\napproximation in frequency domain, we obtain efficient and stable deep neural\nnetwork construction, which we call ChebNet. The approximation of smooth\nfunctions by ChebNets is no worse than the approximation by deep RePU nets\nusing power series. On the same time, ChebNets are much more stable. Numerical\nresults show that the constructed ChebNets can be further fine-tuned to obtain\nmuch better results than those obtained by tuning deep RePU nets constructed by\npower series approach. As spectral accuracy is hard to obtain by direct\ntraining of deep neural networks, ChebNets provide a practical way to obtain\nspectral accuracy, it is expected to be useful in real applications that\nrequire efficient approximations of smooth functions.\n","authors":["Shanshan Tang","Bo Li","Haijun Yu"],"pdf_url":"https://arxiv.org/pdf/1911.05467v3.pdf","comment":"6 figures, 3 tables, to appear on Communications in Mathematics and\n Statistics"},{"id":"http://arxiv.org/abs/2304.12693v2","updated":"2023-12-01T08:26:28Z","published":"2023-04-25T09:54:35Z","title":"Phylo2Vec: a vector representation for binary trees","summary":" Binary phylogenetic trees inferred from biological data are central to\nunderstanding the shared evolutionary history of organisms. Inferring the\nplacement of latent nodes in a tree by any optimality criterion (e.g., maximum\nlikelihood) is an NP-hard problem, propelling the development of myriad\nheuristic approaches. Yet, these heuristics often lack a systematic means of\nuniformly sampling random trees or effectively exploring a tree space that\ngrows factorially, which are crucial to optimisation problems such as machine\nlearning. Accordingly, we present Phylo2Vec, a new parsimonious representation\nof a phylogenetic tree. Phylo2Vec maps any binary tree with $n$ leaves to an\ninteger vector of length $n$. We prove that Phylo2Vec is both well-defined and\nbijective to the space of phylogenetic trees. The advantages of Phylo2Vec are\ntwofold: i) easy uniform sampling of binary trees and ii) systematic ability to\ntraverse tree space in very large or small jumps. As a proof of concept, we use\nPhylo2Vec for maximum likelihood inference on five real-world datasets and show\nthat a simple hill climbing-based optimisation efficiently traverses the\nvastness of tree space from a random to an optimal tree.\n","authors":["Matthew J Penn","Neil Scheidwasser","Mark P Khurana","David A Duchêne","Christl A Donnelly","Samir Bhatt"],"pdf_url":"https://arxiv.org/pdf/2304.12693v2.pdf","comment":"35 pages, 8 figures, 2 tables, 2 supplementary figures"},{"id":"http://arxiv.org/abs/2312.00411v1","updated":"2023-12-01T08:21:05Z","published":"2023-12-01T08:21:05Z","title":"A framework for mining lifestyle profiles through multi-dimensional and\n high-order mobility feature clustering","summary":" Human mobility demonstrates a high degree of regularity, which facilitates\nthe discovery of lifestyle profiles. Existing research has yet to fully utilize\nthe regularities embedded in high-order features extracted from human mobility\nrecords in such profiling. This study proposes a progressive feature extraction\nstrategy that mines high-order mobility features from users' moving trajectory\nrecords from the spatial, temporal, and semantic dimensions. 
Specific features\nare extracted such as travel motifs, rhythms decomposed by discrete Fourier\ntransform (DFT) of mobility time series, and vectorized place semantics by\nword2vec, respectively to the three dimensions, and they are further clustered\nto reveal the users' lifestyle characteristics. An experiment using a\ntrajectory dataset of over 500k users in Shenzhen, China yields seven user\nclusters with different lifestyle profiles that can be well interpreted by\ncommon sense. The results suggest the possibility of fine-grained user\nprofiling through cross-order trajectory feature engineering and clustering.\n","authors":["Yeshuo Shu","Gangcheng Zhang","Keyi Liu","Jintong Tang","Liyan Xu"],"pdf_url":"https://arxiv.org/pdf/2312.00411v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.00775v2","updated":"2023-12-01T08:19:03Z","published":"2023-11-01T18:38:31Z","title":"Harnessing machine learning for accurate treatment of overlapping\n opacity species in GCMs","summary":" To understand high precision observations of exoplanets and brown dwarfs, we\nneed detailed and complex general circulation models (GCMs) that incorporate\nhydrodynamics, chemistry, and radiation. In this study, we specifically examine\nthe coupling between chemistry and radiation in GCMs and compare different\nmethods for mixing opacities of different chemical species in the correlated-k\nassumption, when equilibrium chemistry cannot be assumed. We propose a fast\nmachine learning method based on DeepSets (DS), which effectively combines\nindividual correlated-k opacities (k-tables). We evaluate the DS method\nalongside other published methods like adaptive equivalent extinction (AEE) and\nrandom overlap with rebinning and resorting (RORR). We integrate these mixing\nmethods into our GCM (expeRT/MITgcm) and assess their accuracy and performance\nfor the example of the hot Jupiter HD~209458 b. Our findings indicate that the\nDS method is both accurate and efficient for GCM usage, whereas RORR is too\nslow. Additionally, we observe that the accuracy of AEE depends on its specific\nimplementation and may introduce numerical issues in achieving radiative\ntransfer solution convergence. We then apply the DS mixing method in a\nsimplified chemical disequilibrium situation, where we model the rainout of TiO\nand VO, and confirm that the rainout of TiO and VO would hinder the formation\nof a stratosphere. To further expedite the development of consistent\ndisequilibrium chemistry calculations in GCMs, we provide documentation and\ncode for coupling the DS mixing method with correlated-k radiative transfer\nsolvers. The DS method has been extensively tested to be accurate enough for\nGCMs, however, other methods might be needed for accelerating atmospheric\nretrievals.\n","authors":["Aaron David Schneider","Paul Mollière","Gilles Louppe","Ludmila Carone","Uffe Gråe Jørgensen","Leen Decin","Christiane Helling"],"pdf_url":"https://arxiv.org/pdf/2311.00775v2.pdf","comment":"Recommended for publication in A&A, following minor revisions,\n version sent to referee"},{"id":"http://arxiv.org/abs/2307.15090v2","updated":"2023-12-01T08:12:06Z","published":"2023-07-27T00:37:18Z","title":"Understanding Forward Process of Convolutional Neural Network","summary":" This paper reveal the selective rotation in the CNNs' forward processing. It\nelucidates the activation function as a discerning mechanism that unifies and\nquantizes the rotational aspects of the input data. 
Experiments show how this\ndefined methodology reflects the progress network distinguish inputs based on\nstatistical indicators, which can be comprehended or analyzed by applying\nstructured mathematical tools. Our findings also unveil the consistency between\nartificial neural networks and the human brain in their data processing\npattern.\n","authors":["Peixin Tian"],"pdf_url":"https://arxiv.org/pdf/2307.15090v2.pdf","comment":"something wrong in this paper"},{"id":"http://arxiv.org/abs/2310.12609v2","updated":"2023-12-01T08:07:38Z","published":"2023-10-19T09:39:07Z","title":"Denoising Heat-inspired Diffusion with Insulators for Collision Free\n Motion Planning","summary":" Diffusion models have risen as a powerful tool in robotics due to their\nflexibility and multi-modality. While some of these methods effectively address\ncomplex problems, they often depend heavily on inference-time obstacle\ndetection and require additional equipment. Addressing these challenges, we\npresent a method that, during inference time, simultaneously generates only\nreachable goals and plans motions that avoid obstacles, all from a single\nvisual input. Central to our approach is the novel use of a collision-avoiding\ndiffusion kernel for training. Through evaluations against behavior-cloning and\nclassical diffusion models, our framework has proven its robustness. It is\nparticularly effective in multi-modal environments, navigating toward goals and\navoiding unreachable ones blocked by obstacles, while ensuring collision\navoidance.\n","authors":["Junwoo Chang","Hyunwoo Ryu","Jiwoo Kim","Soochul Yoo","Joohwan Seo","Nikhil Prakash","Jongeun Choi","Roberto Horowitz"],"pdf_url":"https://arxiv.org/pdf/2310.12609v2.pdf","comment":"9 pages, 6 figures"},{"id":"http://arxiv.org/abs/2312.00404v1","updated":"2023-12-01T07:54:07Z","published":"2023-12-01T07:54:07Z","title":"A Causality-Aware Pattern Mining Scheme for Group Activity Recognition\n in a Pervasive Sensor Space","summary":" Human activity recognition (HAR) is a key challenge in pervasive computing\nand its solutions have been presented based on various disciplines.\nSpecifically, for HAR in a smart space without privacy and accessibility\nissues, data streams generated by deployed pervasive sensors are leveraged. In\nthis paper, we focus on a group activity by which a group of users perform a\ncollaborative task without user identification and propose an efficient group\nactivity recognition scheme which extracts causality patterns from pervasive\nsensor event sequences generated by a group of users to support as good\nrecognition accuracy as the state-of-the-art graphical model. To filter out\nirrelevant noise events from a given data stream, a set of rules is leveraged\nto highlight causally related events. Then, a pattern-tree algorithm extracts\nfrequent causal patterns by means of a growing tree structure. Based on the\nextracted patterns, a weighted sum-based pattern matching algorithm computes\nthe likelihoods of stored group activities to the given test event sequence by\nmeans of matched event pattern counts for group activity recognition. We\nevaluate the proposed scheme using the data collected from our testbed and\nCASAS datasets where users perform their tasks on a daily basis and validate\nits effectiveness in a real environment. 
Experiment results show that the\nproposed scheme performs higher recognition accuracy and with a small amount of\nruntime overhead than the existing schemes.\n","authors":["Hyunju Kim","Heesuk Son","Dongman Lee"],"pdf_url":"https://arxiv.org/pdf/2312.00404v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.14611v2","updated":"2023-12-01T07:53:28Z","published":"2022-10-26T10:34:20Z","title":"Automatic Diagnosis of Myocarditis Disease in Cardiac MRI Modality using\n Deep Transformers and Explainable Artificial Intelligence","summary":" Myocarditis is a significant cardiovascular disease (CVD) that poses a threat\nto the health of many individuals by causing damage to the myocardium. The\noccurrence of microbes and viruses, including the likes of HIV, plays a crucial\nrole in the development of myocarditis disease (MCD). The images produced\nduring cardiac magnetic resonance imaging (CMRI) scans are low contrast, which\ncan make it challenging to diagnose cardiovascular diseases. In other hand,\nchecking numerous CMRI slices for each CVD patient can be a challenging task\nfor medical doctors. To overcome the existing challenges, researchers have\nsuggested the use of artificial intelligence (AI)-based computer-aided\ndiagnosis systems (CADS). The presented paper outlines a CADS for the detection\nof MCD from CMR images, utilizing deep learning (DL) methods. The proposed CADS\nconsists of several steps, including dataset, preprocessing, feature\nextraction, classification, and post-processing. First, the Z-Alizadeh dataset\nwas selected for the experiments. Subsequently, the CMR images underwent\nvarious preprocessing steps, including denoising, resizing, as well as data\naugmentation (DA) via CutMix and MixUp techniques. In the following, the most\ncurrent deep pre-trained and transformer models are used for feature extraction\nand classification on the CMR images. The findings of our study reveal that\ntransformer models exhibit superior performance in detecting MCD as opposed to\npre-trained architectures. In terms of DL architectures, the Turbulence Neural\nTransformer (TNT) model exhibited impressive accuracy, reaching 99.73%\nutilizing a 10-fold cross-validation approach. Additionally, to pinpoint areas\nof suspicion for MCD in CMRI images, the Explainable-based Grad Cam method was\nemployed.\n","authors":["Mahboobeh Jafari","Afshin Shoeibi","Navid Ghassemi","Jonathan Heras","Sai Ho Ling","Amin Beheshti","Yu-Dong Zhang","Shui-Hua Wang","Roohallah Alizadehsani","Juan M. Gorriz","U. Rajendra Acharya","Hamid Alinejad Rokny"],"pdf_url":"https://arxiv.org/pdf/2210.14611v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.04119v2","updated":"2023-12-01T07:49:26Z","published":"2023-08-08T08:19:43Z","title":"Constructing Custom Thermodynamics Using Deep Learning","summary":" One of the most exciting applications of artificial intelligence (AI) is\nautomated scientific discovery based on previously amassed data, coupled with\nrestrictions provided by known physical principles, including symmetries and\nconservation laws. Such automated hypothesis creation and verification can\nassist scientists in studying complex phenomena, where traditional physical\nintuition may fail. Here we develop a platform based on a generalized Onsager\nprinciple to learn macroscopic dynamical descriptions of arbitrary stochastic\ndissipative systems directly from observations of their microscopic\ntrajectories. 
Our method simultaneously constructs reduced thermodynamic\ncoordinates and interprets the dynamics on these coordinates. We demonstrate\nits effectiveness by studying theoretically and validating experimentally the\nstretching of long polymer chains in an externally applied field. Specifically,\nwe learn three interpretable thermodynamic coordinates and build a dynamical\nlandscape of polymer stretching, including the identification of stable and\ntransition states and the control of the stretching rate. Our general\nmethodology can be used to address a wide range of scientific and technological\napplications.\n","authors":["Xiaoli Chen","Beatrice W. Soh","Zi-En Ooi","Eleonore Vissol-Gaudin","Haijun Yu","Kostya S. Novoselov","Kedar Hippalgaonkar","Qianxiao Li"],"pdf_url":"https://arxiv.org/pdf/2308.04119v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15890v2","updated":"2023-12-01T07:39:02Z","published":"2023-11-27T14:56:47Z","title":"Stability-Informed Initialization of Neural Ordinary Differential\n Equations","summary":" This paper addresses the training of Neural Ordinary Differential Equations\n(neural ODEs), and in particular explores the interplay between numerical\nintegration techniques, stability regions, step size, and initialization\ntechniques. It is shown how the choice of integration technique implicitly\nregularizes the learned model, and how the solver's corresponding stability\nregion affects training and prediction performance. From this analysis, a\nstability-informed parameter initialization technique is introduced. The\neffectiveness of the initialization method is displayed across several learning\nbenchmarks and industrial applications.\n","authors":["Theodor Westny","Arman Mohammadi","Daniel Jung","Erik Frisk"],"pdf_url":"https://arxiv.org/pdf/2311.15890v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00396v1","updated":"2023-12-01T07:38:05Z","published":"2023-12-01T07:38:05Z","title":"GFN-SR: Symbolic Regression with Generative Flow Networks","summary":" Symbolic regression (SR) is an area of interpretable machine learning that\naims to identify mathematical expressions, often composed of simple functions,\nthat best fit in a given set of covariates $X$ and response $y$. In recent\nyears, deep symbolic regression (DSR) has emerged as a popular method in the\nfield by leveraging deep reinforcement learning to solve the complicated\ncombinatorial search problem. In this work, we propose an alternative framework\n(GFN-SR) to approach SR with deep learning. We model the construction of an\nexpression tree as traversing through a directed acyclic graph (DAG) so that\nGFlowNet can learn a stochastic policy to generate such trees sequentially.\nEnhanced with an adaptive reward baseline, our method is capable of generating\na diverse set of best-fitting expressions. 
Notably, we observe that GFN-SR\noutperforms other SR algorithms in noisy data regimes, owing to its ability to\nlearn a distribution of rewards over a space of candidate solutions.\n","authors":["Sida Li","Ioana Marinescu","Sebastian Musslick"],"pdf_url":"https://arxiv.org/pdf/2312.00396v1.pdf","comment":"Accepted by the NeurIPS 2023 AI4Science Workshop"},{"id":"http://arxiv.org/abs/2307.03675v2","updated":"2023-12-01T07:32:20Z","published":"2023-07-07T15:45:05Z","title":"GeoPhy: Differentiable Phylogenetic Inference via Geometric Gradients of\n Tree Topologies","summary":" Phylogenetic inference, grounded in molecular evolution models, is essential\nfor understanding the evolutionary relationships in biological data. Accounting\nfor the uncertainty of phylogenetic tree variables, which include tree\ntopologies and evolutionary distances on branches, is crucial for accurately\ninferring species relationships from molecular data and tasks requiring\nvariable marginalization. Variational Bayesian methods are key to developing\nscalable, practical models; however, it remains challenging to conduct\nphylogenetic inference without restricting the combinatorially vast number of\npossible tree topologies. In this work, we introduce a novel, fully\ndifferentiable formulation of phylogenetic inference that leverages a unique\nrepresentation of topological distributions in continuous geometric spaces.\nThrough practical considerations on design spaces and control variates for\ngradient estimations, our approach, GeoPhy, enables variational inference\nwithout limiting the topological candidates. In experiments using real\nbenchmark datasets, GeoPhy significantly outperformed other approximate\nBayesian methods that considered whole topologies.\n","authors":["Takahiro Mimori","Michiaki Hamada"],"pdf_url":"https://arxiv.org/pdf/2307.03675v2.pdf","comment":"NeurIPS 2023 camera ready"},{"id":"http://arxiv.org/abs/2312.00388v1","updated":"2023-12-01T07:19:42Z","published":"2023-12-01T07:19:42Z","title":"LinguaLinked: A Distributed Large Language Model Inference System for\n Mobile Devices","summary":" Deploying Large Language Models (LLMs) locally on mobile devices presents a\nsignificant challenge due to their extensive memory requirements. In this\npaper, we introduce LinguaLinked, a system for decentralized, distributed LLM\ninference on mobile devices. LinguaLinked enables collaborative execution of\nthe inference task across multiple trusted devices. LinguaLinked ensures data\nprivacy by processing information locally. LinguaLinked uses three key\nstrategies. First, an optimized model assignment technique segments LLMs and\nuses linear optimization to align segments with each device's capabilities.\nSecond, an optimized data transmission mechanism ensures efficient and\nstructured data flow between model segments while also maintaining the\nintegrity of the original model structure. Finally, LinguaLinked incorporates a\nruntime load balancer that actively monitors and redistributes tasks among\nmobile devices to prevent bottlenecks, enhancing the system's overall\nefficiency and responsiveness. We demonstrate that LinguaLinked facilitates\nefficient LLM inference while maintaining consistent throughput and minimal\nlatency through extensive testing across various mobile devices, from high-end\nto low-end Android devices. 
In our evaluations, compared to the baseline,\nLinguaLinked achieves an inference performance acceleration of $1.11\\times$ to\n$1.61\\times$ in single-threaded settings, $1.73\\times$ to $2.65\\times$ with\nmulti-threading. Additionally, runtime load balancing yields an overall\ninference acceleration of $1.29\\times$ to $1.32\\times$.\n","authors":["Junchen Zhao","Yurun Song","Simeng Liu","Ian G. Harris","Sangeetha Abdu Jyothi"],"pdf_url":"https://arxiv.org/pdf/2312.00388v1.pdf","comment":"16 pages, 8 figures"},{"id":"http://arxiv.org/abs/2312.00386v1","updated":"2023-12-01T07:15:51Z","published":"2023-12-01T07:15:51Z","title":"Local monotone operator learning using non-monotone operators: MnM-MOL","summary":" The recovery of magnetic resonance (MR) images from undersampled measurements\nis a key problem that has seen extensive research in recent years. Unrolled\napproaches, which rely on end-to-end training of convolutional neural network\n(CNN) blocks within iterative reconstruction algorithms, offer state-of-the-art\nperformance. These algorithms require a large amount of memory during training,\nmaking them difficult to employ in high-dimensional applications. Deep\nequilibrium (DEQ) models and the recent monotone operator learning (MOL)\napproach were introduced to eliminate the need for unrolling, thus reducing the\nmemory demand during training. Both approaches require a Lipschitz constraint\non the network to ensure that the forward and backpropagation iterations\nconverge. Unfortunately, the constraint often results in reduced performance\ncompared to unrolled methods. The main focus of this work is to relax the\nconstraint on the CNN block in two different ways. Inspired by\nconvex-non-convex regularization strategies, we now impose the monotone\nconstraint on the sum of the gradient of the data term and the CNN block,\nrather than constrain the CNN itself to be a monotone operator. This approach\nenables the CNN to learn possibly non-monotone score functions, which can\ntranslate to improved performance. In addition, we only restrict the operator\nto be monotone in a local neighborhood around the image manifold. Our\ntheoretical results show that the proposed algorithm is guaranteed to converge\nto the fixed point and that the solution is robust to input perturbations,\nprovided that it is initialized close to the true solution. Our empirical\nresults show that the relaxed constraints translate to improved performance and\nthat the approach enjoys robustness to input perturbations similar to MOL.\n","authors":["Maneesh John","Jyothi Rikhab Chand","Mathews Jacob"],"pdf_url":"https://arxiv.org/pdf/2312.00386v1.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2308.00629v4","updated":"2023-12-01T07:15:04Z","published":"2023-08-01T15:56:24Z","title":"Hessian-Aware Bayesian Optimization for Decision Making Systems","summary":" Many approaches for optimizing decision making systems rely on gradient based\nmethods requiring informative feedback from the environment. However, in the\ncase where such feedback is sparse or uninformative, such approaches may result\nin poor performance. Derivative-free approaches such as Bayesian Optimization\nmitigate the dependency on the quality of gradient feedback, but are known to\nscale poorly in the high-dimension setting of complex decision making systems.\nThis problem is exacerbated if the system requires interactions between several\nactors cooperating to accomplish a shared goal. 
To address the dimensionality\nchallenge, we propose a compact multi-layered architecture modeling the\ndynamics of actor interactions through the concept of role. We introduce\nHessian-aware Bayesian Optimization to efficiently optimize the multi-layered\narchitecture parameterized by a large number of parameters, and give the first\nimproved regret bound in additive high-dimensional Bayesian Optimization since\nMutny & Krause (2018). Our approach shows strong empirical results under\nmalformed or sparse reward.\n","authors":["Mohit Rajpal","Lac Gia Tran","Yehong Zhang","Bryan Kian Hsiang Low"],"pdf_url":"https://arxiv.org/pdf/2308.00629v4.pdf","comment":"Fixed a typo"},{"id":"http://arxiv.org/abs/2309.01885v2","updated":"2023-12-01T07:04:05Z","published":"2023-09-05T01:39:09Z","title":"QuantEase: Optimization-based Quantization for Language Models","summary":" With the rising popularity of Large Language Models (LLMs), there has been an\nincreasing interest in compression techniques that enable their efficient\ndeployment. This study focuses on the Post-Training Quantization (PTQ) of LLMs.\nDrawing from recent advances, our work introduces QuantEase, a layer-wise\nquantization framework where individual layers undergo separate quantization.\nThe problem is framed as a discrete-structured non-convex optimization,\nprompting the development of algorithms rooted in Coordinate Descent (CD)\ntechniques. These CD-based methods provide high-quality solutions to the\ncomplex non-convex layer-wise quantization problems. Notably, our CD-based\napproach features straightforward updates, relying solely on matrix and vector\noperations, circumventing the need for matrix inversion or decomposition. We\nalso explore an outlier-aware variant of our approach, allowing for retaining\nsignificant weights (outliers) with complete precision. Our proposal attains\nstate-of-the-art performance in terms of perplexity and zero-shot accuracy in\nempirical evaluations across various LLMs and datasets, with relative\nimprovements up to 15% over methods such as GPTQ. Leveraging careful linear\nalgebra optimizations, QuantEase can quantize models like Falcon-180B on a\nsingle NVIDIA A100 GPU in $\\sim$3 hours. Particularly noteworthy is our\noutlier-aware algorithm's capability to achieve near or sub-3-bit quantization\nof LLMs with an acceptable drop in accuracy, obviating the need for non-uniform\nquantization or grouping techniques, improving upon methods such as SpQR by up\nto two times in terms of perplexity.\n","authors":["Kayhan Behdin","Ayan Acharya","Aman Gupta","Qingquan Song","Siyu Zhu","Sathiya Keerthi","Rahul Mazumder"],"pdf_url":"https://arxiv.org/pdf/2309.01885v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00379v1","updated":"2023-12-01T06:57:11Z","published":"2023-12-01T06:57:11Z","title":"Optimal Sample Complexity of Contrastive Learning","summary":" Contrastive learning is a highly successful technique for learning\nrepresentations of data from labeled tuples, specifying the distance relations\nwithin the tuple. We study the sample complexity of contrastive learning, i.e.\nthe minimum number of labeled tuples sufficient for getting high generalization\naccuracy. We give tight bounds on the sample complexity in a variety of\nsettings, focusing on arbitrary distance functions, both general\n$\\ell_p$-distances, and tree metrics. 
Our main result is an (almost) optimal\nbound on the sample complexity of learning $\\ell_p$-distances for integer $p$.\nFor any $p \\ge 1$ we show that $\\tilde \\Theta(\\min(nd,n^2))$ labeled tuples are\nnecessary and sufficient for learning $d$-dimensional representations of\n$n$-point datasets. Our results hold for an arbitrary distribution of the input\nsamples and are based on giving the corresponding bounds on the\nVapnik-Chervonenkis/Natarajan dimension of the associated problems. We further\nshow that the theoretical bounds on sample complexity obtained via VC/Natarajan\ndimension can have strong predictive power for experimental results, in\ncontrast with the folklore belief about a substantial gap between the\nstatistical learning theory and the practice of deep learning.\n","authors":["Noga Alon","Dmitrii Avdiukhin","Dor Elboim","Orr Fischer","Grigory Yaroslavtsev"],"pdf_url":"https://arxiv.org/pdf/2312.00379v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06625v2","updated":"2023-12-01T06:47:56Z","published":"2023-10-10T13:44:09Z","title":"iTransformer: Inverted Transformers Are Effective for Time Series\n Forecasting","summary":" The recent boom of linear forecasting models questions the ongoing passion\nfor architectural modifications of Transformer-based forecasters. These\nforecasters leverage Transformers to model the global dependencies over\ntemporal tokens of time series, with each token formed by multiple variates of\nthe same timestamp. However, Transformers are challenged in forecasting series\nwith larger lookback windows due to performance degradation and computation\nexplosion. Besides, the embedding for each temporal token fuses multiple\nvariates that represent potential delayed events and distinct physical\nmeasurements, which may fail in learning variate-centric representations and\nresult in meaningless attention maps. In this work, we reflect on the competent\nduties of Transformer components and repurpose the Transformer architecture\nwithout any modification to the basic components. We propose iTransformer that\nsimply applies the attention and feed-forward network on the inverted\ndimensions. Specifically, the time points of individual series are embedded\ninto variate tokens which are utilized by the attention mechanism to capture\nmultivariate correlations; meanwhile, the feed-forward network is applied for\neach variate token to learn nonlinear representations. The iTransformer model\nachieves state-of-the-art on challenging real-world datasets, which further\nempowers the Transformer family with promoted performance, generalization\nability across different variates, and better utilization of arbitrary lookback\nwindows, making it a nice alternative as the fundamental backbone of time\nseries forecasting.\n","authors":["Yong Liu","Tengge Hu","Haoran Zhang","Haixu Wu","Shiyu Wang","Lintao Ma","Mingsheng Long"],"pdf_url":"https://arxiv.org/pdf/2310.06625v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.10847v2","updated":"2023-12-01T06:38:18Z","published":"2023-11-17T20:07:54Z","title":"Token-Level Adaptation of LoRA Adapters for Downstream Task\n Generalization","summary":" This paper introduces a method for adapting LoRA adapters in smaller-sized\nlanguage models to arbitrary downstream tasks. Unlike standard\nmixture-of-expert architectures, our method employs a gradient-free routing\nfunction to choose a weighted combination of experts without increasing the\ncompute requirements for training or inference. 
The results show that\ntoken-level adaptation of LoRA adapters outperforms the base Llama-2-7b model\nacross mathematical (GSM8K), scientific (ARC-Challenge), reading comprehension\n(SQuAD), and coding (CodeAlpaca-20k) tasks. Further evaluations also show that\nthe average performance of token-level adaptation outperforms individual models\nfine-tuned for each of the tasks with the best performance observed in\nadaptation of every-other token during inference. The code for this study is\nmade available through a public repository.\n","authors":["Joshua Belofsky"],"pdf_url":"https://arxiv.org/pdf/2311.10847v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00373v1","updated":"2023-12-01T06:33:39Z","published":"2023-12-01T06:33:39Z","title":"Streaming Bayesian Modeling for predicting Fat-Tailed Customer Lifetime\n Value","summary":" We develop an online learning MCMC approach applicable for hierarchical\nbayesian models and GLMS. We also develop a fat-tailed LTV model that\ngeneralizes over several kinds of fat and thin tails. We demonstrate both\ndevelopments on commercial LTV data from a large mobile app.\n","authors":["Alexey V. Calabourdin","Konstantin A. Aksenov"],"pdf_url":"https://arxiv.org/pdf/2312.00373v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2212.02003v2","updated":"2023-12-01T06:23:15Z","published":"2022-12-05T03:26:08Z","title":"Bayesian Learning with Information Gain Provably Bounds Risk for a\n Robust Adversarial Defense","summary":" We present a new algorithm to learn a deep neural network model robust\nagainst adversarial attacks. Previous algorithms demonstrate an adversarially\ntrained Bayesian Neural Network (BNN) provides improved robustness. We\nrecognize the adversarial learning approach for approximating the multi-modal\nposterior distribution of a Bayesian model can lead to mode collapse;\nconsequently, the model's achievements in robustness and performance are\nsub-optimal. Instead, we first propose preventing mode collapse to better\napproximate the multi-modal posterior distribution. Second, based on the\nintuition that a robust model should ignore perturbations and only consider the\ninformative content of the input, we conceptualize and formulate an information\ngain objective to measure and force the information learned from both benign\nand adversarial training instances to be similar. Importantly. we prove and\ndemonstrate that minimizing the information gain objective allows the\nadversarial risk to approach the conventional empirical risk. We believe our\nefforts provide a step toward a basis for a principled method of adversarially\ntraining BNNs. Our model demonstrate significantly improved robustness--up to\n20%--compared with adversarial training and Adv-BNN under PGD attacks with\n0.035 distortion on both CIFAR-10 and STL-10 datasets.\n","authors":["Bao Gia Doan","Ehsan Abbasnejad","Javen Qinfeng Shi","Damith C. Ranasinghe"],"pdf_url":"https://arxiv.org/pdf/2212.02003v2.pdf","comment":"Published at ICML 2022. 
Code is available at\n https://github.com/baogiadoan/IG-BNN"},{"id":"http://arxiv.org/abs/2309.07867v3","updated":"2023-12-01T06:18:22Z","published":"2023-09-14T17:14:26Z","title":"Beta Diffusion","summary":" We introduce beta diffusion, a novel generative modeling method that\nintegrates demasking and denoising to generate data within bounded ranges.\nUsing scaled and shifted beta distributions, beta diffusion utilizes\nmultiplicative transitions over time to create both forward and reverse\ndiffusion processes, maintaining beta distributions in both the forward\nmarginals and the reverse conditionals, given the data at any point in time.\nUnlike traditional diffusion-based generative models relying on additive\nGaussian noise and reweighted evidence lower bounds (ELBOs), beta diffusion is\nmultiplicative and optimized with KL-divergence upper bounds (KLUBs) derived\nfrom the convexity of the KL divergence. We demonstrate that the proposed KLUBs\nare more effective for optimizing beta diffusion compared to negative ELBOs,\nwhich can also be derived as the KLUBs of the same KL divergence with its two\narguments swapped. The loss function of beta diffusion, expressed in terms of\nBregman divergence, further supports the efficacy of KLUBs for optimization.\nExperimental results on both synthetic data and natural images demonstrate the\nunique capabilities of beta diffusion in generative modeling of range-bounded\ndata and validate the effectiveness of KLUBs in optimizing diffusion models,\nthereby making them valuable additions to the family of diffusion-based\ngenerative models and the optimization techniques used to train them.\n","authors":["Mingyuan Zhou","Tianqi Chen","Zhendong Wang","Huangjie Zheng"],"pdf_url":"https://arxiv.org/pdf/2309.07867v3.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.00364v1","updated":"2023-12-01T06:11:14Z","published":"2023-12-01T06:11:14Z","title":"Benchmarking Multi-Domain Active Learning on Image Classification","summary":" Active learning aims to enhance model performance by strategically labeling\ninformative data points. While extensively studied, its effectiveness on\nlarge-scale, real-world datasets remains underexplored. Existing research\nprimarily focuses on single-source data, ignoring the multi-domain nature of\nreal-world data. We introduce a multi-domain active learning benchmark to\nbridge this gap. Our benchmark demonstrates that traditional single-domain\nactive learning strategies are often less effective than random selection in\nmulti-domain scenarios. We also introduce CLIP-GeoYFCC, a novel large-scale\nimage dataset built around geographical domains, in contrast to existing\ngenre-based domain datasets. Analysis on our benchmark shows that all\nmulti-domain strategies exhibit significant tradeoffs, with no strategy\noutperforming across all datasets or all metrics, emphasizing the need for\nfuture research.\n","authors":["Jiayi Li","Rohan Taori","Tatsunori B. Hashimoto"],"pdf_url":"https://arxiv.org/pdf/2312.00364v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00362v1","updated":"2023-12-01T05:59:08Z","published":"2023-12-01T05:59:08Z","title":"Dancing with Images: Video Distillation via Static-Dynamic\n Disentanglement","summary":" Recently, dataset distillation has paved the way towards efficient machine\nlearning, especially for image datasets. However, the distillation for videos,\ncharacterized by an exclusive temporal dimension, remains an underexplored\ndomain. 
In this work, we provide the first systematic study of video\ndistillation and introduce a taxonomy to categorize temporal compression. Our\ninvestigation reveals that the temporal information is usually not well learned\nduring distillation , and the temporal dimension of synthetic data contributes\nlittle. The observations motivate our unified framework of disentangling the\ndynamic and static information in the videos. It first distills the videos into\nstill images as static memory and then compensates the dynamic and motion\ninformation with a learnable dynamic memory block. Our method achieves\nstate-of-the-art on video datasets at different scales, with notably smaller\nstorage expenditure. Our code will be publicly available.\n","authors":["Ziyu Wang","Yue Xu","Cewu Lu","Yong-Lu Li"],"pdf_url":"https://arxiv.org/pdf/2312.00362v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00359v1","updated":"2023-12-01T05:38:17Z","published":"2023-12-01T05:38:17Z","title":"Temperature Balancing, Layer-wise Weight Analysis, and Neural Network\n Training","summary":" Regularization in modern machine learning is crucial, and it can take various\nforms in algorithmic design: training set, model family, error function,\nregularization terms, and optimizations. In particular, the learning rate,\nwhich can be interpreted as a temperature-like parameter within the statistical\nmechanics of learning, plays a crucial role in neural network training. Indeed,\nmany widely adopted training strategies basically just define the decay of the\nlearning rate over time. This process can be interpreted as decreasing a\ntemperature, using either a global learning rate (for the entire model) or a\nlearning rate that varies for each parameter. This paper proposes TempBalance,\na straightforward yet effective layer-wise learning rate method. TempBalance is\nbased on Heavy-Tailed Self-Regularization (HT-SR) Theory, an approach which\ncharacterizes the implicit self-regularization of different layers in trained\nmodels. We demonstrate the efficacy of using HT-SR-motivated metrics to guide\nthe scheduling and balancing of temperature across all network layers during\nmodel training, resulting in improved performance during testing. We implement\nTempBalance on CIFAR10, CIFAR100, SVHN, and TinyImageNet datasets using\nResNets, VGGs, and WideResNets with various depths and widths. Our results show\nthat TempBalance significantly outperforms ordinary SGD and carefully-tuned\nspectral norm regularization. We also show that TempBalance outperforms a\nnumber of state-of-the-art optimizers and learning rate schedulers.\n","authors":["Yefan Zhou","Tianyu Pang","Keqin Liu","Charles H. Martin","Michael W. Mahoney","Yaoqing Yang"],"pdf_url":"https://arxiv.org/pdf/2312.00359v1.pdf","comment":"NeurIPS 2023 Spotlight, first two authors contributed equally"},{"id":"http://arxiv.org/abs/2312.00358v1","updated":"2023-12-01T05:28:19Z","published":"2023-12-01T05:28:19Z","title":"Impact of Data Augmentation on QCNNs","summary":" In recent years, Classical Convolutional Neural Networks (CNNs) have been\napplied for image recognition successfully. Quantum Convolutional Neural\nNetworks (QCNNs) are proposed as a novel generalization to CNNs by using\nquantum mechanisms. The quantum mechanisms lead to an efficient training\nprocess in QCNNs by reducing the size of input from $N$ to $log_2N$. This paper\nimplements and compares both CNNs and QCNNs by testing losses and prediction\naccuracy on three commonly used datasets. 
The datasets include the MNIST\nhand-written digits, Fashion MNIST and cat/dog face images. Additionally, data\naugmentation (DA), a technique commonly used in CNNs to improve the performance\nof classification by generating similar images based on original inputs, is\nalso implemented in QCNNs. Surprisingly, the results showed that data\naugmentation didn't improve QCNNs performance. The reasons and logic behind\nthis result are discussed, hoping to expand our understanding of Quantum\nmachine learning theory.\n","authors":["Leting Zhouli","Peiyong Wang","Udaya Parampalli"],"pdf_url":"https://arxiv.org/pdf/2312.00358v1.pdf","comment":"14 pages, 9 figures"},{"id":"http://arxiv.org/abs/2312.00357v1","updated":"2023-12-01T05:27:29Z","published":"2023-12-01T05:27:29Z","title":"A Generalizable Deep Learning System for Cardiac MRI","summary":" Cardiac MRI allows for a comprehensive assessment of myocardial structure,\nfunction, and tissue characteristics. Here we describe a foundational vision\nsystem for cardiac MRI, capable of representing the breadth of human\ncardiovascular disease and health. Our deep learning model is trained via\nself-supervised contrastive learning, by which visual concepts in cine-sequence\ncardiac MRI scans are learned from the raw text of the accompanying radiology\nreports. We train and evaluate our model on data from four large academic\nclinical institutions in the United States. We additionally showcase the\nperformance of our models on the UK BioBank, and two additional publicly\navailable external datasets. We explore emergent zero-shot capabilities of our\nsystem, and demonstrate remarkable performance across a range of tasks;\nincluding the problem of left ventricular ejection fraction regression, and the\ndiagnosis of 35 different conditions such as cardiac amyloidosis and\nhypertrophic cardiomyopathy. We show that our deep learning system is capable\nof not only understanding the staggering complexity of human cardiovascular\ndisease, but can be directed towards clinical problems of interest yielding\nimpressive, clinical grade diagnostic accuracy with a fraction of the training\ndata typically required for such tasks.\n","authors":["Rohan Shad","Cyril Zakka","Dhamanpreet Kaur","Robyn Fong","Ross Warren Filice","John Mongan","Kimberly Kalianos","Nishith Khandwala","David Eng","Matthew Leipzig","Walter Witschey","Alejandro de Feria","Victor Ferrari","Euan Ashley","Michael A. Acker","Curtis Langlotz","William Hiesinger"],"pdf_url":"https://arxiv.org/pdf/2312.00357v1.pdf","comment":"21 page main manuscript, 4 figures. Supplementary Appendix and code\n will be made available on publication"},{"id":"http://arxiv.org/abs/2311.12078v2","updated":"2023-12-01T05:24:22Z","published":"2023-11-20T05:58:05Z","title":"Fast Controllable Diffusion Models for Undersampled MRI Reconstruction","summary":" Supervised deep learning methods have shown promise in undersampled Magnetic\nResonance Imaging (MRI) reconstruction, but their requirement for paired data\nlimits their generalizability to the diverse MRI acquisition parameters.\nRecently, unsupervised controllable generative diffusion models have been\napplied to undersampled MRI reconstruction, without paired data or model\nretraining for different MRI acquisitions. However, diffusion models are\ngenerally slow in sampling and state-of-the-art acceleration techniques can\nlead to sub-optimal results when directly applied to the controllable\ngeneration process. 
This study introduces a new algorithm called\nPredictor-Projector-Noisor (PPN), which enhances and accelerates controllable\ngeneration of diffusion models for undersampled MRI reconstruction. Our results\ndemonstrate that PPN produces high-fidelity MR images that conform to\nundersampled k-space measurements with significantly shorter reconstruction\ntime than other controllable sampling methods. In addition, the unsupervised\nPPN accelerated diffusion models are adaptable to different MRI acquisition\nparameters, making them more practical for clinical use than supervised\nlearning techniques.\n","authors":["Wei Jiang","Zhuang Xiong","Feng Liu","Nan Ye","Hongfu Sun"],"pdf_url":"https://arxiv.org/pdf/2311.12078v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16713v2","updated":"2023-12-01T05:24:15Z","published":"2023-08-18T06:30:18Z","title":"UAV-assisted Semantic Communication with Hybrid Action Reinforcement\n Learning","summary":" In this paper, we aim to explore the use of uplink semantic communications\nwith the assistance of UAV in order to improve data collection effiicency for\nmetaverse users in remote areas. To reduce the time for uplink data collection\nwhile balancing the trade-off between reconstruction quality and computational\nenergy cost, we propose a hybrid action reinforcement learning (RL) framework\nto make decisions on semantic model scale, channel allocation, transmission\npower, and UAV trajectory. The variables are classified into discrete type and\ncontinuous type, which are optimized by two different RL agents to generate the\ncombined action. Simulation results indicate that the proposed hybrid action\nreinforcement learning framework can effectively improve the efficiency of\nuplink semantic data collection under different parameter settings and\noutperforms the benchmark scenarios.\n","authors":["Peiyuan Si","Jun Zhao","Kwok-Yan Lam","Qing Yang"],"pdf_url":"https://arxiv.org/pdf/2309.16713v2.pdf","comment":"This paper appears in IEEE Global Communications Conference\n (GLOBECOM) 2023"},{"id":"http://arxiv.org/abs/2312.00356v1","updated":"2023-12-01T05:18:35Z","published":"2023-12-01T05:18:35Z","title":"Transfer learning for predicting source terms of principal component\n transport in chemically reactive flow","summary":" The objective of this study is to evaluate whether the number of requisite\ntraining samples can be reduced with the use of various transfer learning\nmodels for predicting, for example, the chemical source terms of the\ndata-driven reduced-order model that represents the homogeneous ignition\nprocess of a hydrogen/air mixture. Principal component analysis is applied to\nreduce the dimensionality of the hydrogen/air mixture in composition space.\nArtificial neural networks (ANNs) are used to tabulate the reaction rates of\nprincipal components, and subsequently, a system of ordinary differential\nequations is solved. As the number of training samples decreases at the target\ntask (i.e.,for T0 > 1000 K and various phi), the reduced-order model fails to\npredict the ignition evolution of a hydrogen/air mixture. Three transfer\nlearning strategies are then applied to the training of the ANN model with a\nsparse dataset. The performance of the reduced-order model with a sparse\ndataset is found to be remarkably enhanced if the training of the ANN model is\nrestricted by a regularization term that controls the degree of knowledge\ntransfer from source to target tasks. 
To this end, a novel transfer learning\nmethod is introduced, parameter control via partial initialization and\nregularization (PaPIR), whereby the amount of knowledge transferred is\nsystemically adjusted for the initialization and regularization of the ANN\nmodel in the target task. It is found that an additional performance gain can\nbe achieved by changing the initialization scheme of the ANN model in the\ntarget task when the task similarity between source and target tasks is\nrelatively low.\n","authors":["Ki Sung Jung","Tarek Echekki","Jacqueline H. Chen","Mohammad Khalil"],"pdf_url":"https://arxiv.org/pdf/2312.00356v1.pdf","comment":"41 pages, 14 figures"},{"id":"http://arxiv.org/abs/2312.00352v1","updated":"2023-12-01T05:00:02Z","published":"2023-12-01T05:00:02Z","title":"Quantum Kernel t-Distributed Stochastic Neighbor Embedding","summary":" Data visualization is important in understanding the characteristics of data\nthat are difficult to see directly. It is used to visualize loss landscapes and\noptimization trajectories to analyze optimization performance. Popular\noptimization analysis is performed by visualizing a loss landscape around the\nreached local or global minimum using principal component analysis. However,\nthis visualization depends on the variational parameters of a quantum circuit\nrather than quantum states, which makes it difficult to understand the\nmechanism of optimization process through the property of quantum states. Here,\nwe propose a quantum data visualization method using quantum kernels, which\nenables us to offer fast and highly accurate visualization of quantum states.\nIn our numerical experiments, we visualize hand-written digits dataset and\napply $k$-nearest neighbor algorithm to the low-dimensional data to\nquantitatively evaluate our proposed method compared with a classical kernel\nmethod. As a result, our proposed method achieves comparable accuracy to the\nstate-of-the-art classical kernel method, meaning that the proposed\nvisualization method based on quantum machine learning does not degrade the\nseparability of the input higher dimensional data. Furthermore, we visualize\nthe optimization trajectories of finding the ground states of transverse field\nIsing model and successfully find the trajectory characteristics. Since quantum\nstates are higher dimensional objects that can only be seen via observables,\nour visualization method, which inherits the similarity of quantum data, would\nbe useful in understanding the behavior of quantum circuits and algorithms.\n","authors":["Yoshiaki Kawase","Kosuke Mitarai","Keisuke Fujii"],"pdf_url":"https://arxiv.org/pdf/2312.00352v1.pdf","comment":"10pages, 8 figures, 2 tables"},{"id":"http://arxiv.org/abs/2312.00344v1","updated":"2023-12-01T04:40:47Z","published":"2023-12-01T04:40:47Z","title":"TRC: Trust Region Conditional Value at Risk for Safe Reinforcement\n Learning","summary":" As safety is of paramount importance in robotics, reinforcement learning that\nreflects safety, called safe RL, has been studied extensively. In safe RL, we\naim to find a policy which maximizes the desired return while satisfying the\ndefined safety constraints. There are various types of constraints, among which\nconstraints on conditional value at risk (CVaR) effectively lower the\nprobability of failures caused by high costs since CVaR is a conditional\nexpectation obtained above a certain percentile. In this paper, we propose a\ntrust region-based safe RL method with CVaR constraints, called TRC. 
We first\nderive the upper bound on CVaR and then approximate the upper bound in a\ndifferentiable form in a trust region. Using this approximation, a subproblem\nto get policy gradients is formulated, and policies are trained by iteratively\nsolving the subproblem. TRC is evaluated through safe navigation tasks in\nsimulations with various robots and a sim-to-real environment with a Jackal\nrobot from Clearpath. Compared to other safe RL methods, the performance is\nimproved by 1.93 times while the constraints are satisfied in all experiments.\n","authors":["Dohyeong Kim","Songhwai Oh"],"pdf_url":"https://arxiv.org/pdf/2312.00344v1.pdf","comment":"RA-L and ICRA 2022"},{"id":"http://arxiv.org/abs/2312.00342v1","updated":"2023-12-01T04:29:19Z","published":"2023-12-01T04:29:19Z","title":"Efficient Off-Policy Safe Reinforcement Learning Using Trust Region\n Conditional Value at Risk","summary":" This paper aims to solve a safe reinforcement learning (RL) problem with risk\nmeasure-based constraints. As risk measures, such as conditional value at risk\n(CVaR), focus on the tail distribution of cost signals, constraining risk\nmeasures can effectively prevent a failure in the worst case. An on-policy safe\nRL method, called TRC, deals with a CVaR-constrained RL problem using a trust\nregion method and can generate policies with almost zero constraint violations\nwith high returns. However, to achieve outstanding performance in complex\nenvironments and satisfy safety constraints quickly, RL methods are required to\nbe sample efficient. To this end, we propose an off-policy safe RL method with\nCVaR constraints, called off-policy TRC. If off-policy data from replay buffers\nis directly used to train TRC, the estimation error caused by the\ndistributional shift results in performance degradation. To resolve this issue,\nwe propose novel surrogate functions, in which the effect of the distributional\nshift can be reduced, and introduce an adaptive trust-region constraint to\nensure a policy not to deviate far from replay buffers. The proposed method has\nbeen evaluated in simulation and real-world environments and satisfied safety\nconstraints within a few steps while achieving high returns even in complex\nrobotic tasks.\n","authors":["Dohyeong Kim","Songhwai Oh"],"pdf_url":"https://arxiv.org/pdf/2312.00342v1.pdf","comment":"RA-L and IROS 2022"},{"id":"http://arxiv.org/abs/2312.00336v1","updated":"2023-12-01T04:10:00Z","published":"2023-12-01T04:10:00Z","title":"Hypergraph Node Representation Learning with One-Stage Message Passing","summary":" Hypergraphs as an expressive and general structure have attracted\nconsiderable attention from various research domains. Most existing hypergraph\nnode representation learning techniques are based on graph neural networks, and\nthus adopt the two-stage message passing paradigm (i.e. node -> hyperedge ->\nnode). This paradigm only focuses on local information propagation and does not\neffectively take into account global information, resulting in less optimal\nrepresentations. Our theoretical analysis of representative two-stage message\npassing methods shows that, mathematically, they model different ways of local\nmessage passing through hyperedges, and can be unified into one-stage message\npassing (i.e. node -> node). However, they still only model local information.\nMotivated by this theoretical analysis, we propose a novel one-stage message\npassing paradigm to model both global and local information propagation for\nhypergraphs. 
We integrate this paradigm into HGraphormer, a Transformer-based\nframework for hypergraph node representation learning. HGraphormer injects the\nhypergraph structure information (local information) into Transformers (global\ninformation) by combining the attention matrix and hypergraph Laplacian.\nExtensive experiments demonstrate that HGraphormer outperforms recent\nhypergraph learning methods on five representative benchmark datasets on the\nsemi-supervised hypernode classification task, setting new state-of-the-art\nperformance, with accuracy improvements between 2.52% and 6.70%. Our code and\ndatasets are available.\n","authors":["Shilin Qu","Weiqing Wang","Yuan-Fang Li","Xin Zhou","Fajie Yuan"],"pdf_url":"https://arxiv.org/pdf/2312.00336v1.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2308.11645v2","updated":"2023-12-01T03:32:25Z","published":"2023-08-17T03:46:23Z","title":"Neurological Prognostication of Post-Cardiac-Arrest Coma Patients Using\n EEG Data: A Dynamic Survival Analysis Framework with Competing Risks","summary":" Patients resuscitated from cardiac arrest who enter a coma are at high risk\nof death. Forecasting neurological outcomes of these patients (the task of\nneurological prognostication) could help with treatment decisions. In this\npaper, we propose, to the best of our knowledge, the first dynamic framework\nfor neurological prognostication of post-cardiac-arrest comatose patients using\nEEG data: our framework makes predictions for a patient over time as more EEG\ndata become available, and different training patients' available EEG time\nseries could vary in length. Predictions are phrased in terms of either\ntime-to-event outcomes (time-to-awakening or time-to-death) or as the patient's\nprobability of awakening or of dying across multiple time horizons. Our\nframework uses any dynamic survival analysis model that supports competing\nrisks in the form of estimating patient-level cumulative incidence functions.\nWe consider three competing risks as to what happens first to a patient:\nawakening, being withdrawn from life-sustaining therapies (and thus\ndeterministically dying), or dying (by other causes). We demonstrate our\nframework by benchmarking three existing dynamic survival analysis models that\nsupport competing risks on a real dataset of 922 patients. Our main\nexperimental findings are that: (1) the classical Fine and Gray model which\nonly uses a patient's static features and summary statistics from the patient's\nlatest hour's worth of EEG data is highly competitive, achieving accuracy\nscores as high as the recently developed Dynamic-DeepHit model that uses\nsubstantially more of the patient's EEG data; and (2) in an ablation study, we\nshow that our choice of modeling three competing risks results in a model that\nis at least as accurate while learning more information than simpler models\n(using two competing risks or a standard survival analysis setup with no\ncompeting risks).\n","authors":["Xiaobin Shen","Jonathan Elmer","George H. 
Chen"],"pdf_url":"https://arxiv.org/pdf/2308.11645v2.pdf","comment":"Machine Learning for Healthcare conference (MLHC 2023); this draft\n includes minor corrections (in explaining how Dynamic-DeepHit works)"},{"id":"http://arxiv.org/abs/2312.00313v1","updated":"2023-12-01T03:12:04Z","published":"2023-12-01T03:12:04Z","title":"Improving Normalization with the James-Stein Estimator","summary":" Stein's paradox holds considerable sway in high-dimensional statistics,\nhighlighting that the sample mean, traditionally considered the de facto\nestimator, might not be the most efficacious in higher dimensions. To address\nthis, the James-Stein estimator proposes an enhancement by steering the sample\nmeans toward a more centralized mean vector. In this paper, first, we establish\nthat normalization layers in deep learning use inadmissible estimators for mean\nand variance. Next, we introduce a novel method to employ the James-Stein\nestimator to improve the estimation of mean and variance within normalization\nlayers. We evaluate our method on different computer vision tasks: image\nclassification, semantic segmentation, and 3D object classification. Through\nthese evaluations, it is evident that our improved normalization layers\nconsistently yield superior accuracy across all tasks without extra\ncomputational burden. Moreover, recognizing that a plethora of shrinkage\nestimators surpass the traditional estimator in performance, we study two other\nprominent shrinkage estimators: Ridge and LASSO. Additionally, we provide\nvisual representations to intuitively demonstrate the impact of shrinkage on\nthe estimated layer statistics. Finally, we study the effect of regularization\nand batch size on our modified batch normalization. The studies show that our\nmethod is less sensitive to batch size and regularization, improving accuracy\nunder various setups.\n","authors":["Seyedalireza Khoshsirat","Chandra Kambhamettu"],"pdf_url":"https://arxiv.org/pdf/2312.00313v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.13670v4","updated":"2023-12-01T02:58:36Z","published":"2023-08-25T20:59:51Z","title":"Linear Oscillation: A Novel Activation Function for Vision Transformer","summary":" Activation functions are the linchpins of deep learning, profoundly\ninfluencing both the representational capacity and training dynamics of neural\nnetworks. They shape not only the nature of representations but also optimize\nconvergence rates and enhance generalization potential. Appreciating this\ncritical role, we present the Linear Oscillation (LoC) activation function,\ndefined as $f(x) = x \\times \\sin(\\alpha x + \\beta)$. Distinct from conventional\nactivation functions which primarily introduce non-linearity, LoC seamlessly\nblends linear trajectories with oscillatory deviations. The nomenclature\n\"Linear Oscillation\" is a nod to its unique attribute of infusing linear\nactivations with harmonious oscillations, capturing the essence of the\n\"Importance of Confusion\". This concept of \"controlled confusion\" within\nnetwork activations is posited to foster more robust learning, particularly in\ncontexts that necessitate discerning subtle patterns. Our empirical studies\nreveal that, when integrated into diverse neural architectures, the LoC\nactivation function consistently outperforms established counterparts like ReLU\nand Sigmoid. The stellar performance exhibited by the avant-garde Vision\nTransformer model using LoC further validates its efficacy. 
This study\nilluminates the remarkable benefits of the LoC over other prominent activation\nfunctions. It champions the notion that intermittently introducing deliberate\ncomplexity or \"confusion\" during training can spur more profound and nuanced\nlearning. This accentuates the pivotal role of judiciously selected activation\nfunctions in shaping the future of neural network training.\n","authors":["Juyoung Yun"],"pdf_url":"https://arxiv.org/pdf/2308.13670v4.pdf","comment":"Not Completed Paper"},{"id":"http://arxiv.org/abs/2306.11667v5","updated":"2023-12-01T02:58:20Z","published":"2023-06-20T16:39:27Z","title":"G-NM: A Group of Numerical Time Series Prediction Models","summary":" In this study, we focus on the development and implementation of a\ncomprehensive ensemble of numerical time series forecasting models,\ncollectively referred to as the Group of Numerical Time Series Prediction Model\n(G-NM). This inclusive set comprises traditional models such as Autoregressive\nIntegrated Moving Average (ARIMA), Holt-Winters' method, and Support Vector\nRegression (SVR), in addition to modern neural network models including\nRecurrent Neural Network (RNN) and Long Short-Term Memory (LSTM). G-NM is\nexplicitly constructed to augment our predictive capabilities related to\npatterns and trends inherent in complex natural phenomena. By utilizing time\nseries data relevant to these events, G-NM facilitates the prediction of such\nphenomena over extended periods. The primary objective of this research is to\nboth advance our understanding of such occurrences and to significantly enhance\nthe accuracy of our forecasts. G-NM encapsulates both linear and non-linear\ndependencies, seasonalities, and trends present in time series data. Each of\nthese models contributes distinct strengths, from ARIMA's resilience in\nhandling linear trends and seasonality, SVR's proficiency in capturing\nnon-linear patterns, to LSTM's adaptability in modeling various components of\ntime series data. Through the exploitation of the G-NM potential, we strive to\nadvance the state-of-the-art in large-scale time series forecasting models. We\nanticipate that this research will represent a significant stepping stone in\nour ongoing endeavor to comprehend and forecast the complex events that\nconstitute the natural world.\n","authors":["Juyoung Yun"],"pdf_url":"https://arxiv.org/pdf/2306.11667v5.pdf","comment":"Not Completed Paper"},{"id":"http://arxiv.org/abs/2307.16189v7","updated":"2023-12-01T02:57:03Z","published":"2023-07-30T10:03:36Z","title":"Stable Adam Optimization for 16-bit Neural Networks Training","summary":" In this research, we address critical concerns related to the numerical\ninstability observed in 16-bit computations of machine learning models. Such\ninstability, particularly when employing popular optimization algorithms like\nAdam, often leads to unstable training of deep neural networks. This not only\ndisrupts the learning process but also poses significant challenges in\ndeploying dependable models in real-world applications. Our investigation\nidentifies the epsilon hyperparameter as the primary source of this\ninstability. A nuanced exploration reveals that subtle adjustments to epsilon\nwithin 16-bit computations can enhance the numerical stability of Adam,\nenabling more stable training of 16-bit neural networks. We propose a novel,\ndependable approach that leverages updates from the Adam optimizer to bolster\nthe stability of the learning process. 
Our contributions provide deeper\ninsights into optimization challenges in low-precision computations and offer\nsolutions to ensure the stability of deep neural network training, paving the\nway for their dependable use in various applications.\n","authors":["Juyoung Yun"],"pdf_url":"https://arxiv.org/pdf/2307.16189v7.pdf","comment":"Incompleted Contents Paper"},{"id":"http://arxiv.org/abs/2312.00305v1","updated":"2023-12-01T02:53:20Z","published":"2023-12-01T02:53:20Z","title":"Multiple Testing of Linear Forms for Noisy Matrix Completion","summary":" Many important tasks of large-scale recommender systems can be naturally cast\nas testing multiple linear forms for noisy matrix completion. These problems,\nhowever, present unique challenges because of the subtle bias-and-variance\ntradeoff of and an intricate dependence among the estimated entries induced by\nthe low-rank structure. In this paper, we develop a general approach to\novercome these difficulties by introducing new statistics for individual tests\nwith sharp asymptotics both marginally and jointly, and utilizing them to\ncontrol the false discovery rate (FDR) via a data splitting and symmetric\naggregation scheme. We show that valid FDR control can be achieved with\nguaranteed power under nearly optimal sample size requirements using the\nproposed methodology. Extensive numerical simulations and real data examples\nare also presented to further illustrate its practical merits.\n","authors":["Wanteng Ma","Lilun Du","Dong Xia","Ming Yuan"],"pdf_url":"https://arxiv.org/pdf/2312.00305v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18587v2","updated":"2023-12-01T02:51:32Z","published":"2023-11-30T14:28:25Z","title":"Continuous 16-bit Training: Accelerating 32-bit Pre-Trained Neural\n Networks","summary":" In the field of deep learning, the prevalence of models initially trained\nwith 32-bit precision is a testament to its robustness and accuracy. However,\nthe continuous evolution of these models often demands further training, which\ncan be resource-intensive. This study introduces a novel approach where we\ncontinue the training of these pre-existing 32-bit models using 16-bit\nprecision. This technique not only caters to the need for efficiency in\ncomputational resources but also significantly improves the speed of additional\ntraining phases. By adopting 16-bit precision for ongoing training, we are able\nto substantially decrease memory requirements and computational burden, thereby\naccelerating the training process in a resource-limited setting. Our\nexperiments show that this method maintains the high standards of accuracy set\nby the original 32-bit training while providing a much-needed boost in training\nspeed. This approach is especially pertinent in today's context, where most\nmodels are initially trained in 32-bit and require periodic updates and\nrefinements. 
The findings from our research suggest that this strategy of\n16-bit continuation training can be a key solution for sustainable and\nefficient deep learning, offering a practical way to enhance pre-trained models\nrapidly and in a resource-conscious manner.\n","authors":["Juyoung Yun"],"pdf_url":"https://arxiv.org/pdf/2311.18587v2.pdf","comment":"Not Completed Paper"},{"id":"http://arxiv.org/abs/2312.00304v1","updated":"2023-12-01T02:47:00Z","published":"2023-12-01T02:47:00Z","title":"Developmental Pretraining (DPT) for Image Classification Networks","summary":" In the backdrop of increasing data requirements of Deep Neural Networks for\nobject recognition that is growing more untenable by the day, we present\nDevelopmental PreTraining (DPT) as a possible solution. DPT is designed as a\ncurriculum-based pre-training approach designed to rival traditional\npre-training techniques that are data-hungry. These training approaches also\nintroduce unnecessary features that could be misleading when the network is\nemployed in a downstream classification task where the data is sufficiently\ndifferent from the pre-training data and is scarce. We design the curriculum\nfor DPT by drawing inspiration from human infant visual development. DPT\nemploys a phased approach where carefully-selected primitive and universal\nfeatures like edges and shapes are taught to the network participating in our\npre-training regime. A model that underwent the DPT regime is tested against\nmodels with randomised weights to evaluate the viability of DPT.\n","authors":["Niranjan Rajesh","Debayan Gupta"],"pdf_url":"https://arxiv.org/pdf/2312.00304v1.pdf","comment":"7 pages, 4 figures"},{"id":"http://arxiv.org/abs/2312.00296v1","updated":"2023-12-01T02:24:07Z","published":"2023-12-01T02:24:07Z","title":"Towards Aligned Canonical Correlation Analysis: Preliminary Formulation\n and Proof-of-Concept Results","summary":" Canonical Correlation Analysis (CCA) has been widely applied to jointly embed\nmultiple views of data in a maximally correlated latent space. However, the\nalignment between various data perspectives, which is required by traditional\napproaches, is unclear in many practical cases. In this work we propose a new\nframework Aligned Canonical Correlation Analysis (ACCA), to address this\nchallenge by iteratively solving the alignment and multi-view embedding.\n","authors":["Biqian Cheng","Evangelos E. Papalexakis","Jia Chen"],"pdf_url":"https://arxiv.org/pdf/2312.00296v1.pdf","comment":"4 pages, 7 figures, MLG 2023"},{"id":"http://arxiv.org/abs/2305.17083v2","updated":"2023-12-01T02:21:35Z","published":"2023-05-26T16:48:05Z","title":"A Policy Gradient Method for Confounded POMDPs","summary":" In this paper, we propose a policy gradient method for confounded partially\nobservable Markov decision processes (POMDPs) with continuous state and\nobservation spaces in the offline setting. We first establish a novel\nidentification result to non-parametrically estimate any history-dependent\npolicy gradient under POMDPs using the offline data. The identification enables\nus to solve a sequence of conditional moment restrictions and adopt the min-max\nlearning procedure with general function approximation for estimating the\npolicy gradient. 
We then provide a finite-sample non-asymptotic bound for\nestimating the gradient uniformly over a pre-specified policy class in terms of\nthe sample size, length of horizon, concentratability coefficient and the\nmeasure of ill-posedness in solving the conditional moment restrictions.\nLastly, by deploying the proposed gradient estimation in the gradient ascent\nalgorithm, we show the global convergence of the proposed algorithm in finding\nthe history-dependent optimal policy under some technical conditions. To the\nbest of our knowledge, this is the first work studying the policy gradient\nmethod for POMDPs under the offline setting.\n","authors":["Mao Hong","Zhengling Qi","Yanxun Xu"],"pdf_url":"https://arxiv.org/pdf/2305.17083v2.pdf","comment":"95 pages, 3 figures"},{"id":"http://arxiv.org/abs/2312.00290v1","updated":"2023-12-01T02:09:18Z","published":"2023-12-01T02:09:18Z","title":"Learning to forecast diagnostic parameters using pre-trained weather\n embedding","summary":" Data-driven weather prediction (DDWP) models are increasingly becoming\npopular for weather forecasting. However, while operational weather forecasts\npredict a wide variety of weather variables, DDWPs currently forecast a\nspecific set of key prognostic variables. Non-prognostic (\"diagnostic\")\nvariables are sometimes modeled separately as dependent variables of the\nprognostic variables (c.f. FourCastNet), or by including the diagnostic\nvariable as a target in the DDWP. However, the cost of training and deploying\nbespoke models for each diagnostic variable can increase dramatically with more\ndiagnostic variables, and limit the operational use of such models. Likewise,\nretraining an entire DDWP each time a new diagnostic variable is added is also\ncost-prohibitive. We present an two-stage approach that allows new diagnostic\nvariables to be added to an end-to-end DDWP model without the expensive\nretraining. In the first stage, we train an autoencoder that learns to embed\nprognostic variables into a latent space. In the second stage, the autoencoder\nis frozen and \"downstream\" models are trained to predict diagnostic variables\nusing only the latent representations of prognostic variables as input. Our\nexperiments indicate that models trained using the two-stage approach offer\naccuracy comparable to training bespoke models, while leading to significant\nreduction in resource utilization during training and inference. This approach\nallows for new \"downstream\" models to be developed as needed, without affecting\nexisting models and thus reducing the friction in operationalizing new models.\n","authors":["Peetak P. Mitra","Vivek Ramavajjala"],"pdf_url":"https://arxiv.org/pdf/2312.00290v1.pdf","comment":"Accepted as a spotlight paper at the NeurIPS 2023 workshop on\n Tackling Climate Change with Machine Learning"},{"id":"http://arxiv.org/abs/2309.00267v2","updated":"2023-12-01T01:41:44Z","published":"2023-09-01T05:53:33Z","title":"RLAIF: Scaling Reinforcement Learning from Human Feedback with AI\n Feedback","summary":" Reinforcement learning from human feedback (RLHF) has proven effective in\naligning large language models (LLMs) with human preferences. However,\ngathering high-quality human preference labels can be a time-consuming and\nexpensive endeavor. RL from AI Feedback (RLAIF), introduced by Bai et al.,\noffers a promising alternative that leverages a powerful off-the-shelf LLM to\ngenerate preferences in lieu of human annotators. 
Across the tasks of\nsummarization, helpful dialogue generation, and harmless dialogue generation,\nRLAIF achieves comparable or superior performance to RLHF, as rated by human\nevaluators. Furthermore, RLAIF demonstrates the ability to outperform a\nsupervised fine-tuned baseline even when the LLM preference labeler is the same\nsize as the policy. In another experiment, directly prompting the LLM for\nreward scores achieves superior performance to the canonical RLAIF setup, where\nLLM preference labels are first distilled into a reward model. Finally, we\nconduct extensive studies on techniques for generating aligned AI preferences.\nOur results suggest that RLAIF can achieve human-level performance, offering a\npotential solution to the scalability limitations of RLHF.\n","authors":["Harrison Lee","Samrat Phatale","Hassan Mansoor","Thomas Mesnard","Johan Ferret","Kellie Lu","Colton Bishop","Ethan Hall","Victor Carbune","Abhinav Rastogi","Sushant Prakash"],"pdf_url":"https://arxiv.org/pdf/2309.00267v2.pdf","comment":"Added two more tasks and many more experiments and analyses (e.g.\n same-size RLAIF, direct RLAIF, cost analysis)"},{"id":"http://arxiv.org/abs/2311.17303v2","updated":"2023-12-01T01:34:47Z","published":"2023-11-29T01:25:00Z","title":"Enhancing the Performance of Neural Networks Through Causal Discovery\n and Integration of Domain Knowledge","summary":" In this paper, we develop a generic methodology to encode hierarchical\ncausality structure among observed variables into a neural network in order to\nimprove its predictive performance. The proposed methodology, called\ncausality-informed neural network (CINN), leverages three coherent steps to\nsystematically map the structural causal knowledge into the layer-to-layer\ndesign of neural network while strictly preserving the orientation of every\ncausal relationship. In the first step, CINN discovers causal relationships\nfrom observational data via directed acyclic graph (DAG) learning, where causal\ndiscovery is recast as a continuous optimization problem to avoid the\ncombinatorial nature. In the second step, the discovered hierarchical causality\nstructure among observed variables is systematically encoded into neural\nnetwork through a dedicated architecture and customized loss function. By\ncategorizing variables in the causal DAG as root, intermediate, and leaf nodes,\nthe hierarchical causal DAG is translated into CINN with a one-to-one\ncorrespondence between nodes in the causal DAG and units in the CINN while\nmaintaining the relative order among these nodes. Regarding the loss function,\nboth intermediate and leaf nodes in the DAG graph are treated as target outputs\nduring CINN training so as to drive co-learning of causal relationships among\ndifferent types of nodes. As multiple loss components emerge in CINN, we\nleverage the projection of conflicting gradients to mitigate gradient\ninterference among the multiple learning tasks. Computational experiments\nacross a broad spectrum of UCI data sets demonstrate substantial advantages of\nCINN in predictive performance over other state-of-the-art methods. 
In\naddition, an ablation study underscores the value of integrating structural and\nquantitative causal knowledge in enhancing the neural network's predictive\nperformance incrementally.\n","authors":["Xiaoge Zhang","Xiao-Lin Wang","Fenglei Fan","Yiu-Ming Cheung","Indranil Bose"],"pdf_url":"https://arxiv.org/pdf/2311.17303v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00279v1","updated":"2023-12-01T01:30:49Z","published":"2023-12-01T01:30:49Z","title":"Age-Based Scheduling for Mobile Edge Computing: A Deep Reinforcement\n Learning Approach","summary":" With the rapid development of Mobile Edge Computing (MEC), various real-time\napplications have been deployed to benefit people's daily lives. The\nperformance of these applications relies heavily on the freshness of collected\nenvironmental information, which can be quantified by its Age of Information\n(AoI). In the traditional definition of AoI, it is assumed that the status\ninformation can be actively sampled and directly used. However, for many\nMEC-enabled applications, the desired status information is updated in an\nevent-driven manner and necessitates data processing. To better serve these\napplications, we propose a new definition of AoI and, based on the redefined\nAoI, we formulate an online AoI minimization problem for MEC systems. Notably,\nthe problem can be interpreted as a Markov Decision Process (MDP), thus\nenabling its solution through Reinforcement Learning (RL) algorithms.\nNevertheless, the traditional RL algorithms are designed for MDPs with\ncompletely unknown system dynamics and hence usually suffer long convergence\ntimes. To accelerate the learning process, we introduce Post-Decision States\n(PDSs) to exploit the partial knowledge of the system's dynamics. We also\ncombine PDSs with deep RL to further improve the algorithm's applicability,\nscalability, and robustness. Numerical results demonstrate that our algorithm\noutperforms the benchmarks under various scenarios.\n","authors":["Xingqiu He","Chaoqun You","Tony Q. S. Quek"],"pdf_url":"https://arxiv.org/pdf/2312.00279v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03438v2","updated":"2023-12-01T01:27:37Z","published":"2023-06-06T06:35:27Z","title":"Large Language Models of Code Fail at Completing Code with Potential\n Bugs","summary":" Large language models of code (Code-LLMs) have recently brought tremendous\nadvances to code completion, a fundamental feature of programming assistance\nand code intelligence. However, most existing works ignore the possible\npresence of bugs in the code context for generation, which are inevitable in\nsoftware development. Therefore, we introduce and study the buggy-code\ncompletion problem, inspired by the realistic scenario of real-time code\nsuggestion where the code context contains potential bugs -- anti-patterns that\ncan become bugs in the completed program. To systematically study the task, we\nintroduce two datasets: one with synthetic bugs derived from semantics-altering\noperator changes (buggy-HumanEval) and one with realistic bugs derived from\nuser submissions to coding problems (buggy-FixEval). We find that the presence\nof potential bugs significantly degrades the generation performance of the\nhigh-performing Code-LLMs. For instance, the passing rates of CODEGEN-2B-MONO\non test cases of buggy-HumanEval drop more than 50% given a single potential\nbug in the context. 
Finally, we investigate several post-hoc methods for\nmitigating the adverse effect of potential bugs and find that there remains a\nsignificant gap in post-mitigation performance.\n","authors":["Tuan Dinh","Jinman Zhao","Samson Tan","Renato Negrinho","Leonard Lausen","Sheng Zha","George Karypis"],"pdf_url":"https://arxiv.org/pdf/2306.03438v2.pdf","comment":"27 pages, accepted to NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.00277v1","updated":"2023-12-01T01:26:38Z","published":"2023-12-01T01:26:38Z","title":"Text Attribute Control via Closed-Loop Disentanglement","summary":" Changing an attribute of a text without changing the content usually requires\nto first disentangle the text into irrelevant attributes and content\nrepresentations. After that, in the inference phase, the representation of one\nattribute is tuned to a different value, expecting that the corresponding\nattribute of the text can also be changed accordingly. The usual way of\ndisentanglement is to add some constraints on the latent space of an\nencoder-decoder architecture, including adversarial-based constraints and\nmutual-information-based constraints. However, the previous semi-supervised\nprocesses of attribute change are usually not enough to guarantee the success\nof attribute change and content preservation. In this paper, we propose a novel\napproach to achieve a robust control of attributes while enhancing content\npreservation. In this approach, we use a semi-supervised contrastive learning\nmethod to encourage the disentanglement of attributes in latent spaces.\nDifferently from previous works, we re-disentangle the reconstructed sentence\nand compare the re-disentangled latent space with the original latent space,\nwhich makes a closed-loop disentanglement process. This also helps content\npreservation. In addition, the contrastive learning method is also able to\nreplace the role of minimizing mutual information and adversarial training in\nthe disentanglement process, which alleviates the computation cost. We\nconducted experiments on three text datasets, including the Yelp Service review\ndataset, the Amazon Product review dataset, and the GoEmotions dataset. The\nexperimental results show the effectiveness of our model.\n","authors":["Lei Sha","Thomas Lukasiewicz"],"pdf_url":"https://arxiv.org/pdf/2312.00277v1.pdf","comment":"accepted by TACL 2023"},{"id":"http://arxiv.org/abs/2312.00276v1","updated":"2023-12-01T01:25:04Z","published":"2023-12-01T01:25:04Z","title":"Automating Continual Learning","summary":" General-purpose learning systems should improve themselves in open-ended\nfashion in ever-changing environments. Conventional learning algorithms for\nneural networks, however, suffer from catastrophic forgetting (CF) --\npreviously acquired skills are forgotten when a new task is learned. Instead of\nhand-crafting new algorithms for avoiding CF, we propose Automated Continual\nLearning (ACL) to train self-referential neural networks to meta-learn their\nown in-context continual (meta-)learning algorithms. ACL encodes all desiderata\n-- good performance on both old and new tasks -- into its meta-learning\nobjectives. 
Our experiments demonstrate that ACL effectively solves \"in-context\ncatastrophic forgetting\"; our ACL-learned algorithms outperform hand-crafted\nones, e.g., on the Split-MNIST benchmark in the replay-free setting, and\nenables continual learning of diverse tasks consisting of multiple few-shot and\nstandard image classification datasets.\n","authors":["Kazuki Irie","Róbert Csordás","Jürgen Schmidhuber"],"pdf_url":"https://arxiv.org/pdf/2312.00276v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00271v1","updated":"2023-12-01T01:11:16Z","published":"2023-12-01T01:11:16Z","title":"Towards Clinical Prediction with Transparency: An Explainable AI\n Approach to Survival Modelling in Residential Aged Care","summary":" Background: Accurate survival time estimates aid end-of-life medical\ndecision-making. Objectives: Develop an interpretable survival model for\nelderly residential aged care residents using advanced machine learning.\nSetting: A major Australasian residential aged care provider. Participants:\nResidents aged 65+ admitted for long-term care from July 2017 to August 2023.\nSample size: 11,944 residents across 40 facilities. Predictors: Factors include\nage, gender, health status, co-morbidities, cognitive function, mood,\nnutrition, mobility, smoking, sleep, skin integrity, and continence. Outcome:\nProbability of survival post-admission, specifically calibrated for 6-month\nsurvival estimates. Statistical Analysis: Tested CoxPH, EN, RR, Lasso, GB, XGB,\nand RF models in 20 experiments with a 90/10 train/test split. Evaluated\naccuracy using C-index, Harrell's C-index, dynamic AUROC, IBS, and calibrated\nROC. Chose XGB for its performance and calibrated it for 1, 3, 6, and 12-month\npredictions using Platt scaling. Employed SHAP values to analyze predictor\nimpacts. Results: GB, XGB, and RF models showed the highest C-Index values\n(0.714, 0.712, 0.712). The optimal XGB model demonstrated a 6-month survival\nprediction AUROC of 0.746 (95% CI 0.744-0.749). Key mortality predictors\ninclude age, male gender, mobility, health status, pressure ulcer risk, and\nappetite. Conclusions: The study successfully applies machine learning to\ncreate a survival model for aged care, aligning with clinical insights on\nmortality risk factors and enhancing model interpretability and clinical\nutility through explainable AI.\n","authors":["Teo Susnjak","Elise Griffin","Mitchell McCutcheon","Kathleen Potter"],"pdf_url":"https://arxiv.org/pdf/2312.00271v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19718v3","updated":"2023-12-01T01:04:23Z","published":"2023-05-31T10:14:35Z","title":"A rule-general abductive learning by rough sets","summary":" In real-world tasks, there is usually a large amount of unlabeled data and\nlabeled data. The task of combining the two to learn is known as\nsemi-supervised learning. Experts can use logical rules to label unlabeled\ndata, but this operation is costly. The combination of perception and reasoning\nhas a good effect in processing such semi-supervised tasks with domain\nknowledge. However, acquiring domain knowledge and the correction, reduction\nand generation of rules remain complex problems to be solved. Rough set theory\nis an important method for solving knowledge processing in information systems.\nIn this paper, we propose a rule general abductive learning by rough set\n(RS-ABL). 
By transforming the target concept and sub-concepts of rules into\ninformation tables, rough set theory is used to solve the acquisition of domain\nknowledge and the correction, reduction and generation of rules at a lower\ncost. This framework can also generate more extensive negative rules to enhance\nthe breadth of the knowledge base. Compared with the traditional\nsemi-supervised learning method, RS-ABL has higher accuracy in dealing with\nsemi-supervised tasks.\n","authors":["Xu-chang Guo","Hou-biao Li"],"pdf_url":"https://arxiv.org/pdf/2305.19718v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2006.13309v4","updated":"2023-12-01T01:03:08Z","published":"2020-06-11T18:52:34Z","title":"Fast Deep Mixtures of Gaussian Process Experts","summary":" Mixtures of experts have become an indispensable tool for flexible modelling\nin a supervised learning context, allowing not only the mean function but the\nentire density of the output to change with the inputs. Sparse Gaussian\nprocesses (GP) have shown promise as a leading candidate for the experts in\nsuch models, and in this article, we propose to design the gating network for\nselecting the experts from such mixtures of sparse GPs using a deep neural\nnetwork (DNN). Furthermore, a fast one pass algorithm called\nCluster-Classify-Regress (CCR) is leveraged to approximate the maximum a\nposteriori (MAP) estimator extremely quickly. This powerful combination of\nmodel and algorithm together delivers a novel method which is flexible, robust,\nand extremely efficient. In particular, the method is able to outperform\ncompeting methods in terms of accuracy and uncertainty quantification. The cost\nis competitive on low-dimensional and small data sets, but is significantly\nlower for higher-dimensional and big data sets. Iteratively maximizing the\ndistribution of experts given allocations and allocations given experts does\nnot provide significant improvement, which indicates that the algorithm\nachieves a good approximation to the local MAP estimator very fast. This\ninsight can be useful also in the context of other mixture of experts models.\n","authors":["Clement Etienam","Kody Law","Sara Wade","Vitaly Zankin"],"pdf_url":"https://arxiv.org/pdf/2006.13309v4.pdf","comment":"22 pages, 28 figures, to be published in Machine Learning journal"},{"id":"http://arxiv.org/abs/2312.00268v1","updated":"2023-12-01T01:01:04Z","published":"2023-12-01T01:01:04Z","title":"Academic competitions","summary":" Academic challenges comprise effective means for (i) advancing the state of\nthe art, (ii) putting in the spotlight of a scientific community specific\ntopics and problems, as well as (iii) closing the gap for under represented\ncommunities in terms of accessing and participating in the shaping of research\nfields. Competitions can be traced back for centuries and their achievements\nhave had great influence in our modern world. Recently, they (re)gained\npopularity, with the overwhelming amounts of data that is being generated in\ndifferent domains, as well as the need of pushing the barriers of existing\nmethods, and available tools to handle such data. This chapter provides a\nsurvey of academic challenges in the context of machine learning and related\nfields. We review the most influential competitions in the last few years and\nanalyze challenges per area of knowledge. 
The aims of scientific challenges,\ntheir goals, major achievements and expectations for the next few years are\nreviewed.\n","authors":["Hugo Jair Escalante","Aleksandra Kruchinina"],"pdf_url":"https://arxiv.org/pdf/2312.00268v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06282v3","updated":"2023-12-01T00:55:13Z","published":"2023-10-10T03:32:33Z","title":"MuseChat: A Conversational Music Recommendation System for Videos","summary":" Music recommendation for videos attracts growing interest in multi-modal\nresearch. However, existing systems focus primarily on content compatibility,\noften ignoring the users' preferences. Their inability to interact with users\nfor further refinements or to provide explanations leads to a less satisfying\nexperience. We address these issues with MuseChat, a first-of-its-kind\ndialogue-based recommendation system that personalizes music suggestions for\nvideos. Our system consists of two key functionalities with associated modules:\nrecommendation and reasoning. The recommendation module takes a video along\nwith optional information including previous suggested music and user's\npreference as inputs and retrieves an appropriate music matching the context.\nThe reasoning module, equipped with the power of Large Language Model\n(Vicuna-7B) and extended to multi-modal inputs, is able to provide reasonable\nexplanation for the recommended music. To evaluate the effectiveness of\nMuseChat, we build a large-scale dataset, conversational music recommendation\nfor videos, that simulates a two-turn interaction between a user and a\nrecommender based on accurate music track information. Experiment results show\nthat MuseChat achieves significant improvements over existing video-based music\nretrieval methods as well as offers strong interpretability and\ninteractability.\n","authors":["Zhikang Dong","Bin Chen","Xiulong Liu","Pawel Polak","Peng Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.06282v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00267v1","updated":"2023-12-01T00:54:02Z","published":"2023-12-01T00:54:02Z","title":"Sample Efficient Reinforcement Learning from Human Feedback via Active\n Exploration","summary":" Preference-based feedback is important for many applications in reinforcement\nlearning where direct evaluation of a reward function is not feasible. A\nnotable recent example arises in reinforcement learning from human feedback\n(RLHF) on large language models. For many applications of RLHF, the cost of\nacquiring the human feedback can be substantial. In this work, we take\nadvantage of the fact that one can often choose contexts at which to obtain\nhuman feedback in order to most efficiently identify a good policy, and\nformalize this as an offline contextual dueling bandit problem. We give an\nupper-confidence-bound style algorithm for this problem and prove a polynomial\nworst-case regret bound. We then provide empirical confirmation in a synthetic\nsetting that our approach outperforms existing methods. After, we extend the\nsetting and methodology for practical use in RLHF training of large language\nmodels. 
Here, our method is able to reach better performance with fewer samples\nof human preferences than multiple baselines on three real-world datasets.\n","authors":["Viraj Mehta","Vikramjeet Das","Ojash Neopane","Yijia Dai","Ilija Bogunovic","Jeff Schneider","Willie Neiswanger"],"pdf_url":"https://arxiv.org/pdf/2312.00267v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2312.00414v1","updated":"2023-12-01T08:38:27Z","published":"2023-12-01T08:38:27Z","title":"Large-scale Vision-Language Models Learn Super Images for Efficient and\n High-Performance Partially Relevant Video Retrieval","summary":" In this paper, we propose an efficient and high-performance method for\npartially relevant video retrieval (PRVR), which aims to retrieve untrimmed\nlong videos that contain at least one relevant moment to the input text query.\nIn terms of both efficiency and performance, the overlooked bottleneck of\nprevious studies is the visual encoding of dense frames. This guides\nresearchers to choose lightweight visual backbones, yielding sub-optimal\nretrieval performance due to their limited capabilities of learned visual\nrepresentations. However, it is undesirable to simply replace them with\nhigh-performance large-scale vision-and-language models (VLMs) due to their low\nefficiency. To address these issues, instead of dense frames, we focus on super\nimages, which are created by rearranging the video frames in a $N \\times N$\ngrid layout. This reduces the number of visual encodings to $\\frac{1}{N^2}$ and\ncompensates for the low efficiency of large-scale VLMs, allowing us to adopt\nthem as powerful encoders. Surprisingly, we discover that with a simple\nquery-image attention trick, VLMs generalize well to super images effectively\nand demonstrate promising zero-shot performance against SOTA methods\nefficiently. In addition, we propose a fine-tuning approach by incorporating a\nfew trainable modules into the VLM backbones. The experimental results\ndemonstrate that our approaches efficiently achieve the best performance on\nActivityNet Captions and TVR.\n","authors":["Taichi Nishimura","Shota Nakada","Masayoshi Kondo"],"pdf_url":"https://arxiv.org/pdf/2312.00414v1.pdf","comment":"13 pages, 11 figures"},{"id":"http://arxiv.org/abs/2312.00401v1","updated":"2023-12-01T07:50:53Z","published":"2023-12-01T07:50:53Z","title":"VIoTGPT: Learning to Schedule Vision Tools towards Intelligent Video\n Internet of Things","summary":" Video Internet of Things (VIoT) has shown full potential in collecting an\nunprecedented volume of video data. Learning to schedule perceiving models and\nanalyzing the collected videos intelligently will be potential sparks for VIoT.\nIn this paper, to address the challenges posed by the fine-grained and\ninterrelated vision tool usage of VIoT, we build VIoTGPT, the framework based\non LLMs to correctly interact with humans, query knowledge videos, and invoke\nvision models to accomplish complicated tasks. To support VIoTGPT and related\nfuture works, we meticulously crafted the training dataset and established\nbenchmarks involving 11 representative vision models across three categories\nbased on semi-automatic annotations. To guide LLM to act as the intelligent\nagent towards intelligent VIoT, we resort to ReAct instruction tuning based on\nthe collected VIoT dataset to learn the tool capability. 
Quantitative and\nqualitative experimental results and analyses demonstrate the effectiveness of\nVIoTGPT.\n","authors":["Yaoyao Zhong","Mengshi Qi","Rui Wang","Yuhan Qiu","Yang Zhang","Huadong Ma"],"pdf_url":"https://arxiv.org/pdf/2312.00401v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04591v3","updated":"2023-12-01T07:26:35Z","published":"2023-11-08T10:45:09Z","title":"Rethinking Event-based Human Pose Estimation with 3D Event\n Representations","summary":" Human pose estimation is a fundamental and appealing task in computer vision.\nTraditional frame-based cameras and videos are commonly applied, yet, they\nbecome less reliable in scenarios under high dynamic range or heavy motion\nblur. In contrast, event cameras offer a robust solution for navigating these\nchallenging contexts. Predominant methodologies incorporate event cameras into\nlearning frameworks by accumulating events into event frames. However, such\nmethods tend to marginalize the intrinsic asynchronous and high temporal\nresolution characteristics of events. This disregard leads to a loss in\nessential temporal dimension data, crucial for discerning distinct actions. To\naddress this issue and to unlock the 3D potential of event information, we\nintroduce two 3D event representations: the Rasterized Event Point Cloud\n(RasEPC) and the Decoupled Event Voxel (DEV). The RasEPC collates events within\nconcise temporal slices at identical positions, preserving 3D attributes with\nstatistical cues and markedly mitigating memory and computational demands.\nMeanwhile, the DEV representation discretizes events into voxels and projects\nthem across three orthogonal planes, utilizing decoupled event attention to\nretrieve 3D cues from the 2D planes. Furthermore, we develop and release\nEV-3DPW, a synthetic event-based dataset crafted to facilitate training and\nquantitative analysis in outdoor scenes. On the public real-world DHP19\ndataset, our event point cloud technique excels in real-time mobile\npredictions, while the decoupled event voxel method achieves the highest\naccuracy. Experiments on EV-3DPW demonstrate that the robustness of our\nproposed 3D representation methods compared to traditional RGB images and event\nframe techniques under the same backbones. Our code and dataset have been made\npublicly available at https://github.com/MasterHow/EventPointPose.\n","authors":["Xiaoting Yin","Hao Shi","Jiaan Chen","Ze Wang","Yaozu Ye","Huajian Ni","Kailun Yang","Kaiwei Wang"],"pdf_url":"https://arxiv.org/pdf/2311.04591v3.pdf","comment":"Extended version of arXiv:2206.04511. The code and dataset are\n available at https://github.com/MasterHow/EventPointPose"},{"id":"http://arxiv.org/abs/2312.00347v1","updated":"2023-12-01T04:51:01Z","published":"2023-12-01T04:51:01Z","title":"RTQ: Rethinking Video-language Understanding Based on Image-text Model","summary":" Recent advancements in video-language understanding have been established on\nthe foundation of image-text models, resulting in promising outcomes due to the\nshared knowledge between images and videos. However, video-language\nunderstanding presents unique challenges due to the inclusion of highly complex\nsemantic details, which result in information redundancy, temporal dependency,\nand scene complexity. Current techniques have only partially tackled these\nissues, and our quantitative analysis indicates that some of these methods are\ncomplementary. 
In light of this, we propose a novel framework called RTQ\n(Refine, Temporal model, and Query), which addresses these challenges\nsimultaneously. The approach involves refining redundant information within\nframes, modeling temporal relations among frames, and querying task-specific\ninformation from the videos. Remarkably, our model demonstrates outstanding\nperformance even in the absence of video-language pre-training, and the results\nare comparable with or superior to those achieved by state-of-the-art\npre-training methods.\n","authors":["Xiao Wang","Yaoyu Li","Tian Gan","Zheng Zhang","Jingjing Lv","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2312.00347v1.pdf","comment":"Accepted by ACM MM 2023 as Oral representation"},{"id":"http://arxiv.org/abs/2312.02188v1","updated":"2023-12-01T23:56:00Z","published":"2023-12-01T23:56:00Z","title":"Video Summarization: Towards Entity-Aware Captions","summary":" Existing popular video captioning benchmarks and models deal with generic\ncaptions devoid of specific person, place or organization named entities. In\ncontrast, news videos present a challenging setting where the caption requires\nsuch named entities for meaningful summarization. As such, we propose the task\nof summarizing news video directly to entity-aware captions. We also release a\nlarge-scale dataset, VIEWS (VIdeo NEWS), to support research on this task.\nFurther, we propose a method that augments visual information from videos with\ncontext retrieved from external world knowledge to generate entity-aware\ncaptions. We demonstrate the effectiveness of our approach on three video\ncaptioning models. We also show that our approach generalizes to existing news\nimage captions dataset. With all the extensive experiments and insights, we\nbelieve we establish a solid basis for future research on this challenging\ntask.\n","authors":["Hammad A. Ayyubi","Tianqi Liu","Arsha Nagrani","Xudong Lin","Mingda Zhang","Anurag Arnab","Feng Han","Yukun Zhu","Jialu Liu","Shih-Fu Chang"],"pdf_url":"https://arxiv.org/pdf/2312.02188v1.pdf","comment":null}]},"2023-12-04T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2312.02149v1","updated":"2023-12-04T18:59:25Z","published":"2023-12-04T18:59:25Z","title":"Generative Powers of Ten","summary":" We present a method that uses a text-to-image model to generate consistent\ncontent across multiple image scales, enabling extreme semantic zooms into a\nscene, e.g., ranging from a wide-angle landscape view of a forest to a macro\nshot of an insect sitting on one of the tree branches. We achieve this through\na joint multi-scale diffusion sampling approach that encourages consistency\nacross different scales while preserving the integrity of each individual\nsampling process. Since each generated scale is guided by a different text\nprompt, our method enables deeper levels of zoom than traditional\nsuper-resolution methods that may struggle to create new contextual structure\nat vastly different scales. 
We compare our method qualitatively with\nalternative techniques in image super-resolution and outpainting, and show that\nour method is most effective at generating consistent multi-scale content.\n","authors":["Xiaojuan Wang","Janne Kontkanen","Brian Curless","Steve Seitz","Ira Kemelmacher","Ben Mildenhall","Pratul Srinivasan","Dor Verbin","Aleksander Holynski"],"pdf_url":"https://arxiv.org/pdf/2312.02149v1.pdf","comment":"Project page: https://powers-of-10.github.io/"},{"id":"http://arxiv.org/abs/2312.02143v1","updated":"2023-12-04T18:58:57Z","published":"2023-12-04T18:58:57Z","title":"Competition-Level Problems Are Effective Evaluators of LLMs","summary":" Large language models (LLMs) have demonstrated impressive reasoning\ncapabilities, yet there is ongoing debate about these abilities and the\npotential data contamination problem recently. This paper aims to evaluate the\nreasoning capacities of LLMs, specifically in solving recent competition-level\nprogramming problems in Codeforces, which are expert-crafted and unique,\nrequiring deep understanding and robust reasoning skills. We first provide a\ncomprehensive evaluation of GPT-4's peiceived zero-shot performance on this\ntask, considering various aspects such as problems' release time, difficulties,\nand types of errors encountered. Surprisingly, the peiceived performance of\nGPT-4 has experienced a cliff like decline in problems after September 2021\nconsistently across all the difficulties and types of problems, which shows the\npotential data contamination, as well as the challenges for any existing LLM to\nsolve unseen complex reasoning problems. We further explore various approaches\nsuch as fine-tuning, Chain-of-Thought prompting and problem description\nsimplification, unfortunately none of them is able to consistently mitigate the\nchallenges. Through our work, we emphasis the importance of this excellent data\nsource for assessing the genuine reasoning capabilities of LLMs, and foster the\ndevelopment of LLMs with stronger reasoning abilities and better generalization\nin the future.\n","authors":["Yiming Huang","Zhenghao Lin","Xiao Liu","Yeyun Gong","Shuai Lu","Fangyu Lei","Yaobo Liang","Yelong Shen","Chen Lin","Nan Duan","Weizhu Chen"],"pdf_url":"https://arxiv.org/pdf/2312.02143v1.pdf","comment":"In progress"},{"id":"http://arxiv.org/abs/2312.02125v1","updated":"2023-12-04T18:52:26Z","published":"2023-12-04T18:52:26Z","title":"TPPoet: Transformer-Based Persian Poem Generation using Minimal Data and\n Advanced Decoding Techniques","summary":" Recent advances in language models (LMs), have demonstrated significant\nefficacy in tasks related to the arts and humanities. While LMs have exhibited\nexceptional performance across a wide range of natural language processing\ntasks, there are notable challenges associated with their utilization on small\ndatasets and their ability to replicate more creative human capacities. In this\nstudy, we aim to address these challenges by training a Persian classical\npoetry generation model using a transformer architecture on a specialized\ndataset with no pretraining. Additionally, we propose a novel decoding method\nto enhance coherence and meaningfulness in the generated poetry, effectively\nmanaging the tradeoff between diversity and quality. 
Furthermore, the results\nof our training approach and the proposed decoding method are evaluated through\ncomprehensive set of automatic and human evaluations and showed its superior\ncapability to generate coherent and meaningful poetry in compare to other\ndecoding methods and an existing Persian large language model (LLM).\n","authors":["Amir Panahandeh","Hanie Asemi","Esmail Nourani"],"pdf_url":"https://arxiv.org/pdf/2312.02125v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02120v1","updated":"2023-12-04T18:50:35Z","published":"2023-12-04T18:50:35Z","title":"Magicoder: Source Code Is All You Need","summary":" We introduce Magicoder, a series of fully open-source (code, weights, and\ndata) Large Language Models (LLMs) for code that significantly closes the gap\nwith top code models while having no more than 7B parameters. Magicoder models\nare trained on 75K synthetic instruction data using OSS-Instruct, a novel\napproach to enlightening LLMs with open-source code snippets to generate\nhigh-quality instruction data for code. Our main motivation is to mitigate the\ninherent bias of the synthetic data generated by LLMs by empowering them with a\nwealth of open-source references for the production of more diverse, realistic,\nand controllable data. The orthogonality of OSS-Instruct and other data\ngeneration methods like Evol-Instruct further enables us to build an enhanced\nMagicoderS. Both Magicoder and MagicoderS substantially outperform\nstate-of-the-art code models with similar or even larger sizes on a wide range\nof coding benchmarks, including Python text-to-code generation, multilingual\ncoding, and data-science program completion. Notably, MagicoderS-CL-7B based on\nCodeLlama even surpasses the prominent ChatGPT on HumanEval+ (66.5 vs. 65.9 in\npass@1). Overall, OSS-Instruct opens a new direction for low-bias and\nhigh-quality instruction tuning using abundant open-source references.\n","authors":["Yuxiang Wei","Zhe Wang","Jiawei Liu","Yifeng Ding","Lingming Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.02120v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02119v1","updated":"2023-12-04T18:49:23Z","published":"2023-12-04T18:49:23Z","title":"Tree of Attacks: Jailbreaking Black-Box LLMs Automatically","summary":" While Large Language Models (LLMs) display versatile functionality, they\ncontinue to generate harmful, biased, and toxic content, as demonstrated by the\nprevalence of human-designed jailbreaks. In this work, we present Tree of\nAttacks with Pruning (TAP), an automated method for generating jailbreaks that\nonly requires black-box access to the target LLM. TAP utilizes an LLM to\niteratively refine candidate (attack) prompts using tree-of-thoughts reasoning\nuntil one of the generated prompts jailbreaks the target. Crucially, before\nsending prompts to the target, TAP assesses them and prunes the ones unlikely\nto result in jailbreaks. Using tree-of-thought reasoning allows TAP to navigate\na large search space of prompts and pruning reduces the total number of queries\nsent to the target. In empirical evaluations, we observe that TAP generates\nprompts that jailbreak state-of-the-art LLMs (including GPT4 and GPT4-Turbo)\nfor more than 80% of the prompts using only a small number of queries. 
This\nsignificantly improves upon the previous state-of-the-art black-box method for\ngenerating jailbreaks.\n","authors":["Anay Mehrotra","Manolis Zampetakis","Paul Kassianik","Blaine Nelson","Hyrum Anderson","Yaron Singer","Amin Karbasi"],"pdf_url":"https://arxiv.org/pdf/2312.02119v1.pdf","comment":"An implementation of the presented method is available at\n https://github.com/RICommunity/TAP"},{"id":"http://arxiv.org/abs/2312.02118v1","updated":"2023-12-04T18:49:06Z","published":"2023-12-04T18:49:06Z","title":"When it Rains, it Pours: Modeling Media Storms and the News Ecosystem","summary":" Most events in the world receive at most brief coverage by the news media.\nOccasionally, however, an event will trigger a media storm, with voluminous and\nwidespread coverage lasting for weeks instead of days. In this work, we develop\nand apply a pairwise article similarity model, allowing us to identify story\nclusters in corpora covering local and national online news, and thereby create\na comprehensive corpus of media storms over a nearly two year period. Using\nthis corpus, we investigate media storms at a new level of granularity,\nallowing us to validate claims about storm evolution and topical distribution,\nand provide empirical support for previously hypothesized patterns of influence\nof storms on media coverage and intermedia agenda setting.\n","authors":["Benjamin Litterer","David Jurgens","Dallas Card"],"pdf_url":"https://arxiv.org/pdf/2312.02118v1.pdf","comment":"Findings of EMNLP 2023; 16 pages; 12 figures; 4 tables"},{"id":"http://arxiv.org/abs/2308.01987v2","updated":"2023-12-04T17:54:41Z","published":"2023-08-03T18:49:45Z","title":"Bengali Fake Reviews: A Benchmark Dataset and Detection System","summary":" The proliferation of fake reviews on various online platforms has created a\nmajor concern for both consumers and businesses. Such reviews can deceive\ncustomers and cause damage to the reputation of products or services, making it\ncrucial to identify them. Although the detection of fake reviews has been\nextensively studied in English language, detecting fake reviews in non-English\nlanguages such as Bengali is still a relatively unexplored research area. This\npaper introduces the Bengali Fake Review Detection (BFRD) dataset, the first\npublicly available dataset for identifying fake reviews in Bengali. The dataset\nconsists of 7710 non-fake and 1339 fake food-related reviews collected from\nsocial media posts. To convert non-Bengali words in a review, a unique pipeline\nhas been proposed that translates English words to their corresponding Bengali\nmeaning and also back transliterates Romanized Bengali to Bengali. We have\nconducted rigorous experimentation using multiple deep learning and pre-trained\ntransformer language models to develop a reliable detection system. Finally, we\npropose a weighted ensemble model that combines four pre-trained transformers:\nBanglaBERT, BanglaBERT Base, BanglaBERT Large, and BanglaBERT Generator .\nAccording to the experiment results, the proposed ensemble model obtained a\nweighted F1-score of 0.9843 on 13390 reviews, including 1339 actual fake\nreviews and 5356 augmented fake reviews generated with the nlpaug library. The\nremaining 6695 reviews were randomly selected from the 7710 non-fake instances.\nThe model achieved a 0.9558 weighted F1-score when the fake reviews were\naugmented using the bnaug library.\n","authors":["G. M. Shahariar","Md. Tanvir Rouf Shawon","Faisal Muhammad Shah","Mohammad Shafiul Alam","Md. 
Shahriar Mahbub"],"pdf_url":"https://arxiv.org/pdf/2308.01987v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02073v1","updated":"2023-12-04T17:35:42Z","published":"2023-12-04T17:35:42Z","title":"A Glitch in the Matrix? Locating and Detecting Language Model Grounding\n with Fakepedia","summary":" Large language models (LLMs) have demonstrated impressive capabilities in\nstoring and recalling factual knowledge, but also in adapting to novel\nin-context information. Yet, the mechanisms underlying their in-context\ngrounding remain unknown, especially in situations where in-context information\ncontradicts factual knowledge embedded in the parameters. This is critical for\nretrieval-augmented generation methods, which enrich the context with\nup-to-date information, hoping that grounding can rectify the outdated\nparametric knowledge. In this study, we introduce Fakepedia, a counterfactual\ndataset designed to evaluate grounding abilities when the parametric knowledge\nclashes with the in-context information. We benchmark various LLMs with\nFakepedia and discover that GPT-4-turbo has a strong preference for its\nparametric knowledge. Mistral-7B, on the contrary, is the model that most\nrobustly chooses the grounded answer. Then, we conduct causal mediation\nanalysis on LLM components when answering Fakepedia queries. We demonstrate\nthat inspection of the computational graph alone can predict LLM grounding with\n92.8% accuracy, especially because few MLPs in the Transformer can predict\nnon-grounded behavior. Our results, together with existing findings about\nfactual recall mechanisms, provide a coherent narrative of how grounding and\nfactual recall mechanisms interact within LLMs.\n","authors":["Giovanni Monea","Maxime Peyrard","Martin Josifoski","Vishrav Chaudhary","Jason Eisner","Emre Kıcıman","Hamid Palangi","Barun Patra","Robert West"],"pdf_url":"https://arxiv.org/pdf/2312.02073v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.09512v2","updated":"2023-12-04T17:23:55Z","published":"2022-12-19T14:48:08Z","title":"Rethinking Label Smoothing on Multi-hop Question Answering","summary":" Multi-Hop Question Answering (MHQA) is a significant area in question\nanswering, requiring multiple reasoning components, including document\nretrieval, supporting sentence prediction, and answer span extraction. In this\nwork, we analyze the primary factors limiting the performance of multi-hop\nreasoning and introduce label smoothing into the MHQA task. This is aimed at\nenhancing the generalization capabilities of MHQA systems and mitigating\noverfitting of answer spans and reasoning paths in training set. We propose a\nnovel label smoothing technique, F1 Smoothing, which incorporates uncertainty\ninto the learning process and is specifically tailored for Machine Reading\nComprehension (MRC) tasks. Inspired by the principles of curriculum learning,\nwe introduce the Linear Decay Label Smoothing Algorithm (LDLA), which\nprogressively reduces uncertainty throughout the training process. 
Experiment\non the HotpotQA dataset demonstrates the effectiveness of our methods in\nenhancing performance and generalizability in multi-hop reasoning, achieving\nnew state-of-the-art results on the leaderboard.\n","authors":["Zhangyue Yin","Yuxin Wang","Xiannian Hu","Yiguang Wu","Hang Yan","Xinyu Zhang","Zhao Cao","Xuanjing Huang","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2212.09512v2.pdf","comment":"13 pages, 8 figures, accepted by CCL2023"},{"id":"http://arxiv.org/abs/2312.02065v1","updated":"2023-12-04T17:19:53Z","published":"2023-12-04T17:19:53Z","title":"Know Your Audience: Do LLMs Adapt to Different Age and Education Levels?","summary":" Large language models (LLMs) offer a range of new possibilities, including\nadapting the text to different audiences and their reading needs. But how well\ndo they adapt? We evaluate the readability of answers generated by four\nstate-of-the-art LLMs (commercial and open-source) to science questions when\nprompted to target different age groups and education levels. To assess the\nadaptability of LLMs to diverse audiences, we compare the readability scores of\nthe generated responses against the recommended comprehension level of each age\nand education group. We find large variations in the readability of the answers\nby different LLMs. Our results suggest LLM answers need to be better adapted to\nthe intended audience demographics to be more comprehensible. They underline\nthe importance of enhancing the adaptability of LLMs in education settings to\ncater to diverse age and education levels. Overall, current LLMs have set\nreadability ranges and do not adapt well to different audiences, even when\nprompted. That limits their potential for educational purposes.\n","authors":["Donya Rooein","Amanda Cercas Curry","Dirk Hovy"],"pdf_url":"https://arxiv.org/pdf/2312.02065v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02051v1","updated":"2023-12-04T17:09:52Z","published":"2023-12-04T17:09:52Z","title":"TimeChat: A Time-sensitive Multimodal Large Language Model for Long\n Video Understanding","summary":" This work proposes TimeChat, a time-sensitive multimodal large language model\nspecifically designed for long video understanding. Our model incorporates two\nkey architectural contributions: (1) a timestamp-aware frame encoder that binds\nvisual content with the timestamp of each frame, and (2) a sliding video\nQ-Former that produces a video token sequence of varying lengths to accommodate\nvideos of various durations. Additionally, we construct an instruction-tuning\ndataset, encompassing 6 tasks and a total of 125K instances, to further enhance\nTimeChat's instruction-following performance. Experiment results across various\nvideo understanding tasks, such as dense captioning, temporal grounding, and\nhighlight detection, demonstrate TimeChat's strong zero-shot temporal\nlocalization and reasoning capabilities. 
For example, it achieves +9.2 F1 score\nand +2.8 CIDEr on YouCook2, +5.8 HIT@1 on QVHighlights, and +27.5 R@1 (IoU=0.5)\non Charades-STA, compared to state-of-the-art video large language models,\nholding the potential to serve as a versatile video assistant for long-form\nvideo comprehension tasks and satisfy realistic user requirements.\n","authors":["Shuhuai Ren","Linli Yao","Shicheng Li","Xu Sun","Lu Hou"],"pdf_url":"https://arxiv.org/pdf/2312.02051v1.pdf","comment":"17 pages, 10 figures, code is available at\n https://github.com/RenShuhuai-Andy/TimeChat"},{"id":"http://arxiv.org/abs/2311.14743v4","updated":"2023-12-04T16:31:30Z","published":"2023-11-21T18:41:26Z","title":"A Baseline Analysis of Reward Models' Ability To Accurately Analyze\n Foundation Models Under Distribution Shift","summary":" Foundation models, specifically Large Language Models (LLM's), have lately\ngained wide-spread attention and adoption. Reinforcement Learning with Human\nFeedback (RLHF) involves training a reward model to capture desired behaviors,\nwhich is then used to align LLM's. These reward models are additionally used at\ninference-time to estimate LLM responses' adherence to those desired behaviors.\nHowever, there is little work measuring how robust these reward models are to\ndistribution shifts. In this work, we evaluate how reward model performance -\nmeasured via accuracy and calibration (i.e. alignment between accuracy and\nconfidence) - is affected by distribution shift. We show novel calibration\npatterns and accuracy drops due to OOD prompts and responses, and that the\nreward model is more sensitive to shifts in responses than prompts.\nAdditionally, we adapt an OOD detection technique commonly used in\nclassification to the reward model setting to detect these distribution shifts\nin prompts and responses.\n","authors":["Ben Pikus","Will LeVine","Tony Chen","Sean Hendryx"],"pdf_url":"https://arxiv.org/pdf/2311.14743v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.09928v2","updated":"2023-12-04T16:18:33Z","published":"2022-12-20T00:33:11Z","title":"Improving the Robustness of Summarization Models by Detecting and\n Removing Input Noise","summary":" The evaluation of abstractive summarization models typically uses test data\nthat is identically distributed as training data. In real-world practice,\ndocuments to be summarized may contain input noise caused by text extraction\nartifacts or data pipeline bugs. The robustness of model performance under\ndistribution shift caused by such noise is relatively under-studied. We present\na large empirical study quantifying the sometimes severe loss in performance\n(up to 12 ROUGE-1 points) from different types of input noise for a range of\ndatasets and model sizes. We then propose a light-weight method for detecting\nand removing such noise in the input during model inference without requiring\nany extra training, auxiliary models, or even prior knowledge of the type of\nnoise. Our proposed approach effectively mitigates the loss in performance,\nrecovering a large fraction of the performance drop, sometimes as large as 11\nROUGE-1 points.\n","authors":["Kundan Krishna","Yao Zhao","Jie Ren","Balaji Lakshminarayanan","Jiaming Luo","Mohammad Saleh","Peter J. 
Liu"],"pdf_url":"https://arxiv.org/pdf/2212.09928v2.pdf","comment":"EMNLP Findings 2023 Camera Ready"},{"id":"http://arxiv.org/abs/2311.18803v2","updated":"2023-12-04T16:13:21Z","published":"2023-11-30T18:49:43Z","title":"BioCLIP: A Vision Foundation Model for the Tree of Life","summary":" Images of the natural world, collected by a variety of cameras, from drones\nto individual phones, are increasingly abundant sources of biological\ninformation. There is an explosion of computational methods and tools,\nparticularly computer vision, for extracting biologically relevant information\nfrom images for science and conservation. Yet most of these are bespoke\napproaches designed for a specific task and are not easily adaptable or\nextendable to new questions, contexts, and datasets. A vision model for general\norganismal biology questions on images is of timely need. To approach this, we\ncurate and release TreeOfLife-10M, the largest and most diverse ML-ready\ndataset of biology images. We then develop BioCLIP, a foundation model for the\ntree of life, leveraging the unique properties of biology captured by\nTreeOfLife-10M, namely the abundance and variety of images of plants, animals,\nand fungi, together with the availability of rich structured biological\nknowledge. We rigorously benchmark our approach on diverse fine-grained biology\nclassification tasks, and find that BioCLIP consistently and substantially\noutperforms existing baselines (by 17% to 20% absolute). Intrinsic evaluation\nreveals that BioCLIP has learned a hierarchical representation conforming to\nthe tree of life, shedding light on its strong generalizability. Our code,\nmodels and data will be made available at\nhttps://github.com/Imageomics/bioclip.\n","authors":["Samuel Stevens","Jiaman Wu","Matthew J Thompson","Elizabeth G Campolongo","Chan Hee Song","David Edward Carlyn","Li Dong","Wasila M Dahdul","Charles Stewart","Tanya Berger-Wolf","Wei-Lun Chao","Yu Su"],"pdf_url":"https://arxiv.org/pdf/2311.18803v2.pdf","comment":"18 pages; updated title"},{"id":"http://arxiv.org/abs/2310.02980v2","updated":"2023-12-04T16:01:13Z","published":"2023-10-04T17:17:06Z","title":"Never Train from Scratch: Fair Comparison of Long-Sequence Models\n Requires Data-Driven Priors","summary":" Modeling long-range dependencies across sequences is a longstanding goal in\nmachine learning and has led to architectures, such as state space models, that\ndramatically outperform Transformers on long sequences. However, these\nimpressive empirical gains have been by and large demonstrated on benchmarks\n(e.g. Long Range Arena), where models are randomly initialized and trained to\npredict a target label from an input sequence. In this work, we show that\nrandom initialization leads to gross overestimation of the differences between\narchitectures and that pretraining with standard denoising objectives, using\n$\\textit{only the downstream task data}$, leads to dramatic gains across\nmultiple architectures and to very small gaps between Transformers and state\nspace models (SSMs). In stark contrast to prior works, we find vanilla\nTransformers to match the performance of S4 on Long Range Arena when properly\npretrained, and we improve the best reported results of SSMs on the PathX-256\ntask by 20 absolute points. Subsequently, we analyze the utility of\npreviously-proposed structured parameterizations for SSMs and show they become\nmostly redundant in the presence of data-driven initialization obtained through\npretraining. 
Our work shows that, when evaluating different architectures on\nsupervised tasks, incorporation of data-driven priors via pretraining is\nessential for reliable performance estimation, and can be done efficiently.\n","authors":["Ido Amos","Jonathan Berant","Ankit Gupta"],"pdf_url":"https://arxiv.org/pdf/2310.02980v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14296v2","updated":"2023-12-04T15:53:50Z","published":"2023-05-23T17:39:54Z","title":"USB: A Unified Summarization Benchmark Across Tasks and Domains","summary":" While the NLP community has produced numerous summarization benchmarks, none\nprovide the rich annotations required to simultaneously address many important\nproblems related to control and reliability. We introduce a Wikipedia-derived\nbenchmark, complemented by a rich set of crowd-sourced annotations, that\nsupports $8$ interrelated tasks: (i) extractive summarization; (ii) abstractive\nsummarization; (iii) topic-based summarization; (iv) compressing selected\nsentences into a one-line summary; (v) surfacing evidence for a summary\nsentence; (vi) predicting the factual accuracy of a summary sentence; (vii)\nidentifying unsubstantiated spans in a summary sentence; (viii) correcting\nfactual errors in summaries. We compare various methods on this benchmark and\ndiscover that on multiple tasks, moderately-sized fine-tuned models\nconsistently outperform much larger few-shot prompted language models. For\nfactuality-related tasks, we also evaluate existing heuristics to create\ntraining data and find that training on them results in worse performance than\ntraining on $20\\times$ less human-labeled data. Our articles draw from $6$\ndomains, facilitating cross-domain analysis. On some tasks, the amount of\ntraining data matters more than the domain where it comes from, while for other\ntasks training specifically on data from the target domain, even if limited, is\nmore beneficial.\n","authors":["Kundan Krishna","Prakhar Gupta","Sanjana Ramprasad","Byron C. Wallace","Jeffrey P. Bigham","Zachary C. Lipton"],"pdf_url":"https://arxiv.org/pdf/2305.14296v2.pdf","comment":"EMNLP Findings 2023 Camera Ready"},{"id":"http://arxiv.org/abs/2311.17065v2","updated":"2023-12-04T15:37:57Z","published":"2023-11-22T17:14:18Z","title":"Efficient Deep Speech Understanding at the Edge","summary":" In contemporary speech understanding (SU), a sophisticated pipeline is\nemployed, encompassing the ingestion of streaming voice input. The pipeline\nexecutes beam search iteratively, invoking a deep neural network to generate\ntentative outputs (referred to as hypotheses) in an autoregressive manner.\nPeriodically, the pipeline assesses attention and Connectionist Temporal\nClassification (CTC) scores.\n This paper aims to enhance SU performance on edge devices with limited\nresources. Adopting a hybrid strategy, our approach focuses on accelerating\non-device execution and offloading inputs surpassing the device's capacity.\nWhile this approach is established, we tackle SU's distinctive challenges\nthrough innovative techniques: (1) Late Contextualization: This involves the\nparallel execution of a model's attentive encoder during input ingestion. (2)\nPilot Inference: Addressing temporal load imbalances in the SU pipeline, this\ntechnique aims to mitigate them effectively. 
(3) Autoregression Offramps:\nDecisions regarding offloading are made solely based on hypotheses, presenting\na novel approach.\n These techniques are designed to seamlessly integrate with existing speech\nmodels, pipelines, and frameworks, offering flexibility for independent or\ncombined application. Collectively, they form a hybrid solution for edge SU.\nOur prototype, named XYZ, has undergone testing on Arm platforms featuring 6 to\n8 cores, demonstrating state-of-the-art accuracy. Notably, it achieves a 2x\nreduction in end-to-end latency and a corresponding 2x decrease in offloading\nrequirements.\n","authors":["Rongxiang Wang","Felix Xiaozhu Lin"],"pdf_url":"https://arxiv.org/pdf/2311.17065v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.06851v3","updated":"2023-12-04T15:34:29Z","published":"2023-11-12T14:01:38Z","title":"Automatic Textual Normalization for Hate Speech Detection","summary":" Social media data is a valuable resource for research, yet it contains a wide\nrange of non-standard words (NSW). These irregularities hinder the effective\noperation of NLP tools. Current state-of-the-art methods for the Vietnamese\nlanguage address this issue as a problem of lexical normalization, involving\nthe creation of manual rules or the implementation of multi-staged deep\nlearning frameworks, which necessitate extensive efforts to craft intricate\nrules. In contrast, our approach is straightforward, employing solely a\nsequence-to-sequence (Seq2Seq) model. In this research, we provide a dataset\nfor textual normalization, comprising 2,181 human-annotated comments with an\ninter-annotator agreement of 0.9014. By leveraging the Seq2Seq model for\ntextual normalization, our results reveal that the accuracy achieved falls\nslightly short of 70%. Nevertheless, textual normalization enhances the\naccuracy of the Hate Speech Detection (HSD) task by approximately 2%,\ndemonstrating its potential to improve the performance of complex NLP tasks.\nOur dataset is accessible for research purposes.\n","authors":["Anh Thi-Hoang Nguyen","Dung Ha Nguyen","Nguyet Thi Nguyen","Khanh Thanh-Duy Ho","Kiet Van Nguyen"],"pdf_url":"https://arxiv.org/pdf/2311.06851v3.pdf","comment":"Accepted to present at 2023 International Conference on Intelligent\n Systems Design and Applications (ISDA2023)"},{"id":"http://arxiv.org/abs/2312.01957v1","updated":"2023-12-04T15:16:12Z","published":"2023-12-04T15:16:12Z","title":"Distilled Self-Critique of LLMs with Synthetic Data: a Bayesian\n Perspective","summary":" This paper proposes an interpretation of RLAIF as Bayesian inference by\nintroducing distilled Self-Critique (dSC), which refines the outputs of a LLM\nthrough a Gibbs sampler that is later distilled into a fine-tuned model. Only\nrequiring synthetic data, dSC is exercised in experiments regarding safety,\nsentiment, and privacy control, showing it can be a viable and cheap\nalternative to align LLMs. Code released at\n\\url{https://github.com/vicgalle/distilled-self-critique}.\n","authors":["Victor Gallego"],"pdf_url":"https://arxiv.org/pdf/2312.01957v1.pdf","comment":"Submitted to ICLR 2024 (TinyPapers track)"},{"id":"http://arxiv.org/abs/2312.01954v1","updated":"2023-12-04T15:12:04Z","published":"2023-12-04T15:12:04Z","title":"Zero- and Few-Shots Knowledge Graph Triplet Extraction with Large\n Language Models","summary":" In this work, we tested the Triplet Extraction (TE) capabilities of a variety\nof Large Language Models (LLMs) of different sizes in the Zero- and Few-Shots\nsettings. 
In detail, we proposed a pipeline that dynamically gathers contextual\ninformation from a Knowledge Base (KB), both in the form of context triplets\nand of (sentence, triplets) pairs as examples, and provides it to the LLM\nthrough a prompt. The additional context allowed the LLMs to be competitive\nwith all the older fully trained baselines based on the Bidirectional Long\nShort-Term Memory (BiLSTM) Network architecture. We further conducted a\ndetailed analysis of the quality of the gathered KB context, finding it to be\nstrongly correlated with the final TE performance of the model. In contrast,\nthe size of the model appeared to only logarithmically improve the TE\ncapabilities of the LLMs.\n","authors":["Andrea Papaluca","Daniel Krefl","Sergio Mendez Rodriguez","Artem Lensky","Hanna Suominen"],"pdf_url":"https://arxiv.org/pdf/2312.01954v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.12509v2","updated":"2023-12-04T15:07:13Z","published":"2023-06-21T18:45:56Z","title":"Joint Prompt Optimization of Stacked LLMs using Variational Inference","summary":" Large language models (LLMs) can be seen as atomic units of computation\nmapping sequences to a distribution over sequences. Thus, they can be seen as\nstochastic language layers in a language network, where the learnable\nparameters are the natural language prompts at each layer. By stacking two such\nlayers and feeding the output of one layer to the next, we obtain a Deep\nLanguage Network (DLN). We first show how to effectively perform prompt\noptimization for a 1-Layer language network (DLN-1). Then, we present an\nextension that applies to 2-layer DLNs (DLN-2), where two prompts must be\nlearned. The key idea is to consider the output of the first layer as a latent\nvariable, which requires inference, and prompts to be learned as the parameters\nof the generative distribution. We first test the effectiveness of DLN-1 in\nmultiple reasoning and natural language understanding tasks. Then, we show that\nDLN-2 can reach higher performance than a single layer, showing promise that we\nmight reach comparable performance to GPT-4, even when each LLM in the network\nis smaller and less powerful.\n","authors":["Alessandro Sordoni","Xingdi Yuan","Marc-Alexandre Côté","Matheus Pereira","Adam Trischler","Ziang Xiao","Arian Hosseini","Friederike Niedtner","Nicolas Le Roux"],"pdf_url":"https://arxiv.org/pdf/2306.12509v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.01921v1","updated":"2023-12-04T14:29:28Z","published":"2023-12-04T14:29:28Z","title":"A Machine Learning Approach Towards SKILL Code Autocompletion","summary":" As Moore's Law continues to increase the complexity of electronic systems,\nElectronic Design Automation (EDA) must advance to meet global demand. An\nimportant example of an EDA technology is SKILL, a scripting language used to\ncustomize and extend EDA software. Recently, code generation models using the\ntransformer architecture have achieved impressive results in academic settings\nand have even been used in commercial developer tools to improve developer\nproductivity. To the best of our knowledge, this study is the first to apply\ntransformers to SKILL code autocompletion towards improving the productivity of\nhardware design engineers. In this study, a novel, data-efficient methodology\nfor generating SKILL code is proposed and experimentally validated. 
More\nspecifically, we propose a novel methodology for (i) creating a high-quality\nSKILL dataset with both unlabeled and labeled data, (ii) a training strategy\nwhere T5 models pre-trained on general programming language code are fine-tuned\non our custom SKILL dataset using unsupervised and supervised learning, and\n(iii) evaluating synthesized SKILL code. We show that models trained using the\nproposed methodology outperform baselines in terms of human-judgment score and\nBLEU score. A major challenge faced was the extremely small amount of available\nSKILL code data that can be used to train a transformer model to generate SKILL\ncode. Despite our validated improvements, the extremely small dataset available\nto us was still not enough to train a model that can reliably autocomplete\nSKILL code. We discuss this and other limitations as well as future work that\ncould address these limitations.\n","authors":["Enrique Dehaerne","Bappaditya Dey","Wannes Meert"],"pdf_url":"https://arxiv.org/pdf/2312.01921v1.pdf","comment":"Accepted for SPIE Advanced Lithography + Patterning, 2024"},{"id":"http://arxiv.org/abs/2310.09909v3","updated":"2023-12-04T14:13:35Z","published":"2023-10-15T18:32:27Z","title":"Can GPT-4V(ision) Serve Medical Applications? Case Studies on GPT-4V for\n Multimodal Medical Diagnosis","summary":" Driven by the large foundation models, the development of artificial\nintelligence has witnessed tremendous progress lately, leading to a surge of\ngeneral interest from the public. In this study, we aim to assess the\nperformance of OpenAI's newest model, GPT-4V(ision), specifically in the realm\nof multimodal medical diagnosis. Our evaluation encompasses 17 human body\nsystems, including Central Nervous System, Head and Neck, Cardiac, Chest,\nHematology, Hepatobiliary, Gastrointestinal, Urogenital, Gynecology,\nObstetrics, Breast, Musculoskeletal, Spine, Vascular, Oncology, Trauma,\nPediatrics, with images taken from 8 modalities used in daily clinic routine,\ne.g., X-ray, Computed Tomography (CT), Magnetic Resonance Imaging (MRI),\nPositron Emission Tomography (PET), Digital Subtraction Angiography (DSA),\nMammography, Ultrasound, and Pathology. We probe the GPT-4V's ability on\nmultiple clinical tasks with or without patent history provided, including\nimaging modality and anatomy recognition, disease diagnosis, report generation,\ndisease localisation.\n Our observation shows that, while GPT-4V demonstrates proficiency in\ndistinguishing between medical image modalities and anatomy, it faces\nsignificant challenges in disease diagnosis and generating comprehensive\nreports. 
These findings underscore that while large multimodal models have made\nsignificant advancements in computer vision and natural language processing, it\nremains far from being used to effectively support real-world medical\napplications and clinical decision-making.\n All images used in this report can be found in\nhttps://github.com/chaoyi-wu/GPT-4V_Medical_Evaluation.\n","authors":["Chaoyi Wu","Jiayu Lei","Qiaoyu Zheng","Weike Zhao","Weixiong Lin","Xiaoman Zhang","Xiao Zhou","Ziheng Zhao","Ya Zhang","Yanfeng Wang","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2310.09909v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01242v2","updated":"2023-12-04T13:36:59Z","published":"2023-06-02T02:42:58Z","title":"Responsible Task Automation: Empowering Large Language Models as\n Responsible Task Automators","summary":" The recent success of Large Language Models (LLMs) signifies an impressive\nstride towards artificial general intelligence. They have shown a promising\nprospect in automatically completing tasks upon user instructions, functioning\nas brain-like coordinators. The associated risks will be revealed as we\ndelegate an increasing number of tasks to machines for automated completion. A\nbig question emerges: how can we make machines behave responsibly when helping\nhumans automate tasks as personal copilots? In this paper, we explore this\nquestion in depth from the perspectives of feasibility, completeness and\nsecurity. In specific, we present Responsible Task Automation (ResponsibleTA)\nas a fundamental framework to facilitate responsible collaboration between\nLLM-based coordinators and executors for task automation with three empowered\ncapabilities: 1) predicting the feasibility of the commands for executors; 2)\nverifying the completeness of executors; 3) enhancing the security (e.g., the\nprotection of users' privacy). We further propose and compare two paradigms for\nimplementing the first two capabilities. One is to leverage the generic\nknowledge of LLMs themselves via prompt engineering while the other is to adopt\ndomain-specific learnable models. Moreover, we introduce a local memory\nmechanism for achieving the third capability. We evaluate our proposed\nResponsibleTA on UI task automation and hope it could bring more attentions to\nensuring LLMs more responsible in diverse scenarios.\n","authors":["Zhizheng Zhang","Xiaoyi Zhang","Wenxuan Xie","Yan Lu"],"pdf_url":"https://arxiv.org/pdf/2306.01242v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01858v1","updated":"2023-12-04T12:45:30Z","published":"2023-12-04T12:45:30Z","title":"Evaluating Dependencies in Fact Editing for Language Models: Specificity\n and Implication Awareness","summary":" The potential of using a large language model (LLM) as a knowledge base (KB)\nhas sparked significant interest. To manage the knowledge acquired by LLMs, we\nneed to ensure that the editing of learned facts respects internal logical\nconstraints, which are known as dependency of knowledge. Existing work on\nediting LLMs has partially addressed the issue of dependency, when the editing\nof a fact should apply to its lexical variations without disrupting irrelevant\nones. However, they neglect the dependency between a fact and its logical\nimplications. We propose an evaluation protocol with an accompanying\nquestion-answering dataset, DepEdit, that provides a comprehensive assessment\nof the editing process considering the above notions of dependency. 
Our\nprotocol involves setting up a controlled environment in which we edit facts\nand monitor their impact on LLMs, along with their implications based on\nIf-Then rules. Extensive experiments on DepEdit show that existing knowledge\nediting methods are sensitive to the surface form of knowledge, and that they\nhave limited performance in inferring the implications of edited facts.\n","authors":["Zichao Li","Ines Arous","Siva Reddy","Jackie C. K. Cheung"],"pdf_url":"https://arxiv.org/pdf/2312.01858v1.pdf","comment":"Findings of EMNLP2023"},{"id":"http://arxiv.org/abs/2010.02180v3","updated":"2023-12-04T12:23:52Z","published":"2020-10-05T17:27:31Z","title":"Pareto Probing: Trading Off Accuracy for Complexity","summary":" The question of how to probe contextual word representations for linguistic\nstructure in a way that is both principled and useful has seen significant\nattention recently in the NLP literature. In our contribution to this\ndiscussion, we argue for a probe metric that reflects the fundamental trade-off\nbetween probe complexity and performance: the Pareto hypervolume. To measure\ncomplexity, we present a number of parametric and non-parametric metrics. Our\nexperiments using Pareto hypervolume as an evaluation metric show that probes\noften do not conform to our expectations -- e.g., why should the non-contextual\nfastText representations encode more morpho-syntactic information than the\ncontextual BERT representations? These results suggest that common, simplistic\nprobing tasks, such as part-of-speech labeling and dependency arc labeling, are\ninadequate to evaluate the linguistic structure encoded in contextual word\nrepresentations. This leads us to propose full dependency parsing as a probing\ntask. In support of our suggestion that harder probing tasks are necessary, our\nexperiments with dependency parsing reveal a wide gap in syntactic knowledge\nbetween contextual and non-contextual representations.\n","authors":["Tiago Pimentel","Naomi Saphra","Adina Williams","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2010.02180v3.pdf","comment":"Tiago Pimentel and Naomi Saphra contributed equally to this work.\n Camera ready version of EMNLP 2020 publication. In this new version, we fixed\n some notation issues in the appendix, and added a new appendix section\n describing our MLP. Code available in\n https://github.com/rycolab/pareto-probing"},{"id":"http://arxiv.org/abs/2312.01837v1","updated":"2023-12-04T12:20:25Z","published":"2023-12-04T12:20:25Z","title":"Prompting Disentangled Embeddings for Knowledge Graph Completion with\n Pre-trained Language Model","summary":" Both graph structures and textual information play a critical role in\nKnowledge Graph Completion (KGC). With the success of Pre-trained Language\nModels (PLMs) such as BERT, they have been applied for text encoding for KGC.\nHowever, the current methods mostly prefer to fine-tune PLMs, leading to huge\ntraining costs and limited scalability to larger PLMs. In contrast, we propose\nto utilize prompts and perform KGC on a frozen PLM with only the prompts\ntrained. Accordingly, we propose a new KGC method named PDKGC with two prompts\n-- a hard task prompt which is to adapt the KGC task to the PLM pre-training\ntask of token prediction, and a disentangled structure prompt which learns\ndisentangled graph representation so as to enable the PLM to combine more\nrelevant structure knowledge with the text information. 
With the two prompts,\nPDKGC builds a textual predictor and a structural predictor, respectively, and\ntheir combination leads to more comprehensive entity prediction. Solid\nevaluation on two widely used KGC datasets has shown that PDKGC often\noutperforms the baselines including the state-of-the-art, and its components\nare all effective. Our codes and data are available at\nhttps://github.com/genggengcss/PDKGC.\n","authors":["Yuxia Geng","Jiaoyan Chen","Yuhang Zeng","Zhuo Chen","Wen Zhang","Jeff Z. Pan","Yuxiang Wang","Xiaoliang Xu"],"pdf_url":"https://arxiv.org/pdf/2312.01837v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2312.01823v1","updated":"2023-12-04T11:53:56Z","published":"2023-12-04T11:53:56Z","title":"Exchange-of-Thought: Enhancing Large Language Model Capabilities through\n Cross-Model Communication","summary":" Large Language Models (LLMs) have recently made significant strides in\ncomplex reasoning tasks through the Chain-of-Thought technique. Despite this\nprogress, their reasoning is often constrained by their intrinsic\nunderstanding, lacking external insights. To address this, we propose\nExchange-of-Thought (EoT), a novel framework that enables cross-model\ncommunication during problem-solving. Drawing inspiration from network\ntopology, EoT integrates four unique communication paradigms: Memory, Report,\nRelay, and Debate. This paper delves into the communication dynamics and volume\nassociated with each paradigm. To counterbalance the risks of incorrect\nreasoning chains, we implement a robust confidence evaluation mechanism within\nthese communications. Our experiments across diverse complex reasoning tasks\ndemonstrate that EoT significantly surpasses established baselines,\nunderscoring the value of external insights in enhancing LLM performance.\nFurthermore, we show that EoT achieves these superior results in a\ncost-effective manner, marking a promising advancement for efficient and\ncollaborative AI problem-solving.\n","authors":["Zhangyue Yin","Qiushi Sun","Cheng Chang","Qipeng Guo","Junqi Dai","Xuanjing Huang","Xipeng Qiu"],"pdf_url":"https://arxiv.org/pdf/2312.01823v1.pdf","comment":"19 pages, 11 figures, accepted by EMNLP2023"},{"id":"http://arxiv.org/abs/2312.00372v2","updated":"2023-12-04T11:42:35Z","published":"2023-12-01T06:30:31Z","title":"Event-driven Real-time Retrieval in Web Search","summary":" Information retrieval in real-time search presents unique challenges distinct\nfrom those encountered in classical web search. These challenges are\nparticularly pronounced due to the rapid change of user search intent, which is\ninfluenced by the occurrence and evolution of breaking news events, such as\nearthquakes, elections, and wars. Previous dense retrieval methods, which\nprimarily focused on static semantic representation, lack the capacity to\ncapture immediate search intent, leading to inferior performance in retrieving\nthe most recent event-related documents in time-sensitive scenarios. To address\nthis issue, this paper expands the query with event information that represents\nreal-time search intent. The Event information is then integrated with the\nquery through a cross-attention mechanism, resulting in a time-context query\nrepresentation. We further enhance the model's capacity for event\nrepresentation through multi-task training. 
Since publicly available datasets\nsuch as MS-MARCO do not contain any event information on the query side and\nhave few time-sensitive queries, we design an automatic data collection and\nannotation pipeline to address this issue, which includes ModelZoo-based Coarse\nAnnotation and LLM-driven Fine Annotation processes. In addition, we share the\ntraining tricks such as two-stage training and hard negative sampling. Finally,\nwe conduct a set of offline experiments on a million-scale production dataset\nto evaluate our approach and deploy an A/B testing in a real online system to\nverify the performance. Extensive experimental results demonstrate that our\nproposed approach significantly outperforms existing state-of-the-art baseline\nmethods.\n","authors":["Nan Yang","Shusen Zhang","Yannan Zhang","Xiaoling Bai","Hualong Deng","Tianhua Zhou","Jin Ma"],"pdf_url":"https://arxiv.org/pdf/2312.00372v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.16458v4","updated":"2023-12-04T11:05:29Z","published":"2023-08-31T04:52:58Z","title":"BioCoder: A Benchmark for Bioinformatics Code Generation with Contextual\n Pragmatic Knowledge","summary":" Pre-trained large language models have significantly improved code\ngeneration. As these models scale up, there is an increasing need for the\noutput to handle more intricate tasks and to be appropriately specialized to\nparticular domains. Here, we target bioinformatics due to the amount of\nspecialized domain knowledge, algorithms, and data operations this discipline\nrequires. We present BioCoder, a benchmark developed to evaluate large language\nmodels (LLMs) in generating bioinformatics-specific code. BioCoder spans a\nbroad spectrum of the field and covers cross-file dependencies, class\ndeclarations, and global variables. It incorporates 1026 Python functions and\n1243 Java methods extracted from GitHub, along with 253 examples from the\nRosalind Project, all pertaining to bioinformatics. Using topic modeling we\nshow that overall coverage of the included code is representative of the full\nspectrum of bioinformatics calculations. BioCoder incorporates a fuzz-testing\nframework for evaluation. We have applied it to evaluate many models including\nInCoder, CodeGen, CodeGen2, SantaCoder, StarCoder, StarCoder+, InstructCodeT5+,\nGPT-3.5, and GPT-4. Furthermore, we finetuned StarCoder, demonstrating how our\ndataset can effectively enhance the performance of LLMs on our benchmark (by\n>15% in terms of Pass@K in certain prompt configurations and always >3%). The\nresults highlight two key aspects of successful models: (1) Successful models\naccommodate a long prompt (> ~2600 tokens) with full context, for functional\ndependencies. (2) They contain specific domain knowledge of bioinformatics,\nbeyond just general coding knowledge. This is evident from the performance gain\nof GPT-3.5/4 compared to the smaller models on the benchmark (50% vs up to\n~25%). 
Our dataset, benchmark, Docker images, and scripts required for testing\nare all available at https://github.com/gersteinlab/biocoder.\n","authors":["Xiangru Tang","Bill Qian","Rick Gao","Jiakang Chen","Xinyun Chen","Mark Gerstein"],"pdf_url":"https://arxiv.org/pdf/2308.16458v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15786v2","updated":"2023-12-04T10:20:57Z","published":"2023-11-27T13:01:59Z","title":"YUAN 2.0: A Large Language Model with Localized Filtering-based\n Attention","summary":" In this work, we develop and release Yuan 2.0, a series of large language\nmodels with parameters ranging from 2.1 billion to 102.6 billion. The Localized\nFiltering-based Attention (LFA) is introduced to incorporate prior knowledge of\nlocal dependencies of natural language into Attention. A data filtering and\ngenerating system is presented to build pre-training and fine-tuning dataset in\nhigh quality. A distributed training method with non-uniform pipeline parallel,\ndata parallel, and optimizer parallel is proposed, which greatly reduces the\nbandwidth requirements of intra-node communication, and achieves good\nperformance in large-scale distributed training. Yuan 2.0 models display\nimpressive ability in code generation, math problem-solving, and chatting\ncompared with existing models. The latest version of YUAN 2.0, including model\nweights and source code, is accessible at Github.\n","authors":["Shaohua Wu","Xudong Zhao","Shenling Wang","Jiangang Luo","Lingjun Li","Xi Chen","Bing Zhao","Wei Wang","Tong Yu","Rongguo Zhang","Jiahua Zhang","Chao Wang"],"pdf_url":"https://arxiv.org/pdf/2311.15786v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01787v1","updated":"2023-12-04T10:20:36Z","published":"2023-12-04T10:20:36Z","title":"Developing Linguistic Patterns to Mitigate Inherent Human Bias in\n Offensive Language Detection","summary":" With the proliferation of social media, there has been a sharp increase in\noffensive content, particularly targeting vulnerable groups, exacerbating\nsocial problems such as hatred, racism, and sexism. Detecting offensive\nlanguage use is crucial to prevent offensive language from being widely shared\non social media. However, the accurate detection of irony, implication, and\nvarious forms of hate speech on social media remains a challenge. Natural\nlanguage-based deep learning models require extensive training with large,\ncomprehensive, and labeled datasets. Unfortunately, manually creating such\ndatasets is both costly and error-prone. Additionally, the presence of\nhuman-bias in offensive language datasets is a major concern for deep learning\nmodels. In this paper, we propose a linguistic data augmentation approach to\nreduce bias in labeling processes, which aims to mitigate the influence of\nhuman bias by leveraging the power of machines to improve the accuracy and\nfairness of labeling processes. 
This approach has the potential to improve\noffensive language classification tasks across multiple languages and reduce\nthe prevalence of offensive content on social media.\n","authors":["Toygar Tanyel","Besher Alkurdi","Serkan Ayvaz"],"pdf_url":"https://arxiv.org/pdf/2312.01787v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11550v3","updated":"2023-12-04T10:15:37Z","published":"2023-05-19T09:36:48Z","title":"Viewing Knowledge Transfer in Multilingual Machine Translation Through a\n Representational Lens","summary":" We argue that translation quality alone is not a sufficient metric for\nmeasuring knowledge transfer in multilingual neural machine translation. To\nsupport this claim, we introduce Representational Transfer Potential (RTP),\nwhich measures representational similarities between languages. We show that\nRTP can measure both positive and negative transfer (interference), and find\nthat RTP is strongly correlated with changes in translation quality, indicating\nthat transfer does occur. Furthermore, we investigate data and language\ncharacteristics that are relevant for transfer, and find that multi-parallel\noverlap is an important yet under-explored feature. Based on this, we develop a\nnovel training scheme, which uses an auxiliary similarity loss that encourages\nrepresentations to be more invariant across languages by taking advantage of\nmulti-parallel data. We show that our method yields increased translation\nquality for low- and mid-resource languages across multiple data and model\nsetups.\n","authors":["David Stap","Vlad Niculae","Christof Monz"],"pdf_url":"https://arxiv.org/pdf/2305.11550v3.pdf","comment":"Accepted to EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2310.15896v2","updated":"2023-12-04T09:26:22Z","published":"2023-10-24T14:57:34Z","title":"BianQue: Balancing the Questioning and Suggestion Ability of Health LLMs\n with Multi-turn Health Conversations Polished by ChatGPT","summary":" Large language models (LLMs) have performed well in providing general and\nextensive health suggestions in single-turn conversations, exemplified by\nsystems such as ChatGPT, ChatGLM, ChatDoctor, DoctorGLM, and etc. However, the\nlimited information provided by users during single turn results in inadequate\npersonalization and targeting of the generated suggestions, which requires\nusers to independently select the useful part. It is mainly caused by the\nmissing ability to engage in multi-turn questioning. In real-world medical\nconsultations, doctors usually employ a series of iterative inquiries to\ncomprehend the patient's condition thoroughly, enabling them to provide\neffective and personalized suggestions subsequently, which can be defined as\nchain of questioning (CoQ) for LLMs. To improve the CoQ of LLMs, we propose\nBianQue, a ChatGLM-based LLM finetuned with the self-constructed health\nconversation dataset BianQueCorpus that is consist of multiple turns of\nquestioning and health suggestions polished by ChatGPT. 
Experimental results\ndemonstrate that the proposed BianQue can simultaneously balance the\ncapabilities of both questioning and health suggestions, which will help\npromote the research and application of LLMs in the field of proactive health.\n","authors":["Yirong Chen","Zhenyu Wang","Xiaofen Xing","huimin zheng","Zhipei Xu","Kai Fang","Junhong Wang","Sihang Li","Jieling Wu","Qi Liu","Xiangmin Xu"],"pdf_url":"https://arxiv.org/pdf/2310.15896v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11753v2","updated":"2023-12-04T09:05:05Z","published":"2023-10-18T07:28:12Z","title":"Bias in Emotion Recognition with ChatGPT","summary":" This technical report explores the ability of ChatGPT in recognizing emotions\nfrom text, which can be the basis of various applications like interactive\nchatbots, data annotation, and mental health analysis. While prior research has\nshown ChatGPT's basic ability in sentiment analysis, its performance in more\nnuanced emotion recognition is not yet explored. Here, we conducted experiments\nto evaluate its performance of emotion recognition across different datasets\nand emotion labels. Our findings indicate a reasonable level of reproducibility\nin its performance, with noticeable improvement through fine-tuning. However,\nthe performance varies with different emotion labels and datasets, highlighting\nan inherent instability and possible bias. The choice of dataset and emotion\nlabels significantly impacts ChatGPT's emotion recognition performance. This\npaper sheds light on the importance of dataset and label selection, and the\npotential of fine-tuning in enhancing ChatGPT's emotion recognition\ncapabilities, providing a groundwork for better integration of emotion analysis\nin applications using ChatGPT.\n","authors":["Naoki Wake","Atsushi Kanehira","Kazuhiro Sasabuchi","Jun Takamatsu","Katsushi Ikeuchi"],"pdf_url":"https://arxiv.org/pdf/2310.11753v2.pdf","comment":"5 pages, 4 figures, 6 tables"},{"id":"http://arxiv.org/abs/2305.17691v2","updated":"2023-12-04T08:33:13Z","published":"2023-05-28T10:58:00Z","title":"Plug-and-Play Knowledge Injection for Pre-trained Language Models","summary":" Injecting external knowledge can improve the performance of pre-trained\nlanguage models (PLMs) on various downstream NLP tasks. However, massive\nretraining is required to deploy new knowledge injection methods or knowledge\nbases for downstream tasks. In this work, we are the first to study how to\nimprove the flexibility and efficiency of knowledge injection by reusing\nexisting downstream models. To this end, we explore a new paradigm\nplug-and-play knowledge injection, where knowledge bases are injected into\nfrozen existing downstream models by a knowledge plugin. Correspondingly, we\npropose a plug-and-play injection method map-tuning, which trains a mapping of\nknowledge embeddings to enrich model inputs with mapped embeddings while\nkeeping model parameters frozen. Experimental results on three knowledge-driven\nNLP tasks show that existing injection methods are not suitable for the new\nparadigm, while map-tuning effectively improves the performance of downstream\nmodels. Moreover, we show that a frozen downstream model can be well adapted to\ndifferent domains with different mapping networks of domain knowledge. 
Our code\nand models are available at https://github.com/THUNLP/Knowledge-Plugin.\n","authors":["Zhengyan Zhang","Zhiyuan Zeng","Yankai Lin","Huadong Wang","Deming Ye","Chaojun Xiao","Xu Han","Zhiyuan Liu","Peng Li","Maosong Sun","Jie Zhou"],"pdf_url":"https://arxiv.org/pdf/2305.17691v2.pdf","comment":"ACL 2023"},{"id":"http://arxiv.org/abs/2312.01714v1","updated":"2023-12-04T08:07:21Z","published":"2023-12-04T08:07:21Z","title":"Retrieval-augmented Multi-modal Chain-of-Thoughts Reasoning for Large\n Language Models","summary":" The advancement of Large Language Models(LLMs) has brought substantial\nattention to the Chain of Thought(CoT) approach, primarily due to its ability\nto enhance the capability of LLMs on tasks requiring complex reasoning.\nMoreover, the significance of CoT approaches extends to the application of LLMs\nfor multi-modal tasks, such as multi-modal question answering. However, the\nselection of optimal CoT demonstration examples in multi-modal reasoning for\nLLMs remains less explored for LLMs due to the inherent complexity of\nmulti-modal examples. In this paper, we introduce a novel approach that\naddresses this challenge by using retrieval mechanisms to dynamically and\nautomatically select demonstration examples based on cross-modal similarities.\nThis method aims to refine the CoT reasoning process in multi-modal scenarios\nvia informing LLMs with more relevant and informative examples. Furthermore, we\nemploy a stratified sampling method categorising demonstration examples into\ngroups based on their types and retrieving examples from different groups\nrespectively to promote the diversity of demonstration examples. Through a\nseries of experiments, we demonstrate that our approach significantly improves\nthe performance of LLMs, achieving state-of-the-art results in multi-modal\nreasoning tasks. Specifically, our methods demonstrate significant advancements\non the ScienceQA dataset. While our method based on ChatGPT outperforms the\nChameleon(ChatGPT) by 2.74% with an accuracy of 82.67%, the GPT4-based approach\nsurpasses the Chameleon(GPT-4) by 0.89%, achieving 87.43% on accuracy under the\nsame setting. Moreover, our best performing show a 6.05% increase over\nChameleon for ChatGPT-based models and a 4.57% increase for GPT-4-based models.\n","authors":["Bingshuai Liu","Chenyang Lyu","Zijun Min","Zhanyu Wang","Jinsong Su","Longyue Wang"],"pdf_url":"https://arxiv.org/pdf/2312.01714v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2312.01701v1","updated":"2023-12-04T07:43:02Z","published":"2023-12-04T07:43:02Z","title":"Mitigating Fine-Grained Hallucination by Fine-Tuning Large\n Vision-Language Models with Caption Rewrites","summary":" Large language models (LLMs) have shown remarkable performance in natural\nlanguage processing (NLP) tasks. To comprehend and execute diverse human\ninstructions over image data, instruction-tuned large vision-language models\n(LVLMs) have been introduced. However, LVLMs may suffer from different types of\nobject hallucinations. Nevertheless, LVLMs are evaluated for coarse-grained\nobject hallucinations only (i.e., generated objects non-existent in the input\nimage). The fine-grained object attributes and behaviors non-existent in the\nimage may still be generated but not measured by the current evaluation\nmethods. In this paper, we thus focus on reducing fine-grained hallucinations\nof LVLMs. 
We propose \\textit{ReCaption}, a framework that consists of two\ncomponents: rewriting captions using ChatGPT and fine-tuning the\ninstruction-tuned LVLMs on the rewritten captions. We also propose a\nfine-grained probing-based evaluation method named \\textit{Fine-Grained Object\nHallucination Evaluation} (\\textit{FGHE}). Our experiment results demonstrate\nthat ReCaption effectively reduces fine-grained object hallucination for\ndifferent LVLM options and improves their text generation quality. The code can\nbe found at https://github.com/Anonymousanoy/FOHE.\n","authors":["Lei Wang","Jiabang He","Shenshen Li","Ning Liu","Ee-Peng Lim"],"pdf_url":"https://arxiv.org/pdf/2312.01701v1.pdf","comment":"MMM 2024"},{"id":"http://arxiv.org/abs/2312.01700v1","updated":"2023-12-04T07:42:16Z","published":"2023-12-04T07:42:16Z","title":"Data Management For Large Language Models: A Survey","summary":" Data plays a fundamental role in the training of Large Language Models\n(LLMs). Effective data management, particularly in the formulation of a\nwell-suited training dataset, holds significance for enhancing model\nperformance and improving training efficiency during pretraining and supervised\nfine-tuning phases. Despite the considerable importance of data management, the\ncurrent research community still falls short in providing a systematic analysis\nof the rationale behind management strategy selection, its consequential\neffects, methodologies for evaluating curated datasets, and the ongoing pursuit\nof improved strategies. Consequently, the exploration of data management has\nattracted more and more attention among the research community. This survey\nprovides a comprehensive overview of current research in data management within\nboth the pretraining and supervised fine-tuning stages of LLMs, covering\nvarious noteworthy aspects of data management strategy design: data quantity,\ndata quality, domain/task composition, etc. Looking toward the future, we\nextrapolate existing challenges and outline promising directions for\ndevelopment in this field. Therefore, this survey serves as a guiding resource\nfor practitioners aspiring to construct powerful LLMs through effective data\nmanagement practices. The collection of the latest papers is available at\nhttps://github.com/ZigeW/data_management_LLM.\n","authors":["Zige Wang","Wanjun Zhong","Yufei Wang","Qi Zhu","Fei Mi","Baojun Wang","Lifeng Shang","Xin Jiang","Qun Liu"],"pdf_url":"https://arxiv.org/pdf/2312.01700v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2312.01678v1","updated":"2023-12-04T07:01:54Z","published":"2023-12-04T07:01:54Z","title":"Jellyfish: A Large Language Model for Data Preprocessing","summary":" In this paper, we present Jellyfish, an open-source LLM as a universal task\nsolver for DP. Built on the Llama 2 13B model, Jellyfish is instruction-tuned\nwith the datasets of several typical DP tasks including error detection, data\nimputation, schema matching, and entity matching, and delivers generalizability\nto other tasks. Remarkably, Jellyfish can operate on a local, single, and\nlow-priced GPU with its 13 billion parameters, ensuring data security and\nenabling further tuning. Its proficiency in understanding natural language\nallows users to manually craft instructions for DP tasks. Unlike many existing\nmethods that heavily rely on prior knowledge, Jellyfish acquires domain\nknowledge during its tuning process and integrates optional knowledge injection\nduring inference. 
A distinctive feature of Jellyfish is its interpreter, which\nelucidates its output decisions. To construct Jellyfish, we develop a series of\npre-tuning and DP-tuning techniques. Jellyfish is equipped with an instance\nserializer, which automatically translates raw data into model prompts, and a\nknowledge injector, which optionally introduces task- and dataset-specific\nknowledge to enhance DP performance. Our evaluation of Jellyfish, using a range\nof real datasets, shows its competitiveness compared to state-of-the-art\nmethods and its strong generalizability to unseen tasks. Jellyfish's\nperformance rivals that of GPT series models, and its interpreter offers\nenhanced reasoning capabilities compared to GPT-3.5. Furthermore, our\nevaluation highlights the effectiveness of the techniques employed in\nconstructing Jellyfish. Our model is available at Hugging Face:\nhttps://huggingface.co/NECOUDBFM/Jellyfish .\n","authors":["Haochen Zhang","Yuyang Dong","Chuan Xiao","Masafumi Oyamada"],"pdf_url":"https://arxiv.org/pdf/2312.01678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15452v5","updated":"2023-12-04T07:01:15Z","published":"2023-08-29T17:22:39Z","title":"When Do Program-of-Thoughts Work for Reasoning?","summary":" In the realm of embodied artificial intelligence, the reasoning capabilities\nof Large Language Models (LLMs) play a pivotal role. Although there are\neffective methods like program-of-thought prompting for LLMs which uses\nprogramming language to tackle complex reasoning tasks, the specific impact of\ncode data on the improvement of reasoning capabilities remains under-explored.\nTo address this gap, we propose complexity-impacted reasoning score (CIRS),\nwhich combines structural and logical attributes, to measure the correlation\nbetween code and reasoning abilities. Specifically, we use the abstract syntax\ntree to encode the structural information and calculate logical complexity by\nconsidering the difficulty and the cyclomatic complexity. Through an empirical\nanalysis, we find not all code data of complexity can be learned or understood\nby LLMs. Optimal level of complexity is critical to the improvement of\nreasoning abilities by program-aided prompting. Then we design an\nauto-synthesizing and stratifying algorithm, and apply it to instruction\ngeneration for mathematical reasoning and code data filtering for code\ngeneration tasks. Extensive results demonstrates the effectiveness of our\nproposed approach. Code will be integrated into the EasyInstruct framework at\nhttps://github.com/zjunlp/EasyInstruct.\n","authors":["Zhen Bi","Ningyu Zhang","Yinuo Jiang","Shumin Deng","Guozhou Zheng","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2308.15452v5.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2309.09126v2","updated":"2023-12-04T06:53:49Z","published":"2023-09-17T01:36:02Z","title":"How much can ChatGPT really help Computational Biologists in\n Programming?","summary":" ChatGPT, a recently developed product by openAI, is successfully leaving its\nmark as a multi-purpose natural language based chatbot. In this paper, we are\nmore interested in analyzing its potential in the field of computational\nbiology. 
A major share of work done by computational biologists these days\ninvolve coding up bioinformatics algorithms, analyzing data, creating\npipelining scripts and even machine learning modeling and feature extraction.\nThis paper focuses on the potential influence (both positive and negative) of\nChatGPT in the mentioned aspects with illustrative examples from different\nperspectives. Compared to other fields of computer science, computational\nbiology has - (1) less coding resources, (2) more sensitivity and bias issues\n(deals with medical data) and (3) more necessity of coding assistance (people\nfrom diverse background come to this field). Keeping such issues in mind, we\ncover use cases such as code writing, reviewing, debugging, converting,\nrefactoring and pipelining using ChatGPT from the perspective of computational\nbiologists in this paper.\n","authors":["Chowdhury Rafeed Rahman","Limsoon Wong"],"pdf_url":"https://arxiv.org/pdf/2309.09126v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.13760v2","updated":"2023-12-04T06:47:21Z","published":"2023-10-20T18:43:49Z","title":"Enhancing Abstractiveness of Summarization Models through Calibrated\n Distillation","summary":" Sequence-level knowledge distillation reduces the size of Seq2Seq models for\nmore efficient abstractive summarization. However, it often leads to a loss of\nabstractiveness in summarization. In this paper, we propose a novel approach\nnamed DisCal to enhance the level of abstractiveness (measured by n-gram\noverlap) without sacrificing the informativeness (measured by ROUGE) of\ngenerated summaries. DisCal exposes diverse pseudo summaries with two\nsupervision to the student model. Firstly, the best pseudo summary is\nidentified in terms of abstractiveness and informativeness and used for\nsequence-level distillation. Secondly, their ranks are used to ensure the\nstudent model to assign higher prediction scores to summaries with higher\nranks. Our experiments show that DisCal outperforms prior methods in\nabstractive summarization distillation, producing highly abstractive and\ninformative summaries.\n","authors":["Hwanjun Song","Igor Shalyminov","Hang Su","Siffi Singh","Kaisheng Yao","Saab Mansour"],"pdf_url":"https://arxiv.org/pdf/2310.13760v2.pdf","comment":"Accepted at EMNLP-Findings 2023"},{"id":"http://arxiv.org/abs/2312.01672v1","updated":"2023-12-04T06:45:47Z","published":"2023-12-04T06:45:47Z","title":"STADEE: STAtistics-based DEEp Detection of Machine Generated Text","summary":" We present STADEE, a \\textbf{STA}tistics-based \\textbf{DEE}p detection method\nto identify machine-generated text, addressing the limitations of current\nmethods that rely heavily on fine-tuning pre-trained language models (PLMs).\nSTADEE integrates key statistical text features with a deep classifier,\nfocusing on aspects like token probability and cumulative probability, crucial\nfor handling nucleus sampling. 
Tested across diverse datasets and scenarios\n(in-domain, out-of-domain, and in-the-wild), STADEE demonstrates superior\nperformance, achieving an 87.05% F1 score in-domain and outperforming both\ntraditional statistical methods and fine-tuned PLMs, especially in\nout-of-domain and in-the-wild settings, highlighting its effectiveness and\ngeneralizability.\n","authors":["Zheng Chen","Huming Liu"],"pdf_url":"https://arxiv.org/pdf/2312.01672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15500v2","updated":"2023-12-04T06:24:02Z","published":"2023-11-27T02:55:34Z","title":"Function-constrained Program Synthesis","summary":" This work introduces (1) a technique that allows large language models (LLMs)\nto leverage user-provided code when solving programming tasks and (2) a method\nto iteratively generate modular sub-functions that can aid future code\ngeneration attempts when the initial code generated by the LLM is inadequate.\nGenerating computer programs in general-purpose programming languages like\nPython poses a challenge for LLMs when instructed to use code provided in the\nprompt. Code-specific LLMs (e.g., GitHub Copilot, CodeLlama2) can generate code\ncompletions in real-time by drawing on all code available in a development\nenvironment. However, restricting code-specific LLMs to use only in-context\ncode is not straightforward, as the model is not explicitly instructed to use\nthe user-provided code and users cannot highlight precisely which snippets of\ncode the model should incorporate into its context. Moreover, current systems\nlack effective recovery methods, forcing users to iteratively re-prompt the\nmodel with modified prompts until a sufficient solution is reached. Our method\ndiffers from traditional LLM-powered code-generation by constraining\ncode-generation to an explicit function set and enabling recovery from failed\nattempts through automatically generated sub-functions. When the LLM cannot\nproduce working code, we generate modular sub-functions to aid subsequent\nattempts at generating functional code. A by-product of our method is a library\nof reusable sub-functions that can solve related tasks, imitating a software\nteam where efficiency scales with experience. We also introduce a new\n\"half-shot\" evaluation paradigm that provides tighter estimates of LLMs' coding\nabilities compared to traditional zero-shot evaluation. Our proposed evaluation\nmethod encourages models to output solutions in a structured format, decreasing\nsyntax errors that can be mistaken for poor coding ability.\n","authors":["Patrick Hajali","Ignas Budvytis"],"pdf_url":"https://arxiv.org/pdf/2311.15500v2.pdf","comment":"17 pages, 6 figures, 2023 NeurIPS R0-Fomo Workshop; corrected typo on\n fig 1 caption"},{"id":"http://arxiv.org/abs/2312.01661v1","updated":"2023-12-04T06:23:37Z","published":"2023-12-04T06:23:37Z","title":"ChatGPT as a Math Questioner? Evaluating ChatGPT on Generating\n Pre-university Math Questions","summary":" Mathematical questioning is crucial for assessing students problem-solving\nskills. Since manually creating such questions requires substantial effort,\nautomatic methods have been explored. Existing state-of-the-art models rely on\nfine-tuning strategies and struggle to generate questions that heavily involve\nmultiple steps of logical and arithmetic reasoning. Meanwhile, large language\nmodels(LLMs) such as ChatGPT have excelled in many NLP tasks involving logical\nand arithmetic reasoning. 
Nonetheless, their applications in generating\neducational questions are underutilized, especially in the field of\nmathematics. To bridge this gap, we take the first step to conduct an in-depth\nanalysis of ChatGPT in generating pre-university math questions. Our analysis\nis categorized into two main settings: context-aware and context-unaware. In\nthe context-aware setting, we evaluate ChatGPT on existing math\nquestion-answering benchmarks covering elementary, secondary, and ternary\nclasses. In the context-unaware setting, we evaluate ChatGPT in generating math\nquestions for each lesson from pre-university math curriculums that we crawl.\nOur crawling results in TopicMath, a comprehensive and novel collection of\npre-university math curriculums collected from 121 math topics and 428 lessons\nfrom elementary, secondary, and tertiary classes. Through this analysis, we aim\nto provide insight into the potential of ChatGPT as a math questioner.\n","authors":["Phuoc Pham Van Long","Duc Anh Vu","Nhat M. Hoang","Xuan Long Do","Anh Tuan Luu"],"pdf_url":"https://arxiv.org/pdf/2312.01661v1.pdf","comment":"Accepted at the 39th ACM/SIGAPP Symposium On Applied Computing (SAC\n 2024), Main Conference"},{"id":"http://arxiv.org/abs/2312.01648v1","updated":"2023-12-04T06:01:32Z","published":"2023-12-04T06:01:32Z","title":"Characterizing Large Language Model Geometry Solves Toxicity Detection\n and Generation","summary":" Large Language Models~(LLMs) drive current AI breakthroughs despite very\nlittle being known about their internal representations, e.g., how to extract a\nfew informative features to solve various downstream tasks. To provide a\npractical and principled answer, we propose to characterize LLMs from a\ngeometric perspective. We obtain in closed form (i) the intrinsic dimension in\nwhich the Multi-Head Attention embeddings are constrained to exist and (ii) the\npartition and per-region affine mappings of the per-layer feedforward networks.\nOur results are informative, do not rely on approximations, and are actionable.\nFirst, we show that, motivated by our geometric interpretation, we can bypass\nLlama$2$'s RLHF by controlling its embedding's intrinsic dimension through\ninformed prompt manipulation. Second, we derive $7$ interpretable spline\nfeatures that can be extracted from any (pre-trained) LLM layer, providing a\nrich abstract representation of their inputs. Those features alone ($224$ for\nMistral-7B and Llama$2$-7B) are sufficient to help solve toxicity detection,\ninfer the domain of the prompt, and even tackle the Jigsaw challenge, which\naims at characterizing the type of toxicity of various prompts. Our results\ndemonstrate how, even in large-scale regimes, exact theoretical results can\nanswer practical questions in language models. Code:\n\\url{https://github.com/RandallBalestriero/SplineLLM}.\n","authors":["Randall Balestriero","Romain Cosentino","Sarath Shekkizhar"],"pdf_url":"https://arxiv.org/pdf/2312.01648v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01642v1","updated":"2023-12-04T05:48:18Z","published":"2023-12-04T05:48:18Z","title":"Voice-Based Smart Assistant System for Vehicles using RASA","summary":" Conversational AIs, or chatbots, mimic human speech when conversing. Smart\nassistants facilitate the automation of several tasks that needed human\nintervention earlier. Because of their accuracy, absence of dependence on human\nresources, and accessibility around the clock, chatbots can be employed in\nvehicles too. 
Due to people's propensity to divert their attention away from\nthe task of driving while engaging in other activities like calling, playing\nmusic, navigation, and getting updates on the weather forecast and latest news,\nroad safety has declined and accidents have increased as a result. It would be\nadvantageous to automate these tasks using voice commands rather than carrying\nthem out manually. This paper focuses on the development of a voice-based smart\nassistance application for vehicles based on the RASA framework. The smart\nassistant provides functionalities like navigation, communication via calls,\ngetting weather forecasts and the latest news updates, and music that are\ncompletely voice-based in nature.\n","authors":["Aditya Paranjape","Yash Patwardhan","Vedant Deshpande","Aniket Darp","Jayashree Jagdale"],"pdf_url":"https://arxiv.org/pdf/2312.01642v1.pdf","comment":"6 pages, 4 figures, accepted at IEEE International Conference on\n Computational Intelligence, Networks and Security ICCINS-2023"},{"id":"http://arxiv.org/abs/2305.13256v2","updated":"2023-12-04T04:35:55Z","published":"2023-05-22T17:27:57Z","title":"TaskWeb: Selecting Better Source Tasks for Multi-task NLP","summary":" Recent work in NLP has shown promising results in training models on large\namounts of tasks to achieve better generalization. However, it is not\nwell-understood how tasks are related, and how helpful training tasks can be\nchosen for a new task. In this work, we investigate whether knowing task\nrelationships via pairwise task transfer improves choosing one or more source\ntasks that help to learn a new target task. We provide TaskWeb, a large-scale\nbenchmark of pairwise task transfers for 22 NLP tasks using three different\nmodel types, sizes, and adaptation methods, spanning about 25,000 experiments.\nThen, we design a new method TaskShop based on our analysis of TaskWeb.\nTaskShop uses TaskWeb to estimate the benefit of using a source task for\nlearning a new target task, and to choose a subset of helpful training tasks\nfor multi-task training. Our method improves overall rankings and top-k\nprecision of source tasks by 10% and 38%, respectively. We also use TaskShop to\nbuild much smaller multi-task training sets that improve zero-shot performances\nacross 11 different target tasks by at least 4.3%.\n","authors":["Joongwon Kim","Akari Asai","Gabriel Ilharco","Hannaneh Hajishirzi"],"pdf_url":"https://arxiv.org/pdf/2305.13256v2.pdf","comment":"21 pages, 14 figures"},{"id":"http://arxiv.org/abs/2305.13269v3","updated":"2023-12-04T04:10:28Z","published":"2023-05-22T17:34:23Z","title":"Chain-of-Knowledge: Grounding Large Language Models via Dynamic\n Knowledge Adapting over Heterogeneous Sources","summary":" We present chain-of-knowledge (CoK), a novel framework that augments large\nlanguage models (LLMs) by dynamically incorporating grounding information from\nheterogeneous sources. It results in more factual rationales and reduced\nhallucination in generation. Specifically, CoK consists of three stages:\nreasoning preparation, dynamic knowledge adapting, and answer consolidation.\nGiven a knowledge-intensive question, CoK first prepares several preliminary\nrationales and answers while identifying the relevant knowledge domains. If\nthere is no majority consensus among the answers from samples, CoK corrects the\nrationales step by step by adapting knowledge from the identified domains.\nThese corrected rationales can plausibly serve as a better foundation for the\nfinal answer consolidation. 
Unlike prior studies that primarily use\nunstructured data, CoK also leverages structured knowledge sources such as\nWikidata and tables that provide more reliable factual information. To access\nboth unstructured and structured knowledge sources in the dynamic knowledge\nadapting stage, we propose an adaptive query generator that allows the\ngeneration of queries for various types of query languages, including SPARQL,\nSQL, and natural sentences. Moreover, to minimize error propagation between\nrationales, CoK corrects the rationales progressively using preceding corrected\nrationales to generate and correct subsequent rationales. Extensive experiments\nshow that CoK consistently improves the performance of LLMs on\nknowledge-intensive tasks across different domains.\n","authors":["Xingxuan Li","Ruochen Zhao","Yew Ken Chia","Bosheng Ding","Shafiq Joty","Soujanya Poria","Lidong Bing"],"pdf_url":"https://arxiv.org/pdf/2305.13269v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01592v1","updated":"2023-12-04T03:16:48Z","published":"2023-12-04T03:16:48Z","title":"Expand BERT Representation with Visual Information via Grounded Language\n Learning with Multimodal Partial Alignment","summary":" Language models have been supervised with both language-only objective and\nvisual grounding in existing studies of visual-grounded language learning.\nHowever, due to differences in the distribution and scale of visual-grounded\ndatasets and language corpora, the language model tends to mix up the context\nof the tokens that occurred in the grounded data with those that do not. As a\nresult, during representation learning, there is a mismatch between the visual\ninformation and the contextual meaning of the sentence. To overcome this\nlimitation, we propose GroundedBERT - a grounded language learning method that\nenhances the BERT representation with visually grounded information.\nGroundedBERT comprises two components: (i) the original BERT which captures the\ncontextual representation of words learned from the language corpora, and (ii)\na visual grounding module which captures visual information learned from\nvisual-grounded datasets. Moreover, we employ Optimal Transport (OT),\nspecifically its partial variant, to solve the fractional alignment problem\nbetween the two modalities. Our proposed method significantly outperforms the\nbaseline language models on various language tasks of the GLUE and SQuAD\ndatasets.\n","authors":["Cong-Duy Nguyen","The-Anh Vu-Le","Thong Nguyen","Tho Quan","Luu Anh Tuan"],"pdf_url":"https://arxiv.org/pdf/2312.01592v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01582v1","updated":"2023-12-04T02:40:28Z","published":"2023-12-04T02:40:28Z","title":"Explaining with Contrastive Phrasal Highlighting: A Case Study in\n Assisting Humans to Detect Translation Differences","summary":" Explainable NLP techniques primarily explain by answering \"Which tokens in\nthe input are responsible for this prediction?''. We argue that for NLP models\nthat make predictions by comparing two input texts, it is more useful to\nexplain by answering \"What differences between the two inputs explain this\nprediction?''. We introduce a technique to generate contrastive highlights that\nexplain the predictions of a semantic divergence model via\nphrase-alignment-guided erasure. 
We show that the resulting highlights match\nhuman rationales of cross-lingual semantic differences better than popular\npost-hoc saliency techniques and that they successfully help people detect\nfine-grained meaning differences in human translations and critical machine\ntranslation errors.\n","authors":["Eleftheria Briakou","Navita Goyal","Marine Carpuat"],"pdf_url":"https://arxiv.org/pdf/2312.01582v1.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2312.01575v1","updated":"2023-12-04T02:17:14Z","published":"2023-12-04T02:17:14Z","title":"A Challenging Multimodal Video Summary: Simultaneously Extracting and\n Generating Keyframe-Caption Pairs from Video","summary":" This paper proposes a practical multimodal video summarization task setting\nand a dataset to train and evaluate the task. The target task involves\nsummarizing a given video into a predefined number of keyframe-caption pairs\nand displaying them in a listable format to grasp the video content quickly.\nThis task aims to extract crucial scenes from the video in the form of images\n(keyframes) and generate corresponding captions explaining each keyframe's\nsituation. This task is useful as a practical application and presents a highly\nchallenging problem worthy of study. Specifically, achieving simultaneous\noptimization of the keyframe selection performance and caption quality\nnecessitates careful consideration of the mutual dependence on both preceding\nand subsequent keyframes and captions. To facilitate subsequent research in\nthis field, we also construct a dataset by expanding upon existing datasets and\npropose an evaluation framework. Furthermore, we develop two baseline systems\nand report their respective performance.\n","authors":["Keito Kudo","Haruki Nagasawa","Jun Suzuki","Nobuyuki Shimizu"],"pdf_url":"https://arxiv.org/pdf/2312.01575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01564v1","updated":"2023-12-04T01:42:09Z","published":"2023-12-04T01:42:09Z","title":"APoLLo: Unified Adapter and Prompt Learning for Vision Language Models","summary":" The choice of input text prompt plays a critical role in the performance of\nVision-Language Pretrained (VLP) models such as CLIP. We present APoLLo, a\nunified multi-modal approach that combines Adapter and Prompt learning for\nVision-Language models. Our method is designed to substantially improve the\ngeneralization capabilities of VLP models when they are fine-tuned in a\nfew-shot setting. We introduce trainable cross-attention-based adapter layers\nin conjunction with vision and language encoders to strengthen the alignment\nbetween the two modalities. We enforce consistency between the respective\nencoder branches (receiving augmented inputs) to prevent overfitting in\ndownstream tasks. Our method is evaluated on three representative tasks:\ngeneralization to novel classes, cross-dataset evaluation, and unseen domain\nshifts. 
In practice, APoLLo achieves a relative gain up to 6.03% over MaPLe\n(SOTA) on novel classes for 10 diverse image recognition datasets.\n","authors":["Sanjoy Chowdhury","Sayan Nag","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2312.01564v1.pdf","comment":"Accepted at EMNLP 2023 (Main track)"},{"id":"http://arxiv.org/abs/2312.01552v1","updated":"2023-12-04T00:46:11Z","published":"2023-12-04T00:46:11Z","title":"The Unlocking Spell on Base LLMs: Rethinking Alignment via In-Context\n Learning","summary":" The alignment tuning process of large language models (LLMs) typically\ninvolves instruction learning through supervised fine-tuning (SFT) and\npreference tuning via reinforcement learning from human feedback (RLHF). A\nrecent study, LIMA (Zhou et al. 2023), shows that using merely 1K examples for\nSFT can achieve significant alignment performance as well, suggesting that the\neffect of alignment tuning might be \"superficial.\" This raises questions about\nhow exactly the alignment tuning transforms a base LLM.\n We analyze the effect of alignment tuning by examining the token distribution\nshift between base LLMs and their aligned counterpart. Our findings reveal that\nbase LLMs and their alignment-tuned versions perform nearly identically in\ndecoding on the majority of token positions. Most distribution shifts occur\nwith stylistic tokens. These direct evidence strongly supports the Superficial\nAlignment Hypothesis suggested by LIMA.\n Based on these findings, we rethink the alignment of LLMs by posing the\nresearch question: how effectively can we align base LLMs without SFT or RLHF?\nTo address this, we introduce a simple, tuning-free alignment method, URIAL.\nURIAL achieves effective alignment purely through in-context learning (ICL)\nwith base LLMs, requiring as few as three constant stylistic examples and a\nsystem prompt. We conduct a fine-grained and interpretable evaluation on a\ndiverse set of examples, named JUST-EVAL-INSTRUCT. Results demonstrate that\nbase LLMs with URIAL can match or even surpass the performance of LLMs aligned\nwith SFT or SFT+RLHF. We show that the gap between tuning-free and tuning-based\nalignment methods can be significantly reduced through strategic prompting and\nICL. Our findings on the superficial nature of alignment tuning and results\nwith URIAL suggest that deeper analysis and theoretical understanding of\nalignment is crucial to future LLM research.\n","authors":["Bill Yuchen Lin","Abhilasha Ravichander","Ximing Lu","Nouha Dziri","Melanie Sclar","Khyathi Chandu","Chandra Bhagavatula","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2312.01552v1.pdf","comment":"26 pages, 8 figures. Project website:\n https://allenai.github.io/re-align/"},{"id":"http://arxiv.org/abs/2306.17840v3","updated":"2023-12-04T23:31:33Z","published":"2023-06-30T17:58:02Z","title":"Statler: State-Maintaining Language Models for Embodied Reasoning","summary":" There has been a significant research interest in employing large language\nmodels to empower intelligent robots with complex reasoning. Existing work\nfocuses on harnessing their abilities to reason about the histories of their\nactions and observations. In this paper, we explore a new dimension in which\nlarge language models may benefit robotics planning. In particular, we propose\nStatler, a framework in which large language models are prompted to maintain an\nestimate of the world state, which are often unobservable, and track its\ntransition as new actions are taken. 
Our framework then conditions each action\non the estimate of the current world state. Despite being conceptually simple,\nour Statler framework significantly outperforms strong competing methods (e.g.,\nCode-as-Policies) on several robot planning tasks. Additionally, it has the\npotential advantage of scaling up to more challenging long-horizon planning\ntasks. We release our code at https://github.com/ripl/statler\n","authors":["Takuma Yoneda","Jiading Fang","Peng Li","Huanyu Zhang","Tianchong Jiang","Shengjie Lin","Ben Picker","David Yunis","Hongyuan Mei","Matthew R. Walter"],"pdf_url":"https://arxiv.org/pdf/2306.17840v3.pdf","comment":"In submission; Project website: https://statler-lm.github.io/"},{"id":"http://arxiv.org/abs/2312.02382v1","updated":"2023-12-04T22:56:31Z","published":"2023-12-04T22:56:31Z","title":"New Evaluation Metrics Capture Quality Degradation due to LLM\n Watermarking","summary":" With the increasing use of large-language models (LLMs) like ChatGPT,\nwatermarking has emerged as a promising approach for tracing machine-generated\ncontent. However, research on LLM watermarking often relies on simple\nperplexity or diversity-based measures to assess the quality of watermarked\ntext, which can mask important limitations in watermarking. Here we introduce\ntwo new easy-to-use methods for evaluating watermarking algorithms for LLMs: 1)\nevaluation by LLM-judger with specific guidelines; and 2) binary classification\non text embeddings to distinguish between watermarked and unwatermarked text.\nWe apply these methods to characterize the effectiveness of current\nwatermarking techniques. Our experiments, conducted across various datasets,\nreveal that current watermarking methods are detectable by even simple\nclassifiers, challenging the notion of watermarking subtlety. We also found,\nthrough the LLM judger, that watermarking impacts text quality, especially in\ndegrading the coherence and depth of the response. Our findings underscore the\ntrade-off between watermark robustness and text quality and highlight the\nimportance of having more informative metrics to assess watermarking quality.\n","authors":["Karanpartap Singh","James Zou"],"pdf_url":"https://arxiv.org/pdf/2312.02382v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.15448v2","updated":"2023-12-04T22:31:26Z","published":"2023-06-21T16:42:15Z","title":"Understanding Social Reasoning in Language Models with Language Models","summary":" As Large Language Models (LLMs) become increasingly integrated into our\neveryday lives, understanding their ability to comprehend human mental states\nbecomes critical for ensuring effective interactions. However, despite the\nrecent attempts to assess the Theory-of-Mind (ToM) reasoning capabilities of\nLLMs, the degree to which these models can align with human ToM remains a\nnuanced topic of exploration. This is primarily due to two distinct challenges:\n(1) the presence of inconsistent results from previous evaluations, and (2)\nconcerns surrounding the validity of existing evaluation methodologies. To\naddress these challenges, we present a novel framework for procedurally\ngenerating evaluations with LLMs by populating causal templates. Using our\nframework, we create a new social reasoning benchmark (BigToM) for LLMs which\nconsists of 25 controls and 5,000 model-written evaluations. We find that human\nparticipants rate the quality of our benchmark higher than previous\ncrowd-sourced evaluations and comparable to expert-written evaluations. 
Using\nBigToM, we evaluate the social reasoning capabilities of a variety of LLMs and\ncompare model performances with human performance. Our results suggest that\nGPT4 has ToM capabilities that mirror human inference patterns, though less\nreliable, while other LLMs struggle.\n","authors":["Kanishk Gandhi","Jan-Philipp Fränken","Tobias Gerstenberg","Noah D. Goodman"],"pdf_url":"https://arxiv.org/pdf/2306.15448v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11863v3","updated":"2023-12-04T22:28:33Z","published":"2023-05-19T17:53:03Z","title":"Scaling laws for language encoding models in fMRI","summary":" Representations from transformer-based unidirectional language models are\nknown to be effective at predicting brain responses to natural language.\nHowever, most studies comparing language models to brains have used GPT-2 or\nsimilarly sized language models. Here we tested whether larger open-source\nmodels such as those from the OPT and LLaMA families are better at predicting\nbrain responses recorded using fMRI. Mirroring scaling results from other\ncontexts, we found that brain prediction performance scales logarithmically\nwith model size from 125M to 30B parameter models, with ~15% increased encoding\nperformance as measured by correlation with a held-out test set across 3\nsubjects. Similar logarithmic behavior was observed when scaling the size of\nthe fMRI training set. We also characterized scaling for acoustic encoding\nmodels that use HuBERT, WavLM, and Whisper, and we found comparable\nimprovements with model size. A noise ceiling analysis of these large,\nhigh-performance encoding models showed that performance is nearing the\ntheoretical maximum for brain areas such as the precuneus and higher auditory\ncortex. These results suggest that increasing scale in both models and data\nwill yield incredibly effective models of language processing in the brain,\nenabling better scientific understanding as well as applications such as\ndecoding.\n","authors":["Richard Antonello","Aditya Vaidya","Alexander G. Huth"],"pdf_url":"https://arxiv.org/pdf/2305.11863v3.pdf","comment":"Accepted to the Thirty-seventh Annual Conference on Neural\n Information Processing Systems (NeurIPS 2023)"},{"id":"http://arxiv.org/abs/2312.02337v1","updated":"2023-12-04T20:46:48Z","published":"2023-12-04T20:46:48Z","title":"Measuring Distributional Shifts in Text: The Advantage of Language\n Model-Based Embeddings","summary":" An essential part of monitoring machine learning models in production is\nmeasuring input and output data drift. In this paper, we present a system for\nmeasuring distributional shifts in natural language data and highlight and\ninvestigate the potential advantage of using large language models (LLMs) for\nthis problem. Recent advancements in LLMs and their successful adoption in\ndifferent domains indicate their effectiveness in capturing semantic\nrelationships for solving various natural language processing problems. The\npower of LLMs comes largely from the encodings (embeddings) generated in the\nhidden layers of the corresponding neural network. First we propose a\nclustering-based algorithm for measuring distributional shifts in text data by\nexploiting such embeddings. Then we study the effectiveness of our approach\nwhen applied to text embeddings generated by both LLMs and classical embedding\nalgorithms. 
Our experiments show that general-purpose LLM-based embeddings\nprovide a high sensitivity to data drift compared to other embedding methods.\nWe propose drift sensitivity as an important evaluation metric to consider when\ncomparing language models. Finally, we present insights and lessons learned\nfrom deploying our framework as part of the Fiddler ML Monitoring platform over\na period of 18 months.\n","authors":["Gyandev Gupta","Bashir Rastegarpanah","Amalendu Iyer","Joshua Rubin","Krishnaram Kenthapadi"],"pdf_url":"https://arxiv.org/pdf/2312.02337v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02334v1","updated":"2023-12-04T20:42:26Z","published":"2023-12-04T20:42:26Z","title":"An Evaluation Framework for Mapping News Headlines to Event Classes in a\n Knowledge Graph","summary":" Mapping ongoing news headlines to event-related classes in a rich knowledge\nbase can be an important component in a knowledge-based event analysis and\nforecasting solution. In this paper, we present a methodology for creating a\nbenchmark dataset of news headlines mapped to event classes in Wikidata, and\nresources for the evaluation of methods that perform the mapping. We use the\ndataset to study two classes of unsupervised methods for this task: 1)\nadaptations of classic entity linking methods, and 2) methods that treat the\nproblem as a zero-shot text classification problem. For the first approach, we\nevaluate off-the-shelf entity linking systems. For the second approach, we\nexplore a) pre-trained natural language inference (NLI) models, and b)\npre-trained large generative language models. We present the results of our\nevaluation, lessons learned, and directions for future work. The dataset and\nscripts for evaluation are made publicly available.\n","authors":["Steve Fonin Mbouadeu","Martin Lorenzo","Ken Barker","Oktie Hassanzadeh"],"pdf_url":"https://arxiv.org/pdf/2312.02334v1.pdf","comment":"Presented at CASE 2023 @ RANLP\n https://aclanthology.org/2023.case-1.6/"},{"id":"http://arxiv.org/abs/2312.02331v1","updated":"2023-12-04T20:33:24Z","published":"2023-12-04T20:33:24Z","title":"Revisiting Topic-Guided Language Models","summary":" A recent line of work in natural language processing has aimed to combine\nlanguage models and topic models. These topic-guided language models augment\nneural language models with topic models, unsupervised learning methods that\ncan discover document-level patterns of word use. This paper compares the\neffectiveness of these methods in a standardized setting. We study four\ntopic-guided language models and two baselines, evaluating the held-out\npredictive performance of each model on four corpora. Surprisingly, we find\nthat none of these methods outperform a standard LSTM language model baseline,\nand most fail to learn good topics. Further, we train a probe of the neural\nlanguage model that shows that the baseline's hidden states already encode\ntopic information. We make public all code used for this study.\n","authors":["Carolina Zheng","Keyon Vafa","David M. 
Blei"],"pdf_url":"https://arxiv.org/pdf/2312.02331v1.pdf","comment":"Published in Transactions on Machine Learning Research (TMLR)\n (12/2023)"},{"id":"http://arxiv.org/abs/2312.02317v1","updated":"2023-12-04T19:58:07Z","published":"2023-12-04T19:58:07Z","title":"GNN2R: Weakly-Supervised Rationale-Providing Question Answering over\n Knowledge Graphs","summary":" Most current methods for multi-hop question answering (QA) over knowledge\ngraphs (KGs) only provide final conclusive answers without explanations, such\nas a set of KG entities that is difficult for normal users to review and\ncomprehend. This issue severely limits the application of KG-based QA in\nreal-world scenarios. However, it is non-trivial to solve due to two\nchallenges: First, annotations of reasoning chains of multi-hop questions,\nwhich could serve as supervision for explanation generation, are usually\nlacking. Second, it is difficult to maintain high efficiency when explicit KG\ntriples need to be retrieved to generate explanations. In this paper, we\npropose a novel Graph Neural Network-based Two-Step Reasoning model (GNN2R) to\nsolve this issue. GNN2R can provide both final answers and reasoning subgraphs\nas a rationale behind final answers efficiently with only weak supervision that\nis available through question-final answer pairs. We extensively evaluated\nGNN2R with detailed analyses in experiments. The results demonstrate that, in\nterms of effectiveness, efficiency, and quality of generated explanations,\nGNN2R outperforms existing state-of-the-art methods that are applicable to this\ntask. Our code and pre-trained models are available at\nhttps://github.com/ruijie-wang-uzh/GNN2R.\n","authors":["Ruijie Wang","Luca Rossetto","Michael Cochez","Abraham Bernstein"],"pdf_url":"https://arxiv.org/pdf/2312.02317v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02314v1","updated":"2023-12-04T19:52:56Z","published":"2023-12-04T19:52:56Z","title":"Fine-tuning pre-trained extractive QA models for clinical document\n parsing","summary":" Electronic health records (EHRs) contain a vast amount of high-dimensional\nmulti-modal data that can accurately represent a patient's medical history.\nUnfortunately, most of this data is either unstructured or semi-structured,\nrendering it unsuitable for real-time and retrospective analyses. A remote\npatient monitoring (RPM) program for Heart Failure (HF) patients needs to have\naccess to clinical markers like EF (Ejection Fraction) or LVEF (Left\nVentricular Ejection Fraction) in order to ascertain eligibility and\nappropriateness for the program. This paper explains a system that can parse\nechocardiogram reports and verify EF values. This system helps identify\neligible HF patients who can be enrolled in such a program. At the heart of\nthis system is a pre-trained extractive QA transformer model that is fine-tuned\non custom-labeled data. The methods used to prepare such a model for deployment\nare illustrated by running experiments on a public clinical dataset like\nMIMIC-IV-Note. The pipeline can be used to generalize solutions to similar\nproblems in a low-resource setting. We found that the system saved over 1500\nhours for our clinicians over 12 months by automating the task at scale.\n","authors":["Ashwyn Sharma","David I. 
Feldman","Aneesh Jain"],"pdf_url":"https://arxiv.org/pdf/2312.02314v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02310v1","updated":"2023-12-04T19:48:02Z","published":"2023-12-04T19:48:02Z","title":"VaQuitA: Enhancing Alignment in LLM-Assisted Video Understanding","summary":" Recent advancements in language-model-based video understanding have been\nprogressing at a remarkable pace, spurred by the introduction of Large Language\nModels (LLMs). However, the focus of prior research has been predominantly on\ndevising a projection layer that maps video features to tokens, an approach\nthat is both rudimentary and inefficient. In our study, we introduce a\ncutting-edge framework, VaQuitA, designed to refine the synergy between video\nand textual information. At the data level, instead of sampling frames\nuniformly, we implement a sampling method guided by CLIP-score rankings, which\nenables a more aligned selection of frames with the given question. At the\nfeature level, we integrate a trainable Video Perceiver alongside a\nVisual-Query Transformer (abbreviated as VQ-Former), which bolsters the\ninterplay between the input question and the video features. We also discover\nthat incorporating a simple prompt, \"Please be critical\", into the LLM input\ncan substantially enhance its video comprehension capabilities. Our\nexperimental results indicate that VaQuitA consistently sets a new benchmark\nfor zero-shot video question-answering tasks and is adept at producing\nhigh-quality, multi-turn video dialogues with users.\n","authors":["Yizhou Wang","Ruiyi Zhang","Haoliang Wang","Uttaran Bhattacharya","Yun Fu","Gang Wu"],"pdf_url":"https://arxiv.org/pdf/2312.02310v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02296v1","updated":"2023-12-04T19:26:13Z","published":"2023-12-04T19:26:13Z","title":"LLMs Accelerate Annotation for Medical Information Extraction","summary":" The unstructured nature of clinical notes within electronic health records\noften conceals vital patient-related information, making it challenging to\naccess or interpret. To uncover this hidden information, specialized Natural\nLanguage Processing (NLP) models are required. However, training these models\nnecessitates large amounts of labeled data, a process that is both\ntime-consuming and costly when relying solely on human experts for annotation.\nIn this paper, we propose an approach that combines Large Language Models\n(LLMs) with human expertise to create an efficient method for generating ground\ntruth labels for medical text annotation. By utilizing LLMs in conjunction with\nhuman annotators, we significantly reduce the human annotation burden, enabling\nthe rapid creation of labeled datasets. 
We rigorously evaluate our method on a\nmedical information extraction task, demonstrating that our approach not only\nsubstantially cuts down on human intervention but also maintains high accuracy.\nThe results highlight the potential of using LLMs to improve the utilization of\nunstructured clinical data, allowing for the swift deployment of tailored NLP\nsolutions in healthcare.\n","authors":["Akshay Goel","Almog Gueta","Omry Gilon","Chang Liu","Sofia Erell","Lan Huong Nguyen","Xiaohong Hao","Bolous Jaber","Shashir Reddy","Rupesh Kartha","Jean Steiner","Itay Laish","Amir Feder"],"pdf_url":"https://arxiv.org/pdf/2312.02296v1.pdf","comment":"Published in proceedings of the Machine Learning for Health (ML4H)\n Symposium 2023"},{"id":"http://arxiv.org/abs/2310.16570v2","updated":"2023-12-04T19:23:33Z","published":"2023-10-25T11:57:13Z","title":"Give Me the Facts! A Survey on Factual Knowledge Probing in Pre-trained\n Language Models","summary":" Pre-trained Language Models (PLMs) are trained on vast unlabeled data, rich\nin world knowledge. This fact has sparked the interest of the community in\nquantifying the amount of factual knowledge present in PLMs, as this explains\ntheir performance on downstream tasks, and potentially justifies their use as\nknowledge bases. In this work, we survey methods and datasets that are used to\nprobe PLMs for factual knowledge. Our contributions are: (1) We propose a\ncategorization scheme for factual probing methods that is based on how their\ninputs, outputs and the probed PLMs are adapted; (2) We provide an overview of\nthe datasets used for factual probing; (3) We synthesize insights about\nknowledge retention and prompt optimization in PLMs, analyze obstacles to\nadopting PLMs as knowledge bases and outline directions for future work.\n","authors":["Paul Youssef","Osman Alperen Koraş","Meijie Li","Jörg Schlötterer","Christin Seifert"],"pdf_url":"https://arxiv.org/pdf/2310.16570v2.pdf","comment":"Accepted at EMNLP Findings 2023"},{"id":"http://arxiv.org/abs/2312.02251v1","updated":"2023-12-04T18:04:27Z","published":"2023-12-04T18:04:27Z","title":"Fine-Tuning Language Models for Context-Specific SQL Query Generation","summary":" The ability to generate SQL queries from natural language has significant\nimplications for making data accessible to non-specialists. This paper presents\na novel approach to fine-tuning open-source large language models (LLMs) for\nthe task of transforming natural language into SQL queries within the retail\ndomain. We introduce models specialized in generating SQL queries, trained on\nsynthetic datasets tailored to the Snowflake SQL and GoogleSQL dialects. Our\nmethodology involves generating a context-specific dataset using GPT-4, then\nfine-tuning three open-source LLMs(Starcoder Plus, Code-Llama, and Mistral)\nemploying the LoRa technique to optimize for resource constraints. The\nfine-tuned models demonstrate superior performance in zero-shot settings\ncompared to the baseline GPT-4, with Code-Llama achieving the highest accuracy\nrates, at 81.58% for Snowflake SQL and 82.66% for GoogleSQL. 
These results\nunderscore the effectiveness of fine-tuning LLMs on domain-specific tasks and\nsuggest a promising direction for enhancing the accessibility of relational\ndatabases through natural language interfaces.\n","authors":["Amine Rebei"],"pdf_url":"https://arxiv.org/pdf/2312.02251v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02249v1","updated":"2023-12-04T17:27:24Z","published":"2023-12-04T17:27:24Z","title":"Recursive Visual Programming","summary":" Visual Programming (VP) has emerged as a powerful framework for Visual\nQuestion Answering (VQA). By generating and executing bespoke code for each\nquestion, these methods demonstrate impressive compositional and reasoning\ncapabilities, especially in few-shot and zero-shot scenarios. However, existing\nVP methods generate all code in a single function, resulting in code that is\nsuboptimal in terms of both accuracy and interpretability. Inspired by human\ncoding practices, we propose Recursive Visual Programming (RVP), which\nsimplifies generated routines, provides more efficient problem solving, and can\nmanage more complex data structures. RVP is inspired by human coding practices\nand approaches VQA tasks with an iterative recursive code generation approach,\nallowing decomposition of complicated problems into smaller parts. Notably, RVP\nis capable of dynamic type assignment, i.e., as the system recursively\ngenerates a new piece of code, it autonomously determines the appropriate\nreturn type and crafts the requisite code to generate that output. We show\nRVP's efficacy through extensive experiments on benchmarks including VSR, COVR,\nGQA, and NextQA, underscoring the value of adopting human-like recursive and\nmodular programming techniques for solving VQA tasks through coding.\n","authors":["Jiaxin Ge","Sanjay Subramanian","Baifeng Shi","Roei Herzig","Trevor Darrell"],"pdf_url":"https://arxiv.org/pdf/2312.02249v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02227v1","updated":"2023-12-04T02:58:19Z","published":"2023-12-04T02:58:19Z","title":"Improving Multimodal Sentiment Analysis: Supervised Angular Margin-based\n Contrastive Learning for Enhanced Fusion Representation","summary":" The effectiveness of a model is heavily reliant on the quality of the fusion\nrepresentation of multiple modalities in multimodal sentiment analysis.\nMoreover, each modality is extracted from raw input and integrated with the\nrest to construct a multimodal representation. Although previous methods have\nproposed multimodal representations and achieved promising results, most of\nthem focus on forming positive and negative pairs, neglecting the variation in\nsentiment scores within the same class. Additionally, they fail to capture the\nsignificance of unimodal representations in the fusion vector. To address these\nlimitations, we introduce a framework called Supervised Angular-based\nContrastive Learning for Multimodal Sentiment Analysis. This framework aims to\nenhance discrimination and generalizability of the multimodal representation\nand overcome biases in the fusion vector's modality. 
Our experimental results,\nalong with visualizations on two widely used datasets, demonstrate the\neffectiveness of our approach.\n","authors":["Cong-Duy Nguyen","Thong Nguyen","Duc Anh Vu","Luu Anh Tuan"],"pdf_url":"https://arxiv.org/pdf/2312.02227v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2312.02158v1","updated":"2023-12-04T18:59:59Z","published":"2023-12-04T18:59:59Z","title":"PaSCo: Urban 3D Panoptic Scene Completion with Uncertainty Awareness","summary":" We propose the task of Panoptic Scene Completion (PSC) which extends the\nrecently popular Semantic Scene Completion (SSC) task with instance-level\ninformation to produce a richer understanding of the 3D scene. Our PSC proposal\nutilizes a hybrid mask-based technique on the non-empty voxels from sparse\nmulti-scale completions. Whereas the SSC literature overlooks uncertainty which\nis critical for robotics applications, we instead propose an efficient\nensembling to estimate both voxel-wise and instance-wise uncertainties along\nPSC. This is achieved by building on a multi-input multi-output (MIMO)\nstrategy, while improving performance and yielding better uncertainty for\nlittle additional compute. Additionally, we introduce a technique to aggregate\npermutation-invariant mask predictions. Our experiments demonstrate that our\nmethod surpasses all baselines in both Panoptic Scene Completion and\nuncertainty estimation on three large-scale autonomous driving datasets. Our\ncode and data are available at https://astra-vision.github.io/PaSCo .\n","authors":["Anh-Quan Cao","Angela Dai","Raoul de Charette"],"pdf_url":"https://arxiv.org/pdf/2312.02158v1.pdf","comment":"Project page: https://astra-vision.github.io/PaSCo"},{"id":"http://arxiv.org/abs/2312.02157v1","updated":"2023-12-04T18:59:58Z","published":"2023-12-04T18:59:58Z","title":"Mesh-Guided Neural Implicit Field Editing","summary":" Neural implicit fields have emerged as a powerful 3D representation for\nreconstructing and rendering photo-realistic views, yet they possess limited\neditability. Conversely, explicit 3D representations, such as polygonal meshes,\noffer ease of editing but may not be as suitable for rendering high-quality\nnovel views. To harness the strengths of both representations, we propose a new\napproach that employs a mesh as a guiding mechanism in editing the neural\nradiance field. We first introduce a differentiable method using marching\ntetrahedra for polygonal mesh extraction from the neural implicit field and\nthen design a differentiable color extractor to assign colors obtained from the\nvolume renderings to this extracted mesh. This differentiable colored mesh\nallows gradient back-propagation from the explicit mesh to the implicit fields,\nempowering users to easily manipulate the geometry and color of neural implicit\nfields. To enhance user control from coarse-grained to fine-grained levels, we\nintroduce an octree-based structure into its optimization. This structure\nprioritizes the edited regions and the surface part, making our method achieve\nfine-grained edits to the neural implicit field and accommodate various user\nmodifications, including object additions, component removals, specific area\ndeformations, and adjustments to local and global colors. Through extensive\nexperiments involving diverse scenes and editing operations, we have\ndemonstrated the capabilities and effectiveness of our method. 
Our project page\nis: \\url{https://cassiepython.github.io/MNeuEdit/}\n","authors":["Can Wang","Mingming He","Menglei Chai","Dongdong Chen","Jing Liao"],"pdf_url":"https://arxiv.org/pdf/2312.02157v1.pdf","comment":"Project page: https://cassiepython.github.io/MNeuEdit/"},{"id":"http://arxiv.org/abs/2312.02155v1","updated":"2023-12-04T18:59:55Z","published":"2023-12-04T18:59:55Z","title":"GPS-Gaussian: Generalizable Pixel-wise 3D Gaussian Splatting for\n Real-time Human Novel View Synthesis","summary":" We present a new approach, termed GPS-Gaussian, for synthesizing novel views\nof a character in a real-time manner. The proposed method enables 2K-resolution\nrendering under a sparse-view camera setting. Unlike the original Gaussian\nSplatting or neural implicit rendering methods that necessitate per-subject\noptimizations, we introduce Gaussian parameter maps defined on the source views\nand regress directly Gaussian Splatting properties for instant novel view\nsynthesis without any fine-tuning or optimization. To this end, we train our\nGaussian parameter regression module on a large amount of human scan data,\njointly with a depth estimation module to lift 2D parameter maps to 3D space.\nThe proposed framework is fully differentiable and experiments on several\ndatasets demonstrate that our method outperforms state-of-the-art methods while\nachieving an exceeding rendering speed.\n","authors":["Shunyuan Zheng","Boyao Zhou","Ruizhi Shao","Boning Liu","Shengping Zhang","Liqiang Nie","Yebin Liu"],"pdf_url":"https://arxiv.org/pdf/2312.02155v1.pdf","comment":"The link to our projectpage is https://shunyuanzheng.github.io"},{"id":"http://arxiv.org/abs/2312.02156v1","updated":"2023-12-04T18:59:55Z","published":"2023-12-04T18:59:55Z","title":"Latent Feature-Guided Diffusion Models for Shadow Removal","summary":" Recovering textures under shadows has remained a challenging problem due to\nthe difficulty of inferring shadow-free scenes from shadow images. In this\npaper, we propose the use of diffusion models as they offer a promising\napproach to gradually refine the details of shadow regions during the diffusion\nprocess. Our method improves this process by conditioning on a learned latent\nfeature space that inherits the characteristics of shadow-free images, thus\navoiding the limitation of conventional methods that condition on degraded\nimages only. Additionally, we propose to alleviate potential local optima\nduring training by fusing noise features with the diffusion network. We\ndemonstrate the effectiveness of our approach which outperforms the previous\nbest method by 13% in terms of RMSE on the AISTD dataset. Further, we explore\ninstance-level shadow removal, where our model outperforms the previous best\nmethod by 82% in terms of RMSE on the DESOBA dataset.\n","authors":["Kangfu Mei","Luis Figueroa","Zhe Lin","Zhihong Ding","Scott Cohen","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2312.02156v1.pdf","comment":"project page see https://kfmei.page/shadow-diffusion/index.html"},{"id":"http://arxiv.org/abs/2312.02153v1","updated":"2023-12-04T18:59:50Z","published":"2023-12-04T18:59:50Z","title":"Aligning and Prompting Everything All at Once for Universal Visual\n Perception","summary":" Vision foundation models have been explored recently to build general-purpose\nvision systems. 
However, predominant paradigms, driven by casting\ninstance-level tasks as an object-word alignment, bring heavy cross-modality\ninteraction, which is not effective in prompting object detection and visual\ngrounding. Another line of work that focuses on pixel-level tasks often\nencounters a large annotation gap of things and stuff, and suffers from mutual\ninterference between foreground-object and background-class segmentation. In\nstark contrast to the prevailing methods, we present APE, a universal visual\nperception model for aligning and prompting everything all at once in an image\nto perform diverse tasks, i.e., detection, segmentation, and grounding, as an\ninstance-level sentence-object matching paradigm. Specifically, APE advances\nthe convergence of detection and grounding by reformulating language-guided\ngrounding as open-vocabulary detection, which efficiently scales up model\nprompting to thousands of category vocabularies and region descriptions while\nmaintaining the effectiveness of cross-modality fusion. To bridge the\ngranularity gap of different pixel-level tasks, APE equalizes semantic and\npanoptic segmentation to proxy instance learning by considering any isolated\nregions as individual instances. APE aligns vision and language representation\non broad data with natural and challenging characteristics all at once without\ntask-specific fine-tuning. The extensive experiments on over 160 datasets\ndemonstrate that, with only one-suit of weights, APE outperforms (or is on par\nwith) the state-of-the-art models, proving that an effective yet universal\nperception for anything aligning and prompting is indeed feasible. Codes and\ntrained models are released at https://github.com/shenyunhang/APE.\n","authors":["Yunhang Shen","Chaoyou Fu","Peixian Chen","Mengdan Zhang","Ke Li","Xing Sun","Yunsheng Wu","Shaohui Lin","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2312.02153v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02152v1","updated":"2023-12-04T18:59:44Z","published":"2023-12-04T18:59:44Z","title":"Steerers: A framework for rotation equivariant keypoint descriptors","summary":" Image keypoint descriptions that are discriminative and matchable over large\nchanges in viewpoint are vital for 3D reconstruction. However, descriptions\noutput by learned descriptors are typically not robust to camera rotation.\nWhile they can be made more robust by, e.g., data augmentation, this degrades\nperformance on upright images. Another approach is test-time augmentation,\nwhich incurs a significant increase in runtime. We instead learn a linear\ntransform in description space that encodes rotations of the input image. We\ncall this linear transform a steerer since it allows us to transform the\ndescriptions as if the image was rotated. From representation theory we know\nall possible steerers for the rotation group. Steerers can be optimized (A)\ngiven a fixed descriptor, (B) jointly with a descriptor or (C) we can optimize\na descriptor given a fixed steerer. We perform experiments in all of these\nthree settings and obtain state-of-the-art results on the rotation invariant\nimage matching benchmarks AIMS and Roto-360. 
We publish code and model weights\nat github.com/georg-bn/rotation-steerers.\n","authors":["Georg Bökman","Johan Edstedt","Michael Felsberg","Fredrik Kahl"],"pdf_url":"https://arxiv.org/pdf/2312.02152v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02151v1","updated":"2023-12-04T18:59:36Z","published":"2023-12-04T18:59:36Z","title":"Guarding Barlow Twins Against Overfitting with Mixed Samples","summary":" Self-supervised Learning (SSL) aims to learn transferable feature\nrepresentations for downstream applications without relying on labeled data.\nThe Barlow Twins algorithm, renowned for its widespread adoption and\nstraightforward implementation compared to its counterparts like contrastive\nlearning methods, minimizes feature redundancy while maximizing invariance to\ncommon corruptions. Optimizing for the above objective forces the network to\nlearn useful representations, while avoiding noisy or constant features,\nresulting in improved downstream task performance with limited adaptation.\nDespite Barlow Twins' proven effectiveness in pre-training, the underlying SSL\nobjective can inadvertently cause feature overfitting due to the lack of strong\ninteraction between the samples unlike the contrastive learning approaches.\nFrom our experiments, we observe that optimizing for the Barlow Twins objective\ndoesn't necessarily guarantee sustained improvements in representation quality\nbeyond a certain pre-training phase, and can potentially degrade downstream\nperformance on some datasets. To address this challenge, we introduce Mixed\nBarlow Twins, which aims to improve sample interaction during Barlow Twins\ntraining via linearly interpolated samples. This results in an additional\nregularization term to the original Barlow Twins objective, assuming linear\ninterpolation in the input space translates to linearly interpolated features\nin the feature space. Pre-training with this regularization effectively\nmitigates feature overfitting and further enhances the downstream performance\non CIFAR-10, CIFAR-100, TinyImageNet, STL-10, and ImageNet datasets. The code\nand checkpoints are available at: https://github.com/wgcban/mix-bt.git\n","authors":["Wele Gedara Chaminda Bandara","Celso M. De Melo","Vishal M. Patel"],"pdf_url":"https://arxiv.org/pdf/2312.02151v1.pdf","comment":"Code and checkpoints are available at:\n https://github.com/wgcban/mix-bt.git"},{"id":"http://arxiv.org/abs/2312.02150v1","updated":"2023-12-04T18:59:32Z","published":"2023-12-04T18:59:32Z","title":"Readout Guidance: Learning Control from Diffusion Features","summary":" We present Readout Guidance, a method for controlling text-to-image diffusion\nmodels with learned signals. Readout Guidance uses readout heads, lightweight\nnetworks trained to extract signals from the features of a pre-trained, frozen\ndiffusion model at every timestep. These readouts can encode single-image\nproperties, such as pose, depth, and edges; or higher-order properties that\nrelate multiple images, such as correspondence and appearance similarity.\nFurthermore, by comparing the readout estimates to a user-defined target, and\nback-propagating the gradient through the readout head, these estimates can be\nused to guide the sampling process. 
Compared to prior methods for conditional\ngeneration, Readout Guidance requires significantly fewer added parameters and\ntraining samples, and offers a convenient and simple recipe for reproducing\ndifferent forms of conditional control under a single framework, with a single\narchitecture and sampling procedure. We showcase these benefits in the\napplications of drag-based manipulation, identity-consistent generation, and\nspatially aligned control. Project page: https://readout-guidance.github.io.\n","authors":["Grace Luo","Trevor Darrell","Oliver Wang","Dan B Goldman","Aleksander Holynski"],"pdf_url":"https://arxiv.org/pdf/2312.02150v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02149v1","updated":"2023-12-04T18:59:25Z","published":"2023-12-04T18:59:25Z","title":"Generative Powers of Ten","summary":" We present a method that uses a text-to-image model to generate consistent\ncontent across multiple image scales, enabling extreme semantic zooms into a\nscene, e.g., ranging from a wide-angle landscape view of a forest to a macro\nshot of an insect sitting on one of the tree branches. We achieve this through\na joint multi-scale diffusion sampling approach that encourages consistency\nacross different scales while preserving the integrity of each individual\nsampling process. Since each generated scale is guided by a different text\nprompt, our method enables deeper levels of zoom than traditional\nsuper-resolution methods that may struggle to create new contextual structure\nat vastly different scales. We compare our method qualitatively with\nalternative techniques in image super-resolution and outpainting, and show that\nour method is most effective at generating consistent multi-scale content.\n","authors":["Xiaojuan Wang","Janne Kontkanen","Brian Curless","Steve Seitz","Ira Kemelmacher","Ben Mildenhall","Pratul Srinivasan","Dor Verbin","Aleksander Holynski"],"pdf_url":"https://arxiv.org/pdf/2312.02149v1.pdf","comment":"Project page: https://powers-of-10.github.io/"},{"id":"http://arxiv.org/abs/2312.02147v1","updated":"2023-12-04T18:59:20Z","published":"2023-12-04T18:59:20Z","title":"Rejuvenating image-GPT as Strong Visual Representation Learners","summary":" This paper enhances image-GPT (iGPT), one of the pioneering works that\nintroduce autoregressive pretraining to predict next pixels for visual\nrepresentation learning. Two simple yet essential changes are made. First, we\nshift the prediction target from raw pixels to semantic tokens, enabling a\nhigher-level understanding of visual content. Second, we supplement the\nautoregressive modeling by instructing the model to predict not only the next\ntokens but also the visible tokens. This pipeline is particularly effective\nwhen semantic tokens are encoded by discriminatively trained models, such as\nCLIP. We introduce this novel approach as D-iGPT. Extensive experiments\nshowcase that D-iGPT excels as a strong learner of visual representations: A\nnotable achievement of D-iGPT is its compelling performance on the ImageNet-1K\ndataset -- by training on publicly available datasets, D-iGPT achieves 89.5\\%\ntop-1 accuracy with a vanilla ViT-Large model. This model also shows strong\ngeneralization on the downstream task and robustness on out-of-distribution\nsamples. 
Code is available at\n\\href{https://github.com/OliverRensu/D-iGPT}{https://github.com/OliverRensu/D-iGPT}.\n","authors":["Sucheng Ren","Zeyu Wang","Hongru Zhu","Junfei Xiao","Alan Yuille","Cihang Xie"],"pdf_url":"https://arxiv.org/pdf/2312.02147v1.pdf","comment":"Larger models are coming"},{"id":"http://arxiv.org/abs/2312.02145v1","updated":"2023-12-04T18:59:13Z","published":"2023-12-04T18:59:13Z","title":"Repurposing Diffusion-Based Image Generators for Monocular Depth\n Estimation","summary":" Monocular depth estimation is a fundamental computer vision task. Recovering\n3D depth from a single image is geometrically ill-posed and requires scene\nunderstanding, so it is not surprising that the rise of deep learning has led\nto a breakthrough. The impressive progress of monocular depth estimators has\nmirrored the growth in model capacity, from relatively modest CNNs to large\nTransformer architectures. Still, monocular depth estimators tend to struggle\nwhen presented with images with unfamiliar content and layout, since their\nknowledge of the visual world is restricted by the data seen during training,\nand challenged by zero-shot generalization to new domains. This motivates us to\nexplore whether the extensive priors captured in recent generative diffusion\nmodels can enable better, more generalizable depth estimation. We introduce\nMarigold, a method for affine-invariant monocular depth estimation that is\nderived from Stable Diffusion and retains its rich prior knowledge. The\nestimator can be fine-tuned in a couple of days on a single GPU using only\nsynthetic training data. It delivers state-of-the-art performance across a wide\nrange of datasets, including over 20% performance gains in specific cases.\nProject page: https://marigoldmonodepth.github.io.\n","authors":["Bingxin Ke","Anton Obukhov","Shengyu Huang","Nando Metzger","Rodrigo Caye Daudt","Konrad Schindler"],"pdf_url":"https://arxiv.org/pdf/2312.02145v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02144v1","updated":"2023-12-04T18:59:02Z","published":"2023-12-04T18:59:02Z","title":"Optimizing Camera Configurations for Multi-View Pedestrian Detection","summary":" Jointly considering multiple camera views (multi-view) is very effective for\npedestrian detection under occlusion. For such multi-view systems, it is\ncritical to have well-designed camera configurations, including camera\nlocations, directions, and fields-of-view (FoVs). Usually, these configurations\nare crafted based on human experience or heuristics. In this work, we present a\nnovel solution that features a transformer-based camera configuration\ngenerator. Using reinforcement learning, this generator autonomously explores\nvast combinations within the action space and searches for configurations that\ngive the highest detection accuracy according to the training dataset. The\ngenerator learns advanced techniques like maximizing coverage, minimizing\nocclusion, and promoting collaboration. 
Across multiple simulation scenarios,\nthe configurations generated by our transformer-based model consistently\noutperform random search, heuristic-based methods, and configurations designed\nby human experts, shedding light on future camera layout optimization.\n","authors":["Yunzhong Hou","Xingjian Leng","Tom Gedeon","Liang Zheng"],"pdf_url":"https://arxiv.org/pdf/2312.02144v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02142v1","updated":"2023-12-04T18:58:40Z","published":"2023-12-04T18:58:40Z","title":"Object Recognition as Next Token Prediction","summary":" We present an approach to pose object recognition as next token prediction.\nThe idea is to apply a language decoder that auto-regressively predicts the\ntext tokens from image embeddings to form labels. To ground this prediction\nprocess in auto-regression, we customize a non-causal attention mask for the\ndecoder, incorporating two key features: modeling tokens from different labels\nto be independent, and treating image tokens as a prefix. This masking\nmechanism inspires an efficient method - one-shot sampling - to simultaneously\nsample tokens of multiple labels in parallel and rank generated labels by their\nprobabilities during inference. To further enhance the efficiency, we propose a\nsimple strategy to construct a compact decoder by simply discarding the\nintermediate blocks of a pretrained language model. This approach yields a\ndecoder that matches the full model's performance while being notably more\nefficient. The code is available at https://github.com/kaiyuyue/nxtp\n","authors":["Kaiyu Yue","Bor-Chun Chen","Jonas Geiping","Hengduo Li","Tom Goldstein","Ser-Nam Lim"],"pdf_url":"https://arxiv.org/pdf/2312.02142v1.pdf","comment":"auto-regression for recognition"},{"id":"http://arxiv.org/abs/2312.02141v1","updated":"2023-12-04T18:58:20Z","published":"2023-12-04T18:58:20Z","title":"iMatching: Imperative Correspondence Learning","summary":" Learning feature correspondence is a foundational task in computer vision,\nholding immense importance for downstream applications such as visual odometry\nand 3D reconstruction. Despite recent progress in data-driven models, feature\ncorrespondence learning is still limited by the lack of accurate per-pixel\ncorrespondence labels. To overcome this difficulty, we introduce a new\nself-supervised scheme, imperative learning (IL), for training feature\ncorrespondence. It enables correspondence learning on arbitrary uninterrupted\nvideos without any camera pose or depth labels, heralding a new era for\nself-supervised correspondence learning. Specifically, we formulated the\nproblem of correspondence learning as a bilevel optimization, which takes the\nreprojection error from bundle adjustment as a supervisory signal for the\nmodel. To avoid large memory and computation overhead, we leverage the\nstationary point to effectively back-propagate the implicit gradients through\nbundle adjustment. 
Through extensive experiments, we demonstrate superior\nperformance on tasks including feature matching and pose estimation, in which\nwe obtained an average of 30% accuracy gain over the state-of-the-art matching\nmodels.\n","authors":["Zitong Zhan","Dasong Gao","Yun-Jou Lin","Youjie Xia","Chen Wang"],"pdf_url":"https://arxiv.org/pdf/2312.02141v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02139v1","updated":"2023-12-04T18:57:01Z","published":"2023-12-04T18:57:01Z","title":"DiffiT: Diffusion Vision Transformers for Image Generation","summary":" Diffusion models with their powerful expressivity and high sample quality\nhave enabled many new applications and use-cases in various domains. For sample\ngeneration, these models rely on a denoising neural network that generates\nimages by iterative denoising. Yet, the role of denoising network architecture\nis not well-studied with most efforts relying on convolutional residual U-Nets.\nIn this paper, we study the effectiveness of vision transformers in\ndiffusion-based generative learning. Specifically, we propose a new model,\ndenoted as Diffusion Vision Transformers (DiffiT), which consists of a hybrid\nhierarchical architecture with a U-shaped encoder and decoder. We introduce a\nnovel time-dependent self-attention module that allows attention layers to\nadapt their behavior at different stages of the denoising process in an\nefficient manner. We also introduce latent DiffiT which consists of transformer\nmodel with the proposed self-attention layers, for high-resolution image\ngeneration. Our results show that DiffiT is surprisingly effective in\ngenerating high-fidelity images, and it achieves state-of-the-art (SOTA)\nbenchmarks on a variety of class-conditional and unconditional synthesis tasks.\nIn the latent space, DiffiT achieves a new SOTA FID score of 1.73 on\nImageNet-256 dataset. Repository: https://github.com/NVlabs/DiffiT\n","authors":["Ali Hatamizadeh","Jiaming Song","Guilin Liu","Jan Kautz","Arash Vahdat"],"pdf_url":"https://arxiv.org/pdf/2312.02139v1.pdf","comment":"Tech report"},{"id":"http://arxiv.org/abs/2312.02137v1","updated":"2023-12-04T18:56:22Z","published":"2023-12-04T18:56:22Z","title":"MANUS: Markerless Hand-Object Grasp Capture using Articulated 3D\n Gaussians","summary":" Understanding how we grasp objects with our hands has important applications\nin areas like robotics and mixed reality. However, this challenging problem\nrequires accurate modeling of the contact between hands and objects. To capture\ngrasps, existing methods use skeletons, meshes, or parametric models that can\ncause misalignments resulting in inaccurate contacts. We present MANUS, a\nmethod for Markerless Hand-Object Grasp Capture using Articulated 3D Gaussians.\nWe build a novel articulated 3D Gaussians representation that extends 3D\nGaussian splatting for high-fidelity representation of articulating hands.\nSince our representation uses Gaussian primitives, it enables us to efficiently\nand accurately estimate contacts between the hand and the object. For the most\naccurate results, our method requires tens of camera views that current\ndatasets do not provide. We therefore build MANUS-Grasps, a new dataset that\ncontains hand-object grasps viewed from 53 cameras across 30+ scenes, 3\nsubjects, and comprising over 7M frames. 
In addition to extensive qualitative\nresults, we also show that our method outperforms others on a quantitative\ncontact evaluation method that uses paint transfer from the object to the hand.\n","authors":["Chandradeep Pokhariya","Ishaan N Shah","Angela Xing","Zekun Li","Kefan Chen","Avinash Sharma","Srinath Sridhar"],"pdf_url":"https://arxiv.org/pdf/2312.02137v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02136v1","updated":"2023-12-04T18:56:10Z","published":"2023-12-04T18:56:10Z","title":"BerfScene: Bev-conditioned Equivariant Radiance Fields for Infinite 3D\n Scene Generation","summary":" Generating large-scale 3D scenes cannot simply apply existing 3D object\nsynthesis technique since 3D scenes usually hold complex spatial configurations\nand consist of a number of objects at varying scales. We thus propose a\npractical and efficient 3D representation that incorporates an equivariant\nradiance field with the guidance of a bird's-eye view (BEV) map. Concretely,\nobjects of synthesized 3D scenes could be easily manipulated through steering\nthe corresponding BEV maps. Moreover, by adequately incorporating positional\nencoding and low-pass filters into the generator, the representation becomes\nequivariant to the given BEV map. Such equivariance allows us to produce\nlarge-scale, even infinite-scale, 3D scenes via synthesizing local scenes and\nthen stitching them with smooth consistency. Extensive experiments on 3D scene\ndatasets demonstrate the effectiveness of our approach. Our project website is\nat https://zqh0253.github.io/BerfScene/.\n","authors":["Qihang Zhang","Yinghao Xu","Yujun Shen","Bo Dai","Bolei Zhou","Ceyuan Yang"],"pdf_url":"https://arxiv.org/pdf/2312.02136v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02135v1","updated":"2023-12-04T18:55:48Z","published":"2023-12-04T18:55:48Z","title":"Fast View Synthesis of Casual Videos","summary":" Novel view synthesis from an in-the-wild video is difficult due to challenges\nlike scene dynamics and lack of parallax. While existing methods have shown\npromising results with implicit neural radiance fields, they are slow to train\nand render. This paper revisits explicit video representations to synthesize\nhigh-quality novel views from a monocular video efficiently. We treat static\nand dynamic video content separately. Specifically, we build a global static\nscene model using an extended plane-based scene representation to synthesize\ntemporally coherent novel video. Our plane-based scene representation is\naugmented with spherical harmonics and displacement maps to capture\nview-dependent effects and model non-planar complex surface geometry. We opt to\nrepresent the dynamic content as per-frame point clouds for efficiency. While\nsuch representations are inconsistency-prone, minor temporal inconsistencies\nare perceptually masked due to motion. We develop a method to quickly estimate\nsuch a hybrid video representation and render novel views in real time. 
Our\nexperiments show that our method can render high-quality novel views from an\nin-the-wild video with comparable quality to state-of-the-art methods while\nbeing 100x faster in training and enabling real-time rendering.\n","authors":["Yao-Chih Lee","Zhoutong Zhang","Kevin Blackburn-Matzen","Simon Niklaus","Jianming Zhang","Jia-Bin Huang","Feng Liu"],"pdf_url":"https://arxiv.org/pdf/2312.02135v1.pdf","comment":"Project page: https://casual-fvs.github.io/"},{"id":"http://arxiv.org/abs/2312.02134v1","updated":"2023-12-04T18:55:45Z","published":"2023-12-04T18:55:45Z","title":"GaussianAvatar: Towards Realistic Human Avatar Modeling from a Single\n Video via Animatable 3D Gaussians","summary":" We present GaussianAvatar, an efficient approach to creating realistic human\navatars with dynamic 3D appearances from a single video. We start by\nintroducing animatable 3D Gaussians to explicitly represent humans in various\nposes and clothing styles. Such an explicit and animatable representation can\nfuse 3D appearances more efficiently and consistently from 2D observations. Our\nrepresentation is further augmented with dynamic properties to support\npose-dependent appearance modeling, where a dynamic appearance network along\nwith an optimizable feature tensor is designed to learn the\nmotion-to-appearance mapping. Moreover, by leveraging the differentiable motion\ncondition, our method enables a joint optimization of motions and appearances\nduring avatar modeling, which helps to tackle the long-standing issue of\ninaccurate motion estimation in monocular settings. The efficacy of\nGaussianAvatar is validated on both the public dataset and our collected\ndataset, demonstrating its superior performances in terms of appearance quality\nand rendering efficiency.\n","authors":["Liangxiao Hu","Hongwen Zhang","Yuxiang Zhang","Boyao Zhou","Boning Liu","Shengping Zhang","Liqiang Nie"],"pdf_url":"https://arxiv.org/pdf/2312.02134v1.pdf","comment":"https://huliangxiao.github.io/GaussianAvatar"},{"id":"http://arxiv.org/abs/2312.02133v1","updated":"2023-12-04T18:55:35Z","published":"2023-12-04T18:55:35Z","title":"Style Aligned Image Generation via Shared Attention","summary":" Large-scale Text-to-Image (T2I) models have rapidly gained prominence across\ncreative fields, generating visually compelling outputs from textual prompts.\nHowever, controlling these models to ensure consistent style remains\nchallenging, with existing methods necessitating fine-tuning and manual\nintervention to disentangle content and style. In this paper, we introduce\nStyleAligned, a novel technique designed to establish style alignment among a\nseries of generated images. By employing minimal `attention sharing' during the\ndiffusion process, our method maintains style consistency across images within\nT2I models. This approach allows for the creation of style-consistent images\nusing a reference style through a straightforward inversion operation. Our\nmethod's evaluation across diverse styles and text prompts demonstrates\nhigh-quality synthesis and fidelity, underscoring its efficacy in achieving\nconsistent style across various inputs.\n","authors":["Amir Hertz","Andrey Voynov","Shlomi Fruchter","Daniel Cohen-Or"],"pdf_url":"https://arxiv.org/pdf/2312.02133v1.pdf","comment":"Project page at style-aligned-gen.github.io"},{"id":"http://arxiv.org/abs/2312.02128v1","updated":"2023-12-04T18:53:42Z","published":"2023-12-04T18:53:42Z","title":"Can we truly transfer an actor's genuine happiness to avatars? 
An\n investigation into virtual, real, posed and spontaneous faces","summary":" A look is worth a thousand words is a popular phrase. And why is a simple\nlook enough to portray our feelings about something or someone? Behind this\nquestion are the theoretical foundations of the field of psychology regarding\nsocial cognition and the studies of psychologist Paul Ekman. Facial\nexpressions, as a form of non-verbal communication, are the primary way to\ntransmit emotions between human beings. The set of movements and expressions of\nfacial muscles that convey some emotional state of the individual to their\nobservers are targets of studies in many areas. Our research aims to evaluate\nEkman's action units in datasets of real human faces, posed and spontaneous,\nand virtual human faces resulting from transferring real faces into Computer\nGraphics faces. In addition, we also conducted a case study with specific movie\ncharacters, such as SheHulk and Genius. We intend to find differences and\nsimilarities in facial expressions between real and CG datasets, posed and\nspontaneous faces, and also to consider the actors' genders in the videos. This\ninvestigation can help several areas of knowledge, whether using real or\nvirtual human beings, in education, health, entertainment, games, security, and\neven legal matters. Our results indicate that AU intensities are greater for\nposed than spontaneous datasets, regardless of gender. Furthermore, there is a\nsmoothing of intensity up to 80 percent for AU6 and 45 percent for AU12 when a\nreal face is transformed into CG.\n","authors":["Vitor Miguel Xavier Peres","Greice Pinho Dal Molin","Soraia Raupp Musse"],"pdf_url":"https://arxiv.org/pdf/2312.02128v1.pdf","comment":"To be published in Simp\\'osio Brasileiro de Jogos e Entretenimento\n Digital - SBGames 2023"},{"id":"http://arxiv.org/abs/2312.02126v1","updated":"2023-12-04T18:53:24Z","published":"2023-12-04T18:53:24Z","title":"SplaTAM: Splat, Track & Map 3D Gaussians for Dense RGB-D SLAM","summary":" Dense simultaneous localization and mapping (SLAM) is pivotal for embodied\nscene understanding. Recent work has shown that 3D Gaussians enable\nhigh-quality reconstruction and real-time rendering of scenes using multiple\nposed cameras. In this light, we show for the first time that representing a\nscene by 3D Gaussians can enable dense SLAM using a single unposed monocular\nRGB-D camera. Our method, SplaTAM, addresses the limitations of prior radiance\nfield-based representations, including fast rendering and optimization, the\nability to determine if areas have been previously mapped, and structured map\nexpansion by adding more Gaussians. 
We employ an online tracking and mapping\npipeline while tailoring it to specifically use an underlying Gaussian\nrepresentation and silhouette-guided optimization via differentiable rendering.\nExtensive experiments show that SplaTAM achieves up to 2X state-of-the-art\nperformance in camera pose estimation, map construction, and novel-view\nsynthesis, demonstrating its superiority over existing approaches, while\nallowing real-time rendering of a high-resolution dense 3D map.\n","authors":["Nikhil Keetha","Jay Karhade","Krishna Murthy Jatavallabhula","Gengshan Yang","Sebastian Scherer","Deva Ramanan","Jonathon Luiten"],"pdf_url":"https://arxiv.org/pdf/2312.02126v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02124v1","updated":"2023-12-04T18:51:44Z","published":"2023-12-04T18:51:44Z","title":"VerA: Versatile Anonymization Fit for Clinical Facial Images","summary":" The escalating legislative demand for data privacy in facial image\ndissemination has underscored the significance of image anonymization. Recent\nadvancements in the field surpass traditional pixelation or blur methods, yet\nthey predominantly address regular single images. This leaves clinical image\nanonymization -- a necessity for illustrating medical interventions -- largely\nunaddressed. We present VerA, a versatile facial image anonymization that is\nfit for clinical facial images where: (1) certain semantic areas must be\npreserved to show medical intervention results, and (2) anonymizing image pairs\nis crucial for showing before-and-after results. VerA outperforms or is on par\nwith state-of-the-art methods in de-identification and photorealism for regular\nimages. In addition, we validate our results on paired anonymization, and on\nthe anonymization of both single and paired clinical images with extensive\nquantitative and qualitative evaluation.\n","authors":["Majed El Helou","Doruk Cetin","Petar Stamenkovic","Fabio Zund"],"pdf_url":"https://arxiv.org/pdf/2312.02124v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02121v1","updated":"2023-12-04T18:50:41Z","published":"2023-12-04T18:50:41Z","title":"Mathematical Supplement for the $\\texttt{gsplat}$ Library","summary":" This report provides the mathematical details of the gsplat library, a\nmodular toolbox for efficient differentiable Gaussian splatting, as proposed by\nKerbl et al. It provides a self-contained reference for the computations\ninvolved in the forward and backward passes of differentiable Gaussian\nsplatting. To facilitate practical usage and development, we provide a user\nfriendly Python API that exposes each component of the forward and backward\npasses in rasterization at github.com/nerfstudio-project/gsplat .\n","authors":["Vickie Ye","Angjoo Kanazawa"],"pdf_url":"https://arxiv.org/pdf/2312.02121v1.pdf","comment":"Find the library at: https://docs.gsplat.studio/"},{"id":"http://arxiv.org/abs/2312.02116v1","updated":"2023-12-04T18:48:02Z","published":"2023-12-04T18:48:02Z","title":"GIVT: Generative Infinite-Vocabulary Transformers","summary":" We introduce generative infinite-vocabulary transformers (GIVT) which\ngenerate vector sequences with real-valued entries, instead of discrete tokens\nfrom a finite vocabulary. 
To this end, we propose two surprisingly simple\nmodifications to decoder-only transformers: 1) at the input, we replace the\nfinite-vocabulary lookup table with a linear projection of the input vectors;\nand 2) at the output, we replace the logits prediction (usually mapped to a\ncategorical distribution) with the parameters of a multivariate Gaussian\nmixture model. Inspired by the image-generation paradigm of VQ-GAN and MaskGIT,\nwhere transformers are used to model the discrete latent sequences of a VQ-VAE,\nwe use GIVT to model the unquantized real-valued latent sequences of a VAE.\nWhen applying GIVT to class-conditional image generation with iterative masked\nmodeling, we show competitive results with MaskGIT, while our approach\noutperforms both VQ-GAN and MaskGIT when using it for causal modeling. Finally,\nwe obtain competitive results outside of image generation when applying our\napproach to panoptic segmentation and depth estimation with a VAE-based variant\nof the UViM framework.\n","authors":["Michael Tschannen","Cian Eastwood","Fabian Mentzer"],"pdf_url":"https://arxiv.org/pdf/2312.02116v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02111v1","updated":"2023-12-04T18:43:45Z","published":"2023-12-04T18:43:45Z","title":"TriDeNT: Triple Deep Network Training for Privileged Knowledge\n Distillation in Histopathology","summary":" Computational pathology models rarely utilise data that will not be available\nfor inference. This means most models cannot learn from highly informative data\nsuch as additional immunohistochemical (IHC) stains and spatial\ntranscriptomics. We present TriDeNT, a novel self-supervised method for\nutilising privileged data that is not available during inference to improve\nperformance. We demonstrate the efficacy of this method for a range of\ndifferent paired data including immunohistochemistry, spatial transcriptomics\nand expert nuclei annotations. In all settings, TriDeNT outperforms other\nstate-of-the-art methods in downstream tasks, with observed improvements of up\nto 101%. Furthermore, we provide qualitative and quantitative measurements of\nthe features learned by these models and how they differ from baselines.\nTriDeNT offers a novel method to distil knowledge from scarce or costly data\nduring training, to create significantly better models for routine inputs.\n","authors":["Lucas Farndale","Robert Insall","Ke Yuan"],"pdf_url":"https://arxiv.org/pdf/2312.02111v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04527v3","updated":"2023-12-04T18:42:25Z","published":"2023-06-07T15:36:26Z","title":"ContriMix: Unsupervised disentanglement of content and attribute for\n domain generalization in microscopy image analysis","summary":" Domain generalization is critical for real-world applications of machine\nlearning to microscopy images, including histopathology and fluorescence\nimaging. Artifacts in these modalities arise through a complex combination of\nfactors relating to tissue collection and laboratory processing, as well as\nfactors intrinsic to patient samples. In fluorescence imaging, these artifacts\nstem from variations across experimental batches. The complexity and subtlety\nof these artifacts make the enumeration of data domains intractable. Therefore,\naugmentation-based methods of domain generalization that require domain\nidentifiers and manual fine-tuning are inadequate in this setting. 
To overcome\nthis challenge, we introduce ContriMix, a domain generalization technique that\nlearns to generate synthetic images by disentangling and permuting the\nbiological content (\"content\") and technical variations (\"attributes\") in\nmicroscopy images. ContriMix does not rely on domain identifiers or handcrafted\naugmentations and makes no assumptions about the input characteristics of\nimages. We assess the performance of ContriMix on two pathology datasets\ndealing with patch classification and Whole Slide Image label prediction tasks\nrespectively (Camelyon17-WILDS and RCC subtyping), and one fluorescence\nmicroscopy dataset (RxRx1-WILDS). Without any access to domain identifiers at\ntrain or test time, ContriMix performs similar or better than current\nstate-of-the-art methods in all these datasets, motivating its usage for\nmicroscopy image analysis in real-world settings where domain information is\nhard to come by. The code for ContriMix can be found at\nhttps://gitlab.com/huutan86/contrimix\n","authors":["Tan H. Nguyen","Dinkar Juyal","Jin Li","Aaditya Prakash","Shima Nofallah","Chintan Shah","Sai Chowdary Gullapally","Limin Yu","Michael Griffin","Anand Sampat","John Abel","Justin Lee","Amaro Taylor-Weiner"],"pdf_url":"https://arxiv.org/pdf/2306.04527v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.04517v3","updated":"2023-12-04T18:39:07Z","published":"2023-01-11T15:31:15Z","title":"A new sampling methodology for defining heterogeneous subsets of samples\n for training image segmentation algorithms","summary":" Creating a dataset for training supervised machine learning algorithms can be\na demanding task. This is especially true for medical image segmentation since\none or more specialists are usually required for image annotation, and creating\nground truth labels for just a single image can take up to several hours. In\naddition, it is paramount that the annotated samples represent well the\ndifferent conditions that might affect the imaged tissues as well as possible\nchanges in the image acquisition process. This can only be achieved by\nconsidering samples that are typical in the dataset as well as atypical, or\neven outlier, samples. We introduce a new sampling methodology for selecting\nrelevant images from a large dataset in a way that evenly considers both\nprototypical as well as atypical samples. The methodology involves the\ngeneration of a uniform grid from a feature space representing the samples,\nwhich is then used for randomly drawing relevant images. The selected images\nprovide a uniform covering of the original dataset, and thus define a\nheterogeneous set of images that can be annotated and used for training\nsupervised segmentation algorithms. We provide a case example by creating a\ndataset containing a representative set of blood vessel microscopy images\nselected from a larger dataset containing thousands of images. The dataset,\nwhich we call VessMAP, is being made available online to aid the development of\nnew blood vessel segmentation algorithms.\n","authors":["Matheus Viana da Silva","Natália de Carvalho Santos","Julie Ouellette","Baptiste Lacoste","Cesar Henrique Comin"],"pdf_url":"https://arxiv.org/pdf/2301.04517v3.pdf","comment":"10 pages, 9 figures. 
Under review"},{"id":"http://arxiv.org/abs/2312.02109v1","updated":"2023-12-04T18:39:00Z","published":"2023-12-04T18:39:00Z","title":"ArtAdapter: Text-to-Image Style Transfer using Multi-Level Style Encoder\n and Explicit Adaptation","summary":" This work introduces ArtAdapter, a transformative text-to-image (T2I) style\ntransfer framework that transcends traditional limitations of color,\nbrushstrokes, and object shape, capturing high-level style elements such as\ncomposition and distinctive artistic expression. The integration of a\nmulti-level style encoder with our proposed explicit adaptation mechanism\nenables ArtAdapter to achieve unprecedented fidelity in style transfer, ensuring\nclose alignment with textual descriptions. Additionally, the incorporation of\nan Auxiliary Content Adapter (ACA) effectively separates content from style,\nalleviating the borrowing of content from style references. Moreover, our novel\nfast finetuning approach could further enhance zero-shot style representation\nwhile mitigating the risk of overfitting. Comprehensive evaluations confirm\nthat ArtAdapter surpasses current state-of-the-art methods.\n","authors":["Dar-Yen Chen","Hamish Tennent","Ching-Wen Hsu"],"pdf_url":"https://arxiv.org/pdf/2312.02109v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02103v1","updated":"2023-12-04T18:29:03Z","published":"2023-12-04T18:29:03Z","title":"Learning Pseudo-Labeler beyond Noun Concepts for Open-Vocabulary Object\n Detection","summary":" Open-vocabulary object detection (OVOD) has recently gained significant\nattention as a crucial step toward achieving human-like visual intelligence.\nExisting OVOD methods extend target vocabulary from pre-defined categories to\nopen-world by transferring knowledge of arbitrary concepts from vision-language\npre-training models to the detectors. While previous methods have shown\nremarkable successes, they suffer from indirect supervision or limited\ntransferable concepts. In this paper, we propose a simple yet effective method\nto directly learn region-text alignment for arbitrary concepts. Specifically,\nthe proposed method aims to learn arbitrary image-to-text mapping for\npseudo-labeling of arbitrary concepts, named Pseudo-Labeling for Arbitrary\nConcepts (PLAC). The proposed method shows competitive performance on the\nstandard OVOD benchmark for noun concepts and a large improvement on referring\nexpression comprehension benchmark for arbitrary concepts.\n","authors":["Sunghun Kang","Junbum Cha","Jonghwan Mun","Byungseok Roh","Chang D. Yoo"],"pdf_url":"https://arxiv.org/pdf/2312.02103v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02087v1","updated":"2023-12-04T17:58:06Z","published":"2023-12-04T17:58:06Z","title":"VideoSwap: Customized Video Subject Swapping with Interactive Semantic\n Point Correspondence","summary":" Current diffusion-based video editing primarily focuses on\nstructure-preserved editing by utilizing various dense correspondences to\nensure temporal consistency and motion alignment. However, these approaches are\noften ineffective when the target edit involves a shape change. To embark on\nvideo editing with shape change, we explore customized video subject swapping\nin this work, where we aim to replace the main subject in a source video with a\ntarget subject having a distinct identity and potentially different shape. 
In\ncontrast to previous methods that rely on dense correspondences, we introduce\nthe VideoSwap framework that exploits semantic point correspondences, inspired\nby our observation that only a small number of semantic points are necessary to\nalign the subject's motion trajectory and modify its shape. We also introduce\nvarious user-point interactions (\\eg, removing points and dragging points) to\naddress various semantic point correspondence. Extensive experiments\ndemonstrate state-of-the-art video subject swapping results across a variety of\nreal-world videos.\n","authors":["Yuchao Gu","Yipin Zhou","Bichen Wu","Licheng Yu","Jia-Wei Liu","Rui Zhao","Jay Zhangjie Wu","David Junhao Zhang","Mike Zheng Shou","Kevin Tang"],"pdf_url":"https://arxiv.org/pdf/2312.02087v1.pdf","comment":"Project page at https://videoswap.github.io"},{"id":"http://arxiv.org/abs/2309.13550v3","updated":"2023-12-04T17:54:05Z","published":"2023-09-24T04:48:44Z","title":"I-AI: A Controllable & Interpretable AI System for Decoding\n Radiologists' Intense Focus for Accurate CXR Diagnoses","summary":" In the field of chest X-ray (CXR) diagnosis, existing works often focus\nsolely on determining where a radiologist looks, typically through tasks such\nas detection, segmentation, or classification. However, these approaches are\noften designed as black-box models, lacking interpretability. In this paper, we\nintroduce Interpretable Artificial Intelligence (I-AI) a novel and unified\ncontrollable interpretable pipeline for decoding the intense focus of\nradiologists in CXR diagnosis. Our I-AI addresses three key questions: where a\nradiologist looks, how long they focus on specific areas, and what findings\nthey diagnose. By capturing the intensity of the radiologist's gaze, we provide\na unified solution that offers insights into the cognitive process underlying\nradiological interpretation. Unlike current methods that rely on black-box\nmachine learning models, which can be prone to extracting erroneous information\nfrom the entire input image during the diagnosis process, we tackle this issue\nby effectively masking out irrelevant information. Our proposed I-AI leverages\na vision-language model, allowing for precise control over the interpretation\nprocess while ensuring the exclusion of irrelevant features. To train our I-AI\nmodel, we utilize an eye gaze dataset to extract anatomical gaze information\nand generate ground truth heatmaps. Through extensive experimentation, we\ndemonstrate the efficacy of our method. We showcase that the attention\nheatmaps, designed to mimic radiologists' focus, encode sufficient and relevant\ninformation, enabling accurate classification tasks using only a portion of\nCXR.\n","authors":["Trong Thang Pham","Jacob Brecheisen","Anh Nguyen","Hien Nguyen","Ngan Le"],"pdf_url":"https://arxiv.org/pdf/2309.13550v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02078v1","updated":"2023-12-04T17:41:52Z","published":"2023-12-04T17:41:52Z","title":"Integrating AI into CCTV Systems: A Comprehensive Evaluation of Smart\n Video Surveillance in Community Space","summary":" This article presents an AI-enabled Smart Video Surveillance (SVS) designed\nto enhance safety in community spaces such as educational and recreational\nareas, and small businesses. The proposed system innovatively integrates with\nexisting CCTV and wired camera networks, simplifying its adoption across\nvarious community cases to leverage recent AI advancements. 
Our SVS system,\nfocusing on privacy, uses metadata instead of pixel data for activity\nrecognition, aligning with ethical standards. It features cloud-based\ninfrastructure and a mobile app for real-time, privacy-conscious alerts in\ncommunities.\n This article notably pioneers a comprehensive real-world evaluation of the\nSVS system, covering AI-driven visual processing, statistical analysis,\ndatabase management, cloud communication, and user notifications. It's also the\nfirst to assess an end-to-end anomaly detection system's performance, vital for\nidentifying potential public safety incidents.\n For our evaluation, we implemented the system in a community college, serving\nas an ideal model to exemplify the proposed system's capabilities. Our findings\nin this setting demonstrate the system's robustness, with throughput, latency,\nand scalability effectively managing 16 CCTV cameras. The system maintained a\nconsistent 16.5 frames per second (FPS) over a 21-hour operation. The average\nend-to-end latency for detecting behavioral anomalies and alerting users was\n26.76 seconds.\n","authors":["Shanle Yao","Babak Rahimi Ardabili","Armin Danesh Pazho","Ghazal Alinezhad Noghre","Christopher Neff","Hamed Tabkhi"],"pdf_url":"https://arxiv.org/pdf/2312.02078v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18116v2","updated":"2023-12-04T17:38:31Z","published":"2023-10-27T13:02:12Z","title":"Direct Unsupervised Denoising","summary":" Traditional supervised denoisers are trained using pairs of noisy input and\nclean target images. They learn to predict a central tendency of the posterior\ndistribution over possible clean images. When, e.g., trained with the popular\nquadratic loss function, the network's output will correspond to the minimum\nmean square error (MMSE) estimate. Unsupervised denoisers based on Variational\nAutoEncoders (VAEs) have succeeded in achieving state-of-the-art results while\nrequiring only unpaired noisy data as training input. In contrast to the\ntraditional supervised approach, unsupervised denoisers do not directly produce\na single prediction, such as the MMSE estimate, but allow us to draw samples\nfrom the posterior distribution of clean solutions corresponding to the noisy\ninput. To approximate the MMSE estimate during inference, unsupervised methods\nhave to create and draw a large number of samples - a computationally expensive\nprocess - rendering the approach inapplicable in many situations. Here, we\npresent an alternative approach that trains a deterministic network alongside\nthe VAE to directly predict a central tendency. Our method achieves results\nthat surpass the results achieved by the unsupervised method at a fraction of\nthe computational cost.\n","authors":["Benjamin Salmon","Alexander Krull"],"pdf_url":"https://arxiv.org/pdf/2310.18116v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02069v1","updated":"2023-12-04T17:28:35Z","published":"2023-12-04T17:28:35Z","title":"GaussianAvatars: Photorealistic Head Avatars with Rigged 3D Gaussians","summary":" We introduce GaussianAvatars, a new method to create photorealistic head\navatars that are fully controllable in terms of expression, pose, and\nviewpoint. The core idea is a dynamic 3D representation based on 3D Gaussian\nsplats that are rigged to a parametric morphable face model. 
This combination\nfacilitates photorealistic rendering while allowing for precise animation\ncontrol via the underlying parametric model, e.g., through expression transfer\nfrom a driving sequence or by manually changing the morphable model parameters.\nWe parameterize each splat by a local coordinate frame of a triangle and\noptimize for explicit displacement offset to obtain a more accurate geometric\nrepresentation. During avatar reconstruction, we jointly optimize for the\nmorphable model parameters and Gaussian splat parameters in an end-to-end\nfashion. We demonstrate the animation capabilities of our photorealistic avatar\nin several challenging scenarios. For instance, we show reenactments from a\ndriving video, where our method outperforms existing works by a significant\nmargin.\n","authors":["Shenhan Qian","Tobias Kirschstein","Liam Schoneveld","Davide Davoli","Simon Giebenhain","Matthias Nießner"],"pdf_url":"https://arxiv.org/pdf/2312.02069v1.pdf","comment":"Project page: https://shenhanqian.github.io/gaussian-avatars"},{"id":"http://arxiv.org/abs/2312.02052v1","updated":"2023-12-04T17:10:25Z","published":"2023-12-04T17:10:25Z","title":"DUCK: Distance-based Unlearning via Centroid Kinematics","summary":" Machine Unlearning is rising as a new field, driven by the pressing necessity\nof ensuring privacy in modern artificial intelligence models. This technique\nprimarily aims to eradicate any residual influence of a specific subset of data\nfrom the knowledge acquired by a neural model during its training. This work\nintroduces a novel unlearning algorithm, denoted as Distance-based Unlearning\nvia Centroid Kinematics (DUCK), which employs metric learning to guide the\nremoval of samples matching the nearest incorrect centroid in the embedding\nspace. Evaluation of the algorithm's performance is conducted across various\nbenchmark datasets in two distinct scenarios, class removal, and homogeneous\nsampling removal, obtaining state-of-the-art performance. We introduce a novel\nmetric, called Adaptive Unlearning Score (AUS), encompassing not only the\nefficacy of the unlearning process in forgetting target data but also\nquantifying the performance loss relative to the original model. Moreover, we\npropose a novel membership inference attack to assess the algorithm's capacity\nto erase previously acquired knowledge, designed to be adaptable to future\nmethodologies.\n","authors":["Marco Cotogni","Jacopo Bonato","Luigi Sabetta","Francesco Pelosin","Alessandro Nicolosi"],"pdf_url":"https://arxiv.org/pdf/2312.02052v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02051v1","updated":"2023-12-04T17:09:52Z","published":"2023-12-04T17:09:52Z","title":"TimeChat: A Time-sensitive Multimodal Large Language Model for Long\n Video Understanding","summary":" This work proposes TimeChat, a time-sensitive multimodal large language model\nspecifically designed for long video understanding. Our model incorporates two\nkey architectural contributions: (1) a timestamp-aware frame encoder that binds\nvisual content with the timestamp of each frame, and (2) a sliding video\nQ-Former that produces a video token sequence of varying lengths to accommodate\nvideos of various durations. Additionally, we construct an instruction-tuning\ndataset, encompassing 6 tasks and a total of 125K instances, to further enhance\nTimeChat's instruction-following performance. 
Experiment results across various\nvideo understanding tasks, such as dense captioning, temporal grounding, and\nhighlight detection, demonstrate TimeChat's strong zero-shot temporal\nlocalization and reasoning capabilities. For example, it achieves +9.2 F1 score\nand +2.8 CIDEr on YouCook2, +5.8 HIT@1 on QVHighlights, and +27.5 R@1 (IoU=0.5)\non Charades-STA, compared to state-of-the-art video large language models,\nholding the potential to serve as a versatile video assistant for long-form\nvideo comprehension tasks and satisfy realistic user requirements.\n","authors":["Shuhuai Ren","Linli Yao","Shicheng Li","Xu Sun","Lu Hou"],"pdf_url":"https://arxiv.org/pdf/2312.02051v1.pdf","comment":"17 pages, 10 figures, code is available at\n https://github.com/RenShuhuai-Andy/TimeChat"},{"id":"http://arxiv.org/abs/2307.02129v3","updated":"2023-12-04T17:06:12Z","published":"2023-07-05T09:11:09Z","title":"How Deep Neural Networks Learn Compositional Data: The Random Hierarchy\n Model","summary":" Deep learning algorithms demonstrate a surprising ability to learn\nhigh-dimensional tasks from limited examples. This is commonly attributed to\nthe depth of neural networks, enabling them to build a hierarchy of abstract,\nlow-dimensional data representations. However, how many training examples are\nrequired to learn such representations remains unknown. To quantitatively study\nthis question, we introduce the Random Hierarchy Model: a family of synthetic\ntasks inspired by the hierarchical structure of language and images. The model\nis a classification task where each class corresponds to a group of high-level\nfeatures, chosen among several equivalent groups associated with the same\nclass. In turn, each feature corresponds to a group of sub-features chosen\namong several equivalent ones and so on, following a hierarchy of composition\nrules. We find that deep networks learn the task by developing internal\nrepresentations invariant to exchanging equivalent groups. Moreover, the number\nof data required corresponds to the point where correlations between low-level\nfeatures and classes become detectable. Overall, our results indicate how deep\nnetworks overcome the curse of dimensionality by building invariant\nrepresentations, and provide an estimate of the number of data required to\nlearn a hierarchical task.\n","authors":["Francesco Cagnetta","Leonardo Petrini","Umberto M. Tomasini","Alessandro Favero","Matthieu Wyart"],"pdf_url":"https://arxiv.org/pdf/2307.02129v3.pdf","comment":"9 pages, 8 figures"},{"id":"http://arxiv.org/abs/2311.11629v2","updated":"2023-12-04T17:01:20Z","published":"2023-11-20T09:28:04Z","title":"Generating Realistic Counterfactuals for Retinal Fundus and OCT Images\n using Diffusion Models","summary":" Counterfactual reasoning is often used in clinical settings to explain\ndecisions or weigh alternatives. Therefore, for imaging based specialties such\nas ophthalmology, it would be beneficial to be able to create counterfactual\nimages, illustrating answers to questions like \"If the subject had had diabetic\nretinopathy, how would the fundus image have looked?\". Here, we demonstrate\nthat using a diffusion model in combination with an adversarially robust\nclassifier trained on retinal disease classification tasks enables the\ngeneration of highly realistic counterfactuals of retinal fundus images and\noptical coherence tomography (OCT) B-scans. 
The key to the realism of\ncounterfactuals is that these classifiers encode salient features indicative\nfor each disease class and can steer the diffusion model to depict disease\nsigns or remove disease-related lesions in a realistic way. In a user study,\ndomain experts also found the counterfactuals generated using our method\nsignificantly more realistic than counterfactuals generated from a previous\nmethod, and even indistinguishable from real images.\n","authors":["Indu Ilanchezian","Valentyn Boreiko","Laura Kühlewein","Ziwei Huang","Murat Seçkin Ayhan","Matthias Hein","Lisa Koch","Philipp Berens"],"pdf_url":"https://arxiv.org/pdf/2311.11629v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04848v2","updated":"2023-12-04T17:00:18Z","published":"2023-06-08T00:56:33Z","title":"Interpreting and Improving Diffusion Models Using the Euclidean Distance\n Function","summary":" Denoising is intuitively related to projection. Indeed, under the manifold\nhypothesis, adding random noise is approximately equivalent to orthogonal\nperturbation. Hence, learning to denoise is approximately learning to project.\nIn this paper, we use this observation to reinterpret denoising diffusion\nmodels as approximate gradient descent applied to the Euclidean distance\nfunction. We then provide straight-forward convergence analysis of the DDIM\nsampler under simple assumptions on the projection-error of the denoiser.\nFinally, we propose a new sampler based on two simple modifications to DDIM\nusing insights from our theoretical results. In as few as 5-10 function\nevaluations, our sampler achieves state-of-the-art FID scores on pretrained\nCIFAR-10 and CelebA models and can generate high quality samples on latent\ndiffusion models.\n","authors":["Frank Permenter","Chenyang Yuan"],"pdf_url":"https://arxiv.org/pdf/2306.04848v2.pdf","comment":"19 pages, 8 figures, 2 tables"},{"id":"http://arxiv.org/abs/2312.02029v1","updated":"2023-12-04T16:51:23Z","published":"2023-12-04T16:51:23Z","title":"Implicit Learning of Scene Geometry from Poses for Global Localization","summary":" Global visual localization estimates the absolute pose of a camera using a\nsingle image, in a previously mapped area. Obtaining the pose from a single\nimage enables many robotics and augmented/virtual reality applications.\nInspired by latest advances in deep learning, many existing approaches directly\nlearn and regress 6 DoF pose from an input image. However, these methods do not\nfully utilize the underlying scene geometry for pose regression. The challenge\nin monocular relocalization is the minimal availability of supervised training\ndata, which is just the corresponding 6 DoF poses of the images. In this paper,\nwe propose to utilize these minimal available labels (i.e., poses) to learn the\nunderlying 3D geometry of the scene and use the geometry to estimate the 6 DoF\ncamera pose. We present a learning method that uses these pose labels and rigid\nalignment to learn two 3D geometric representations (\textit{X, Y, Z\ncoordinates}) of the scene, one in camera coordinate frame and the other in\nglobal coordinate frame. Given a single image, it estimates these two 3D scene\nrepresentations, which are then aligned to estimate a pose that matches the\npose label. 
This formulation allows for the active inclusion of additional\nlearning constraints to minimize 3D alignment errors between the two 3D scene\nrepresentations, and 2D re-projection errors between the 3D global scene\nrepresentation and 2D image pixels, resulting in improved localization\naccuracy. During inference, our model estimates the 3D scene geometry in camera\nand global frames and aligns them rigidly to obtain pose in real-time. We\nevaluate our work on three common visual localization datasets, conduct\nablation studies, and show that our method exceeds state-of-the-art regression\nmethods' pose accuracy on all datasets.\n","authors":["Mohammad Altillawi","Shile Li","Sai Manoj Prakhya","Ziyuan Liu","Joan Serrat"],"pdf_url":"https://arxiv.org/pdf/2312.02029v1.pdf","comment":"IEEE ROBOTICS AND AUTOMATION LETTERS. ACCEPTED NOVEMBER, 2023"},{"id":"http://arxiv.org/abs/2311.17465v3","updated":"2023-12-04T16:49:18Z","published":"2023-11-29T09:13:00Z","title":"AgentAvatar: Disentangling Planning, Driving and Rendering for\n Photorealistic Avatar Agents","summary":" In this study, our goal is to create interactive avatar agents that can\nautonomously plan and animate nuanced facial movements realistically, from both\nvisual and behavioral perspectives. Given high-level inputs about the\nenvironment and agent profile, our framework harnesses LLMs to produce a series\nof detailed text descriptions of the avatar agents' facial motions. These\ndescriptions are then processed by our task-agnostic driving engine into motion\ntoken sequences, which are subsequently converted into continuous motion\nembeddings that are further consumed by our standalone neural-based renderer to\ngenerate the final photorealistic avatar animations. These streamlined\nprocesses allow our framework to adapt to a variety of non-verbal avatar\ninteractions, both monadic and dyadic. Our extensive study, which includes\nexperiments on both newly compiled and existing datasets featuring two types of\nagents -- one capable of monadic interaction with the environment, and the\nother designed for dyadic conversation -- validates the effectiveness and\nversatility of our approach. To our knowledge, we advanced a leap step by\ncombining LLMs and neural rendering for generalized non-verbal prediction and\nphoto-realistic rendering of avatar agents.\n","authors":["Duomin Wang","Bin Dai","Yu Deng","Baoyuan Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17465v3.pdf","comment":"Project page: https://dorniwang.github.io/AgentAvatar_project/"},{"id":"http://arxiv.org/abs/2312.02021v1","updated":"2023-12-04T16:46:38Z","published":"2023-12-04T16:46:38Z","title":"VLTSeg: Simple Transfer of CLIP-Based Vision-Language Representations\n for Domain Generalized Semantic Segmentation","summary":" Domain generalization (DG) remains a significant challenge for perception\nbased on deep neural networks (DNN), where domain shifts occur due to lighting,\nweather, or geolocation changes. In this work, we propose VLTSeg to enhance\ndomain generalization in semantic segmentation, where the network is solely\ntrained on the source domain and evaluated on unseen target domains. Our method\nleverages the inherent semantic robustness of vision-language models. First, by\nsubstituting traditional vision-only backbones with pre-trained encoders from\nCLIP and EVA-CLIP as transfer learning setting we find that in the field of DG,\nvision-language pre-training significantly outperforms supervised and\nself-supervised vision pre-training. 
We thus propose a new vision-language\napproach for domain generalized segmentation, which improves the domain\ngeneralization SOTA by 7.6% mIoU when training on the synthetic GTA5 dataset.\nWe further show the superior generalization capabilities of vision-language\nsegmentation models by reaching 76.48% mIoU on the popular Cityscapes-to-ACDC\nbenchmark, outperforming the previous SOTA approach by 6.9% mIoU on the test\nset at the time of writing. Additionally, our approach shows strong in-domain\ngeneralization capabilities indicated by 86.1% mIoU on the Cityscapes test set,\nresulting in a shared first place with the previous SOTA on the current\nleaderboard at the time of submission.\n","authors":["Christoph Hümmer","Manuel Schwonberg","Liangwei Zhong","Hu Cao","Alois Knoll","Hanno Gottschalk"],"pdf_url":"https://arxiv.org/pdf/2312.02021v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16849v2","updated":"2023-12-04T16:44:45Z","published":"2023-09-28T20:59:51Z","title":"Space-Time Attention with Shifted Non-Local Search","summary":" Efficiently computing attention maps for videos is challenging due to the\nmotion of objects between frames. While a standard non-local search is\nhigh-quality for a window surrounding each query point, the window's small size\ncannot accommodate motion. Methods for long-range motion use an auxiliary\nnetwork to predict the most similar key coordinates as offsets from each query\nlocation. However, accurately predicting this flow field of offsets remains\nchallenging, even for large-scale networks. Small spatial inaccuracies\nsignificantly impact the attention module's quality. This paper proposes a\nsearch strategy that combines the quality of a non-local search with the range\nof predicted offsets. The method, named Shifted Non-Local Search, executes a\nsmall grid search surrounding the predicted offsets to correct small spatial\nerrors. Our method's in-place computation consumes 10 times less memory and is\nover 3 times faster than previous work. Experimentally, correcting the small\nspatial errors improves the video frame alignment quality by over 3 dB PSNR.\nOur search upgrades existing space-time attention modules, which improves video\ndenoising results by 0.30 dB PSNR for a 7.5% increase in overall runtime. We\nintegrate our space-time attention module into a UNet-like architecture to\nachieve state-of-the-art results on video denoising.\n","authors":["Kent Gauen","Stanley Chan"],"pdf_url":"https://arxiv.org/pdf/2309.16849v2.pdf","comment":"15 pages, 12 figures"},{"id":"http://arxiv.org/abs/2312.02017v1","updated":"2023-12-04T16:40:53Z","published":"2023-12-04T16:40:53Z","title":"A multi-channel cycleGAN for CBCT to CT synthesis","summary":" Image synthesis is used to generate synthetic CTs (sCTs) from on-treatment\ncone-beam CTs (CBCTs) with a view to improving image quality and enabling\naccurate dose computation to facilitate a CBCT-based adaptive radiotherapy\nworkflow. As this area of research gains momentum, developments in sCT\ngeneration methods are difficult to compare due to the lack of large public\ndatasets and sizeable variation in training procedures. To compare and assess\nthe latest advancements in sCT generation, the SynthRAD2023 challenge provides\na public dataset and evaluation framework for both MR and CBCT to sCT\nsynthesis. 
Our contribution focuses on the second task, CBCT-to-sCT synthesis.\nBy leveraging a multi-channel input to emphasize specific image features, our\napproach effectively addresses some of the challenges inherent in CBCT imaging,\nwhilst restoring the contrast necessary for accurate visualisation of patients'\nanatomy. Additionally, we introduce an auxiliary fusion network to further\nenhance the fidelity of generated sCT images.\n","authors":["Chelsea A. H. Sargeant","Edward G. A. Henderson","Dónal M. McSweeney","Aaron G. Rankin","Denis Page"],"pdf_url":"https://arxiv.org/pdf/2312.02017v1.pdf","comment":"RRRocket_Lollies submission for the Synthesizing computed tomography\n for radiotherapy (SynthRAD2023) Challenge at MICCAI 2023"},{"id":"http://arxiv.org/abs/2311.13398v2","updated":"2023-12-04T16:40:44Z","published":"2023-11-22T13:53:04Z","title":"Depth-Regularized Optimization for 3D Gaussian Splatting in Few-Shot\n Images","summary":" In this paper, we present a method to optimize Gaussian splatting with a\nlimited number of images while avoiding overfitting. Representing a 3D scene by\ncombining numerous Gaussian splats has yielded outstanding visual quality.\nHowever, it tends to overfit the training views when only a small number of\nimages are available. To address this issue, we introduce a dense depth map as\na geometry guide to mitigate overfitting. We obtained the depth map using a\npre-trained monocular depth estimation model and aligning the scale and offset\nusing sparse COLMAP feature points. The adjusted depth aids in the color-based\noptimization of 3D Gaussian splatting, mitigating floating artifacts, and\nensuring adherence to geometric constraints. We verify the proposed method on\nthe NeRF-LLFF dataset with varying numbers of few images. Our approach\ndemonstrates robust geometry compared to the original method that relies solely\non images. Project page: robot0321.github.io/DepthRegGS\n","authors":["Jaeyoung Chung","Jeongtaek Oh","Kyoung Mu Lee"],"pdf_url":"https://arxiv.org/pdf/2311.13398v2.pdf","comment":"10 pages, 5 figures; Project page: robot0321.github.io/DepthRegGS"},{"id":"http://arxiv.org/abs/2312.02015v1","updated":"2023-12-04T16:38:16Z","published":"2023-12-04T16:38:16Z","title":"ColonNeRF: Neural Radiance Fields for High-Fidelity Long-Sequence\n Colonoscopy Reconstruction","summary":" Colonoscopy reconstruction is pivotal for diagnosing colorectal cancer.\nHowever, accurate long-sequence colonoscopy reconstruction faces three major\nchallenges: (1) dissimilarity among segments of the colon due to its meandering\nand convoluted shape; (2) co-existence of simple and intricately folded\ngeometry structures; (3) sparse viewpoints due to constrained camera\ntrajectories. To tackle these challenges, we introduce a new reconstruction\nframework based on neural radiance field (NeRF), named ColonNeRF, which\nleverages neural rendering for novel view synthesis of long-sequence\ncolonoscopy. Specifically, to reconstruct the entire colon in a piecewise\nmanner, our ColonNeRF introduces a region division and integration module,\neffectively reducing shape dissimilarity and ensuring geometric consistency in\neach segment. To learn both the simple and complex geometry in a unified\nframework, our ColonNeRF incorporates a multi-level fusion module that\nprogressively models the colon regions from easy to hard. Additionally, to\novercome the challenges from sparse views, we devise a DensiNet module for\ndensifying camera poses under the guidance of semantic consistency. 
We conduct\nextensive experiments on both synthetic and real-world datasets to evaluate our\nColonNeRF. Quantitatively, our ColonNeRF outperforms existing methods on two\nbenchmarks over four evaluation metrics. Notably, our LPIPS-ALEX scores exhibit\na substantial increase of about 67%-85% on the SimCol-to-3D dataset.\nQualitatively, our reconstruction visualizations show much clearer textures and\nmore accurate geometric details. These sufficiently demonstrate our superior\nperformance over the state-of-the-art methods.\n","authors":["Yufei Shi","Beijia Lu","Jia-Wei Liu","Ming Li","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2312.02015v1.pdf","comment":"for Project Page, see https://showlab.github.io/ColonNeRF/"},{"id":"http://arxiv.org/abs/2312.02010v1","updated":"2023-12-04T16:32:51Z","published":"2023-12-04T16:32:51Z","title":"Towards Learning a Generalist Model for Embodied Navigation","summary":" Building a generalist agent that can interact with the world is the\nintriguing target of AI systems, thus spurring the research for embodied\nnavigation, where an agent is required to navigate according to instructions or\nrespond to queries. Despite the major progress attained, previous works\nprimarily focus on task-specific agents and lack generalizability to unseen\nscenarios. Recently, LLMs have presented remarkable capabilities across various\nfields, and provided a promising opportunity for embodied navigation. Drawing\non this, we propose the first generalist model for embodied navigation,\nNaviLLM. It adapts LLMs to embodied navigation by introducing schema-based\ninstruction. The schema-based instruction flexibly casts various tasks into\ngeneration problems, thereby unifying a wide range of tasks. This approach\nallows us to integrate diverse data sources from various datasets into the\ntraining, equipping NaviLLM with a wide range of capabilities required by\nembodied navigation. We conduct extensive experiments to evaluate the\nperformance and generalizability of our model. The experimental results\ndemonstrate that our unified model achieves state-of-the-art performance on\nCVDN, SOON, and ScanQA. Specifically, it surpasses the previous\nstats-of-the-art method by a significant margin of 29% in goal progress on\nCVDN. Moreover, our model also demonstrates strong generalizability and\npresents impressive results on unseen tasks, e.g., embodied question answering\nand 3D captioning.\n","authors":["Duo Zheng","Shijia huang","Lin Zhao","Yiwu Zhong","Liwei Wang"],"pdf_url":"https://arxiv.org/pdf/2312.02010v1.pdf","comment":"13 pages, 3 figures"},{"id":"http://arxiv.org/abs/2306.00783v2","updated":"2023-12-04T16:25:49Z","published":"2023-06-01T15:14:39Z","title":"FaceDNeRF: Semantics-Driven Face Reconstruction, Prompt Editing and\n Relighting with Diffusion Models","summary":" The ability to create high-quality 3D faces from a single image has become\nincreasingly important with wide applications in video conferencing, AR/VR, and\nadvanced video editing in movie industries. In this paper, we propose Face\nDiffusion NeRF (FaceDNeRF), a new generative method to reconstruct high-quality\nFace NeRFs from single images, complete with semantic editing and relighting\ncapabilities. FaceDNeRF utilizes high-resolution 3D GAN inversion and expertly\ntrained 2D latent-diffusion model, allowing users to manipulate and construct\nFace NeRFs in zero-shot learning without the need for explicit 3D data. 
With\ncarefully designed illumination and identity preserving loss, as well as\nmulti-modal pre-training, FaceDNeRF offers users unparalleled control over the\nediting process enabling them to create and edit face NeRFs using just\nsingle-view images, text prompts, and explicit target lighting. The advanced\nfeatures of FaceDNeRF have been designed to produce more impressive results\nthan existing 2D editing approaches that rely on 2D segmentation maps for\neditable attributes. Experiments show that our FaceDNeRF achieves exceptionally\nrealistic results and unprecedented flexibility in editing compared with\nstate-of-the-art 3D face reconstruction and editing methods. Our code will be\navailable at https://github.com/BillyXYB/FaceDNeRF.\n","authors":["Hao Zhang","Yanbo Xu","Tianyuan Dai","Yu-Wing Tai","Chi-Keung Tang"],"pdf_url":"https://arxiv.org/pdf/2306.00783v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01999v1","updated":"2023-12-04T16:22:39Z","published":"2023-12-04T16:22:39Z","title":"SRTransGAN: Image Super-Resolution using Transformer based Generative\n Adversarial Network","summary":" Image super-resolution aims to synthesize high-resolution image from a\nlow-resolution image. It is an active area to overcome the resolution\nlimitations in several applications like low-resolution object-recognition,\nmedical image enhancement, etc. The generative adversarial network (GAN) based\nmethods have been the state-of-the-art for image super-resolution by utilizing\nthe convolutional neural networks (CNNs) based generator and discriminator\nnetworks. However, the CNNs are not able to exploit the global information very\neffectively in contrast to the transformers, which are the recent breakthrough\nin deep learning by exploiting the self-attention mechanism. Motivated from the\nsuccess of transformers in language and vision applications, we propose a\nSRTransGAN for image super-resolution using transformer based GAN.\nSpecifically, we propose a novel transformer-based encoder-decoder network as a\ngenerator to generate 2x images and 4x images. We design the discriminator\nnetwork using vision transformer which uses the image as sequence of patches\nand hence useful for binary classification between synthesized and real\nhigh-resolution images. The proposed SRTransGAN outperforms the existing\nmethods by 4.38 % on an average of PSNR and SSIM scores. We also analyze the\nsaliency map to understand the learning ability of the proposed method.\n","authors":["Neeraj Baghel","Shiv Ram Dubey","Satish Kumar Singh"],"pdf_url":"https://arxiv.org/pdf/2312.01999v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01998v1","updated":"2023-12-04T16:22:06Z","published":"2023-12-04T16:22:06Z","title":"Language-only Efficient Training of Zero-shot Composed Image Retrieval","summary":" Composed image retrieval (CIR) task takes a composed query of image and text,\naiming to search relative images for both conditions. Conventional CIR\napproaches need a training dataset composed of triplets of query image, query\ntext, and target image, which is very expensive to collect. Several recent\nworks have worked on the zero-shot (ZS) CIR paradigm to tackle the issue\nwithout using pre-collected triplets. However, the existing ZS-CIR methods show\nlimited backbone scalability and generalizability due to the lack of diversity\nof the input texts during training. We propose a novel CIR framework, only\nusing language for its training. 
Our LinCIR (Language-only training for CIR)\ncan be trained only with text datasets by a novel self-supervision named\nself-masking projection (SMP). We project the text latent embedding to the\ntoken embedding space and construct a new text by replacing the keyword tokens\nof the original text. Then, we let the new and original texts have the same\nlatent embedding vector. With this simple strategy, LinCIR is surprisingly\nefficient and highly effective; LinCIR with CLIP ViT-G backbone is trained in\n48 minutes and shows the best ZS-CIR performances on four different CIR\nbenchmarks, CIRCO, GeneCIS, FashionIQ, and CIRR, even outperforming supervised\nmethod on FashionIQ. Code is available at https://github.com/navervision/lincir\n","authors":["Geonmo Gu","Sanghyuk Chun","Wonjae Kim","Yoohoon Kang","Sangdoo Yun"],"pdf_url":"https://arxiv.org/pdf/2312.01998v1.pdf","comment":"First two authors contributed equally; 16 pages, 2.9MB"},{"id":"http://arxiv.org/abs/2312.01994v1","updated":"2023-12-04T16:14:43Z","published":"2023-12-04T16:14:43Z","title":"A Generative Self-Supervised Framework using Functional Connectivity in\n fMRI Data","summary":" Deep neural networks trained on Functional Connectivity (FC) networks\nextracted from functional Magnetic Resonance Imaging (fMRI) data have gained\npopularity due to the increasing availability of data and advances in model\narchitectures, including Graph Neural Network (GNN). Recent research on the\napplication of GNN to FC suggests that exploiting the time-varying properties\nof the FC could significantly improve the accuracy and interpretability of the\nmodel prediction. However, the high cost of acquiring high-quality fMRI data\nand corresponding phenotypic labels poses a hurdle to their application in\nreal-world settings, such that a model na\\\"ively trained in a supervised\nfashion can suffer from insufficient performance or a lack of generalization on\na small number of data. In addition, most Self-Supervised Learning (SSL)\napproaches for GNNs to date adopt a contrastive strategy, which tends to lose\nappropriate semantic information when the graph structure is perturbed or does\nnot leverage both spatial and temporal information simultaneously. In light of\nthese challenges, we propose a generative SSL approach that is tailored to\neffectively harness spatio-temporal information within dynamic FC. Our\nempirical results, experimented with large-scale (>50,000) fMRI datasets,\ndemonstrate that our approach learns valuable representations and enables the\nconstruction of accurate and robust models when fine-tuned for downstream\ntasks.\n","authors":["Jungwon Choi","Seongho Keum","EungGu Yun","Byung-Hoon Kim","Juho Lee"],"pdf_url":"https://arxiv.org/pdf/2312.01994v1.pdf","comment":"NeurIPS 2023 Temporal Graph Learning Workshop"},{"id":"http://arxiv.org/abs/2311.18803v2","updated":"2023-12-04T16:13:21Z","published":"2023-11-30T18:49:43Z","title":"BioCLIP: A Vision Foundation Model for the Tree of Life","summary":" Images of the natural world, collected by a variety of cameras, from drones\nto individual phones, are increasingly abundant sources of biological\ninformation. There is an explosion of computational methods and tools,\nparticularly computer vision, for extracting biologically relevant information\nfrom images for science and conservation. Yet most of these are bespoke\napproaches designed for a specific task and are not easily adaptable or\nextendable to new questions, contexts, and datasets. 
A vision model for general\norganismal biology questions on images is of timely need. To approach this, we\ncurate and release TreeOfLife-10M, the largest and most diverse ML-ready\ndataset of biology images. We then develop BioCLIP, a foundation model for the\ntree of life, leveraging the unique properties of biology captured by\nTreeOfLife-10M, namely the abundance and variety of images of plants, animals,\nand fungi, together with the availability of rich structured biological\nknowledge. We rigorously benchmark our approach on diverse fine-grained biology\nclassification tasks, and find that BioCLIP consistently and substantially\noutperforms existing baselines (by 17% to 20% absolute). Intrinsic evaluation\nreveals that BioCLIP has learned a hierarchical representation conforming to\nthe tree of life, shedding light on its strong generalizability. Our code,\nmodels and data will be made available at\nhttps://github.com/Imageomics/bioclip.\n","authors":["Samuel Stevens","Jiaman Wu","Matthew J Thompson","Elizabeth G Campolongo","Chan Hee Song","David Edward Carlyn","Li Dong","Wasila M Dahdul","Charles Stewart","Tanya Berger-Wolf","Wei-Lun Chao","Yu Su"],"pdf_url":"https://arxiv.org/pdf/2311.18803v2.pdf","comment":"18 pages; updated title"},{"id":"http://arxiv.org/abs/2312.01987v1","updated":"2023-12-04T16:04:41Z","published":"2023-12-04T16:04:41Z","title":"Bootstrapping SparseFormers from Vision Foundation Models","summary":" The recently proposed SparseFormer architecture provides an alternative\napproach to visual understanding by utilizing a significantly lower number of\nvisual tokens via adjusting RoIs, greatly reducing computational costs while\nstill achieving promising performance. However, training SparseFormers from\nscratch is still expensive, and scaling up the number of parameters can be\nchallenging. In this paper, we propose to bootstrap SparseFormers from\nViT-based vision foundation models in a simple and efficient way. Since the\nmajority of SparseFormer blocks are the standard transformer ones, we can\ninherit weights from large-scale pre-trained vision transformers and freeze\nthem as much as possible. Therefore, we only need to train the\nSparseFormer-specific lightweight focusing transformer to adjust token RoIs and\nfine-tune a few early pre-trained blocks to align the final token\nrepresentation. In such a way, we can bootstrap SparseFormer architectures from\nvarious large-scale pre-trained models (e.g., IN-21K pre-trained AugRegs or\nCLIPs) using a rather smaller amount of training samples (e.g., IN-1K) and\nwithout labels or captions within just a few hours. As a result, the\nbootstrapped unimodal SparseFormer (from AugReg-ViT-L/16-384) can reach 84.9%\naccuracy on IN-1K with only 49 tokens, and the multimodal SparseFormer from\nCLIPs also demonstrates notable zero-shot performance with highly reduced\ncomputational cost without seeing any caption during the bootstrapping\nprocedure. In addition, CLIP-bootstrapped SparseFormers, which align the output\nspace with language without seeing a word, can serve as efficient vision\nencoders in multimodal large language models. 
Code will be publicly available\nat https://github.com/showlab/sparseformer\n","authors":["Ziteng Gao","Zhan Tong","Kevin Qinghong Lin","Joya Chen","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2312.01987v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2312.01985v1","updated":"2023-12-04T15:59:27Z","published":"2023-12-04T15:59:27Z","title":"UniGS: Unified Representation for Image Generation and Segmentation","summary":" This paper introduces a novel unified representation of diffusion models for\nimage generation and segmentation. Specifically, we use a colormap to represent\nentity-level masks, addressing the challenge of varying entity numbers while\naligning the representation closely with the image RGB domain. Two novel\nmodules, including the location-aware color palette and progressive dichotomy\nmodule, are proposed to support our mask representation. On the one hand, a\nlocation-aware palette guarantees the colors' consistency to entities'\nlocations. On the other hand, the progressive dichotomy module can efficiently\ndecode the synthesized colormap to high-quality entity-level masks in a\ndepth-first binary search without knowing the cluster numbers. To tackle the\nissue of lacking large-scale segmentation training data, we employ an\ninpainting pipeline and then improve the flexibility of diffusion models across\nvarious tasks, including inpainting, image synthesis, referring segmentation,\nand entity segmentation. Comprehensive experiments validate the efficiency of\nour approach, demonstrating comparable segmentation mask quality to\nstate-of-the-art and adaptability to multiple tasks. The code will be released\nat \\href{https://github.com/qqlu/Entity}{https://github.com/qqlu/Entity}.\n","authors":["Lu Qi","Lehan Yang","Weidong Guo","Yu Xu","Bo Du","Varun Jampani","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2312.01985v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01964v1","updated":"2023-12-04T15:23:49Z","published":"2023-12-04T15:23:49Z","title":"Semantics-aware Motion Retargeting with Vision-Language Models","summary":" Capturing and preserving motion semantics is essential to motion retargeting\nbetween animation characters. However, most of the previous works neglect the\nsemantic information or rely on human-designed joint-level representations.\nHere, we present a novel Semantics-aware Motion reTargeting (SMT) method with\nthe advantage of vision-language models to extract and maintain meaningful\nmotion semantics. We utilize a differentiable module to render 3D motions. Then\nthe high-level motion semantics are incorporated into the motion retargeting\nprocess by feeding the vision-language model with the rendered images and\naligning the extracted semantic embeddings. To ensure the preservation of\nfine-grained motion details and high-level semantics, we adopt a two-stage\npipeline consisting of skeleton-aware pre-training and fine-tuning with\nsemantics and geometry constraints. Experimental results show the effectiveness\nof the proposed method in producing high-quality motion retargeting results\nwhile accurately preserving motion semantics. 
Project page can be found at\nhttps://sites.google.com/view/smtnet.\n","authors":["Haodong Zhang","ZhiKe Chen","Haocheng Xu","Lei Hao","Xiaofei Wu","Songcen Xu","Zhensong Zhang","Yue Wang","Rong Xiong"],"pdf_url":"https://arxiv.org/pdf/2312.01964v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.16267v3","updated":"2023-12-04T15:21:32Z","published":"2023-10-25T00:46:26Z","title":"Student Classroom Behavior Detection based on Spatio-Temporal Network\n and Multi-Model Fusion","summary":" Using deep learning methods to detect students' classroom behavior\nautomatically is a promising approach for analyzing their class performance and\nimproving teaching effectiveness. However, the lack of publicly available\nspatio-temporal datasets on student behavior, as well as the high cost of\nmanually labeling such datasets, pose significant challenges for researchers in\nthis field. To address this issue, we proposed a method for extending the\nspatio-temporal behavior dataset in Student Classroom Scenarios\n(SCB-ST-Dataset4) through image dataset. Our SCB-ST-Dataset4 comprises 757265\nimages with 25810 labels, focusing on 3 behaviors: hand-raising, reading,\nwriting. Our proposed method can rapidly generate spatio-temporal behavior\ndatasets without requiring extra manual labeling. Furthermore, we proposed a\nBehavior Similarity Index (BSI) to explore the similarity of behaviors. We\nevaluated the dataset using the YOLOv5, YOLOv7, YOLOv8, and SlowFast\nalgorithms, achieving a mean average precision (map) of up to 82.3%. Last, we\nfused multiple models to generate student behavior-related data from various\nperspectives. The experiment further demonstrates the effectiveness of our\nmethod. And SCB-ST-Dataset4 provides a robust foundation for future research in\nstudent behavior detection, potentially contributing to advancements in this\nfield. The SCB-ST-Dataset4 is available for download at:\nhttps://github.com/Whiffe/SCB-dataset.\n","authors":["Fan Yang","Xiaofei Wang"],"pdf_url":"https://arxiv.org/pdf/2310.16267v3.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2310.02522;\n text overlap with arXiv:2306.03318"},{"id":"http://arxiv.org/abs/2304.06433v2","updated":"2023-12-04T15:07:49Z","published":"2023-04-13T12:02:40Z","title":"High-Fidelity Zero-Shot Texture Anomaly Localization Using Feature\n Correspondence Analysis","summary":" We propose a novel method for Zero-Shot Anomaly Localization on textures. The\ntask refers to identifying abnormal regions in an otherwise homogeneous image.\nTo obtain a high-fidelity localization, we leverage a bijective mapping derived\nfrom the 1-dimensional Wasserstein Distance. As opposed to using holistic\ndistances between distributions, the proposed approach allows pinpointing the\nnon-conformity of a pixel in a local context with increased precision. By\naggregating the contribution of the pixel to the errors of all nearby patches\nwe obtain a reliable anomaly score estimate. We validate our solution on\nseveral datasets and obtain more than a 40% reduction in error over the\nprevious state of the art on the MVTec AD dataset in a zero-shot setting. 
Also\nsee https://reality.tf.fau.de/pub/ardelean2024highfidelity.html.\n","authors":["Andrei-Timotei Ardelean","Tim Weyrich"],"pdf_url":"https://arxiv.org/pdf/2304.06433v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01943v1","updated":"2023-12-04T15:00:15Z","published":"2023-12-04T15:00:15Z","title":"Instance-guided Cartoon Editing with a Large-scale Dataset","summary":" Cartoon editing, appreciated by both professional illustrators and hobbyists,\nallows extensive creative freedom and the development of original narratives\nwithin the cartoon domain. However, the existing literature on cartoon editing\nis complex and leans heavily on manual operations, owing to the challenge of\nautomatic identification of individual character instances. Therefore, an\nautomated segmentation of these elements becomes imperative to facilitate a\nvariety of cartoon editing applications such as visual style editing, motion\ndecomposition and transfer, and the computation of stereoscopic depths for an\nenriched visual experience. Unfortunately, most current segmentation methods\nare designed for natural photographs, failing to recognize from the intricate\naesthetics of cartoon subjects, thus lowering segmentation quality. The major\nchallenge stems from two key shortcomings: the rarity of high-quality cartoon\ndedicated datasets and the absence of competent models for high-resolution\ninstance extraction on cartoons. To address this, we introduce a high-quality\ndataset of over 100k paired high-resolution cartoon images and their instance\nlabeling masks. We also present an instance-aware image segmentation model that\ncan generate accurate, high-resolution segmentation masks for characters in\ncartoon images. We present that the proposed approach enables a range of\nsegmentation-dependent cartoon editing applications like 3D Ken Burns parallax\neffects, text-guided cartoon style editing, and puppet animation from\nillustrations and manga.\n","authors":["Jian Lin","Chengze Li","Xueting Liu","Zhongping Ge"],"pdf_url":"https://arxiv.org/pdf/2312.01943v1.pdf","comment":"Project page: https://cartoonsegmentation.github.io/ 10 pages, 10\n figures"},{"id":"http://arxiv.org/abs/2308.14831v2","updated":"2023-12-04T14:52:08Z","published":"2023-08-28T18:31:09Z","title":"Continual Learning with Dynamic Sparse Training: Exploring Algorithms\n for Effective Model Updates","summary":" Continual learning (CL) refers to the ability of an intelligent system to\nsequentially acquire and retain knowledge from a stream of data with as little\ncomputational overhead as possible. To this end; regularization, replay,\narchitecture, and parameter isolation approaches were introduced to the\nliterature. Parameter isolation using a sparse network which enables to\nallocate distinct parts of the neural network to different tasks and also\nallows to share of parameters between tasks if they are similar. Dynamic Sparse\nTraining (DST) is a prominent way to find these sparse networks and isolate\nthem for each task. This paper is the first empirical study investigating the\neffect of different DST components under the CL paradigm to fill a critical\nresearch gap and shed light on the optimal configuration of DST for CL if it\nexists. 
Therefore, we perform a comprehensive study in which we investigate\nvarious DST components to find the best topology per task on well-known\nCIFAR100 and miniImageNet benchmarks in a task-incremental CL setup since our\nprimary focus is to evaluate the performance of various DST criteria, rather\nthan the process of mask selection. We found that, at a low sparsity level,\nErdos-R\\'enyi Kernel (ERK) initialization utilizes the backbone more\nefficiently and allows to effectively learn increments of tasks. At a high\nsparsity level, unless it is extreme, uniform initialization demonstrates a\nmore reliable and robust performance. In terms of growth strategy; performance\nis dependent on the defined initialization strategy and the extent of sparsity.\nFinally, adaptivity within DST components is a promising way for better\ncontinual learners.\n","authors":["Murat Onur Yildirim","Elif Ceren Gok Yildirim","Ghada Sokar","Decebal Constantin Mocanu","Joaquin Vanschoren"],"pdf_url":"https://arxiv.org/pdf/2308.14831v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01919v1","updated":"2023-12-04T14:23:18Z","published":"2023-12-04T14:23:18Z","title":"COTR: Compact Occupancy TRansformer for Vision-based 3D Occupancy\n Prediction","summary":" The autonomous driving community has shown significant interest in 3D\noccupancy prediction, driven by its exceptional geometric perception and\ngeneral object recognition capabilities. To achieve this, current works try to\nconstruct a Tri-Perspective View (TPV) or Occupancy (OCC) representation\nextending from the Bird-Eye-View perception. However, compressed views like TPV\nrepresentation lose 3D geometry information while raw and sparse OCC\nrepresentation requires heavy but reducant computational costs. To address the\nabove limitations, we propose Compact Occupancy TRansformer (COTR), with a\ngeometry-aware occupancy encoder and a semantic-aware group decoder to\nreconstruct a compact 3D OCC representation. The occupancy encoder first\ngenerates a compact geometrical OCC feature through efficient explicit-implicit\nview transformation. Then, the occupancy decoder further enhances the semantic\ndiscriminability of the compact OCC representation by a coarse-to-fine semantic\ngrouping strategy. Empirical experiments show that there are evident\nperformance gains across multiple baselines, e.g., COTR outperforms baselines\nwith a relative improvement of 8%-15%, demonstrating the superiority of our\nmethod.\n","authors":["Qihang Ma","Xin Tan","Yanyun Qu","Lizhuang Ma","Zhizhong Zhang","Yuan Xie"],"pdf_url":"https://arxiv.org/pdf/2312.01919v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01915v1","updated":"2023-12-04T14:19:36Z","published":"2023-12-04T14:19:36Z","title":"A Reliable Representation with Bidirectional Transition Model for Visual\n Reinforcement Learning Generalization","summary":" Visual reinforcement learning has proven effective in solving control tasks\nwith high-dimensional observations. However, extracting reliable and\ngeneralizable representations from vision-based observations remains a central\nchallenge. Inspired by the human thought process, when the representation\nextracted from the observation can predict the future and trace history, the\nrepresentation is reliable and accurate in comprehending the environment. 
Based\non this concept, we introduce a Bidirectional Transition (BiT) model, which\nleverages the ability to bidirectionally predict environmental transitions both\nforward and backward to extract reliable representations. Our model\ndemonstrates competitive generalization performance and sample efficiency on\ntwo settings of the DeepMind Control suite. Additionally, we utilize robotic\nmanipulation and CARLA simulators to demonstrate the wide applicability of our\nmethod.\n","authors":["Xiaobo Hu","Youfang Lin","Yue Liu","Jinwen Wang","Shuo Wang","Hehe Fan","Kai Lv"],"pdf_url":"https://arxiv.org/pdf/2312.01915v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09909v3","updated":"2023-12-04T14:13:35Z","published":"2023-10-15T18:32:27Z","title":"Can GPT-4V(ision) Serve Medical Applications? Case Studies on GPT-4V for\n Multimodal Medical Diagnosis","summary":" Driven by the large foundation models, the development of artificial\nintelligence has witnessed tremendous progress lately, leading to a surge of\ngeneral interest from the public. In this study, we aim to assess the\nperformance of OpenAI's newest model, GPT-4V(ision), specifically in the realm\nof multimodal medical diagnosis. Our evaluation encompasses 17 human body\nsystems, including Central Nervous System, Head and Neck, Cardiac, Chest,\nHematology, Hepatobiliary, Gastrointestinal, Urogenital, Gynecology,\nObstetrics, Breast, Musculoskeletal, Spine, Vascular, Oncology, Trauma,\nPediatrics, with images taken from 8 modalities used in daily clinic routine,\ne.g., X-ray, Computed Tomography (CT), Magnetic Resonance Imaging (MRI),\nPositron Emission Tomography (PET), Digital Subtraction Angiography (DSA),\nMammography, Ultrasound, and Pathology. We probe the GPT-4V's ability on\nmultiple clinical tasks with or without patent history provided, including\nimaging modality and anatomy recognition, disease diagnosis, report generation,\ndisease localisation.\n Our observation shows that, while GPT-4V demonstrates proficiency in\ndistinguishing between medical image modalities and anatomy, it faces\nsignificant challenges in disease diagnosis and generating comprehensive\nreports. These findings underscore that while large multimodal models have made\nsignificant advancements in computer vision and natural language processing, it\nremains far from being used to effectively support real-world medical\napplications and clinical decision-making.\n All images used in this report can be found in\nhttps://github.com/chaoyi-wu/GPT-4V_Medical_Evaluation.\n","authors":["Chaoyi Wu","Jiayu Lei","Qiaoyu Zheng","Weike Zhao","Weixiong Lin","Xiaoman Zhang","Xiao Zhou","Ziheng Zhao","Ya Zhang","Yanfeng Wang","Weidi Xie"],"pdf_url":"https://arxiv.org/pdf/2310.09909v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01904v1","updated":"2023-12-04T14:02:56Z","published":"2023-12-04T14:02:56Z","title":"Unsupervised Anomaly Detection using Aggregated Normative Diffusion","summary":" Early detection of anomalies in medical images such as brain MRI is highly\nrelevant for diagnosis and treatment of many conditions. Supervised machine\nlearning methods are limited to a small number of pathologies where there is\ngood availability of labeled data. In contrast, unsupervised anomaly detection\n(UAD) has the potential to identify a broader spectrum of anomalies by spotting\ndeviations from normal patterns. 
Our research demonstrates that existing\nstate-of-the-art UAD approaches do not generalise well to diverse types of\nanomalies in realistic multi-modal MR data. To overcome this, we introduce a\nnew UAD method named Aggregated Normative Diffusion (ANDi). ANDi operates by\naggregating differences between predicted denoising steps and ground truth\nbackwards transitions in Denoising Diffusion Probabilistic Models (DDPMs) that\nhave been trained on pyramidal Gaussian noise. We validate ANDi against three\nrecent UAD baselines, and across three diverse brain MRI datasets. We show that\nANDi, in some cases, substantially surpasses these baselines and shows\nincreased robustness to varying types of anomalies. Particularly in detecting\nmultiple sclerosis (MS) lesions, ANDi achieves improvements of up to 178% in\nterms of AUPRC.\n","authors":["Alexander Frotscher","Jaivardhan Kapoor","Thomas Wolfers","Christian F. Baumgartner"],"pdf_url":"https://arxiv.org/pdf/2312.01904v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01897v1","updated":"2023-12-04T13:51:16Z","published":"2023-12-04T13:51:16Z","title":"Adapting Short-Term Transformers for Action Detection in Untrimmed\n Videos","summary":" Vision transformer (ViT) has shown high potential in video recognition, owing\nto its flexible design, adaptable self-attention mechanisms, and the efficacy\nof masked pre-training. Yet, it still remains unclear how to adapt these\npre-trained short-term ViTs for temporal action detection (TAD) in untrimmed\nvideos. The existing works treat them as off-the-shelf feature extractors for\neach short trimmed snippet without capturing the fine-grained relation among\ndifferent snippets in a broader temporal context. To mitigate this issue, this\npaper focuses on designing a new mechanism for adapting these pre-trained ViT\nmodels as a unified long-form video transformer to fully unleash its modeling\npower in capturing inter-snippet relation, while still keeping low computation\noverhead and memory consumption for efficient TAD. To this end, we design\neffective cross-snippet propagation modules to gradually exchange short-term\nvideo information among different snippets from two levels. For inner-backbone\ninformation propagation, we introduce a cross-snippet propagation strategy to\nenable multi-snippet temporal feature interaction inside the backbone. For\npost-backbone information propagation, we propose temporal transformer layers\nfor further clip-level modeling. With the plain ViT-B pre-trained with\nVideoMAE, our end-to-end temporal action detector (ViT-TAD) yields a very\ncompetitive performance to previous temporal action detectors, riching up to\n69.0 average mAP on THUMOS14, 37.12 average mAP on ActivityNet-1.3 and 17.20\naverage mAP on FineAction.\n","authors":["Min Yang","Huan Gao","Ping Guo","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2312.01897v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2312.01886v1","updated":"2023-12-04T13:40:05Z","published":"2023-12-04T13:40:05Z","title":"InstructTA: Instruction-Tuned Targeted Attack for Large Vision-Language\n Models","summary":" Large vision-language models (LVLMs) have demonstrated their incredible\ncapability in image understanding and response generation. However, this rich\nvisual interaction also makes LVLMs vulnerable to adversarial examples. 
In this\npaper, we formulate a novel and practical gray-box attack scenario that the\nadversary can only access the visual encoder of the victim LVLM, without the\nknowledge of its prompts (which are often proprietary for service providers and\nnot publicly available) and its underlying large language model (LLM). This\npractical setting poses challenges to the cross-prompt and cross-model\ntransferability of targeted adversarial attack, which aims to confuse the LVLM\nto output a response that is semantically similar to the attacker's chosen\ntarget text. To this end, we propose an instruction-tuned targeted attack\n(dubbed InstructTA) to deliver the targeted adversarial attack on LVLMs with\nhigh transferability. Initially, we utilize a public text-to-image generative\nmodel to \"reverse\" the target response into a target image, and employ GPT-4 to\ninfer a reasonable instruction $\\boldsymbol{p}^\\prime$ from the target\nresponse. We then form a local surrogate model (sharing the same visual encoder\nwith the victim LVLM) to extract instruction-aware features of an adversarial\nimage example and the target image, and minimize the distance between these two\nfeatures to optimize the adversarial example. To further improve the\ntransferability, we augment the instruction $\\boldsymbol{p}^\\prime$ with\ninstructions paraphrased from an LLM. Extensive experiments demonstrate the\nsuperiority of our proposed method in targeted attack performance and\ntransferability.\n","authors":["Xunguang Wang","Zhenlan Ji","Pingchuan Ma","Zongjie Li","Shuai Wang"],"pdf_url":"https://arxiv.org/pdf/2312.01886v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.13925v2","updated":"2023-12-04T13:34:01Z","published":"2023-09-25T07:46:56Z","title":"Towards Surveillance Video-and-Language Understanding: New Dataset,\n Baselines, and Challenges","summary":" Surveillance videos are an essential component of daily life with various\ncritical applications, particularly in public security. However, current\nsurveillance video tasks mainly focus on classifying and localizing anomalous\nevents. Existing methods are limited to detecting and classifying the\npredefined events with unsatisfactory semantic understanding, although they\nhave obtained considerable performance. To address this issue, we propose a new\nresearch direction of surveillance video-and-language understanding, and\nconstruct the first multimodal surveillance video dataset. We manually annotate\nthe real-world surveillance dataset UCF-Crime with fine-grained event content\nand timing. Our newly annotated dataset, UCA (UCF-Crime Annotation), contains\n23,542 sentences, with an average length of 20 words, and its annotated videos\nare as long as 110.7 hours. Furthermore, we benchmark SOTA models for four\nmultimodal tasks on this newly created dataset, which serve as new baselines\nfor surveillance video-and-language understanding. Through our experiments, we\nfind that mainstream models used in previously publicly available datasets\nperform poorly on surveillance video, which demonstrates the new challenges in\nsurveillance video-and-language understanding. To validate the effectiveness of\nour UCA, we conducted experiments on multimodal anomaly detection. The results\ndemonstrate that our multimodal surveillance learning can improve the\nperformance of conventional anomaly detection tasks. All the experiments\nhighlight the necessity of constructing this dataset to advance surveillance\nAI. 
The link to our dataset is provided at:\nhttps://xuange923.github.io/Surveillance-Video-Understanding.\n","authors":["Tongtong Yuan","Xuange Zhang","Kun Liu","Bo Liu","Chen Chen","Jian Jin","Zhenzhen Jiao"],"pdf_url":"https://arxiv.org/pdf/2309.13925v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01882v1","updated":"2023-12-04T13:25:16Z","published":"2023-12-04T13:25:16Z","title":"Unleashing the Potential of Large Language Model: Zero-shot VQA for\n Flood Disaster Scenario","summary":" Visual question answering (VQA) is a fundamental and essential AI task, and\nVQA-based disaster scenario understanding is a hot research topic. For\ninstance, we can ask questions about a disaster image by the VQA model and the\nanswer can help identify whether anyone or anything is affected by the\ndisaster. However, previous VQA models for disaster damage assessment have some\nshortcomings, such as limited candidate answer space, monotonous question\ntypes, and limited answering capability of existing models. In this paper, we\npropose a zero-shot VQA model named Zero-shot VQA for Flood Disaster Damage\nAssessment (ZFDDA). It is a VQA model for damage assessment without\npre-training. Also, with flood disaster as the main research object, we build a\nFreestyle Flood Disaster Image Question Answering dataset (FFD-IQA) to evaluate\nour VQA model. This new dataset expands the question types to include\nfree-form, multiple-choice, and yes-no questions. At the same time, we expand\nthe size of the previous dataset to contain a total of 2,058 images and 22,422\nquestion-meta ground truth pairs. Most importantly, our model uses\nwell-designed chain of thought (CoT) demonstrations to unlock the potential of\nthe large language model, allowing zero-shot VQA to show better performance in\ndisaster scenarios. The experimental results show that the accuracy in\nanswering complex questions is greatly improved with CoT prompts. Our study\nprovides a research basis for subsequent research of VQA for other disaster\nscenarios.\n","authors":["Yimin Sun","Chao Wang","Yan Peng"],"pdf_url":"https://arxiv.org/pdf/2312.01882v1.pdf","comment":"accepted by The 4th International Conference on Artificial\n Intelligence and Computer Engineering"},{"id":"http://arxiv.org/abs/2312.01871v1","updated":"2023-12-04T13:09:00Z","published":"2023-12-04T13:09:00Z","title":"FeaInfNet: Diagnosis in Medical Image with Feature-Driven Inference and\n Visual Explanations","summary":" Interpretable deep learning models have received widespread attention in the\nfield of image recognition. Due to the unique multi-instance learning of\nmedical images and the difficulty in identifying decision-making regions, many\ninterpretability models that have been proposed still have problems of\ninsufficient accuracy and interpretability in medical image disease diagnosis.\nTo solve these problems, we propose feature-driven inference network\n(FeaInfNet). Our first key innovation involves proposing a feature-based\nnetwork reasoning structure, which is applied to FeaInfNet. The network of this\nstructure compares the similarity of each sub-region image patch with the\ndisease templates and normal templates that may appear in the region, and\nfinally combines the comparison of each sub-region to make the final diagnosis.\nIt simulates the diagnosis process of doctors to make the model interpretable\nin the reasoning process, while avoiding the misleading caused by the\nparticipation of normal areas in reasoning. 
Secondly, we propose local feature\nmasks (LFM) to extract feature vectors in order to provide global information\nfor these vectors, thus enhancing the expressive ability of the FeaInfNet.\nFinally, we propose adaptive dynamic masks (Adaptive-DM) to interpret feature\nvectors and prototypes into human-understandable image patches to provide\naccurate visual interpretation. We conducted qualitative and quantitative\nexperiments on multiple publicly available medical datasets, including RSNA,\niChallenge-PM, Covid-19, ChinaCXRSet, and MontgomerySet. The results of our\nexperiments validate that our method achieves state-of-the-art performance in\nterms of classification accuracy and interpretability compared to baseline\nmethods in medical image diagnosis. Additional ablation studies verify the\neffectiveness of each of our proposed components.\n","authors":["Yitao Peng","Lianghua He","Die Hu","Yihang Liu","Longzhen Yang","Shaohua Shang"],"pdf_url":"https://arxiv.org/pdf/2312.01871v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01860v1","updated":"2023-12-04T12:48:44Z","published":"2023-12-04T12:48:44Z","title":"Unveiling Objects with SOLA: An Annotation-Free Image Search on the\n Object Level for Automotive Data Sets","summary":" Huge image data sets are the fundament for the development of the perception\nof automated driving systems. A large number of images is necessary to train\nrobust neural networks that can cope with diverse situations. A sufficiently\nlarge data set contains challenging situations and objects. For testing the\nresulting functions, it is necessary that these situations and objects can be\nfound and extracted from the data set. While it is relatively easy to record a\nlarge amount of unlabeled data, it is far more difficult to find demanding\nsituations and objects. However, during the development of perception systems,\nit must be possible to access challenging data without having to perform\nlengthy and time-consuming annotations. A developer must therefore be able to\nsearch dynamically for specific situations and objects in a data set. Thus, we\ndesigned a method which is based on state-of-the-art neural networks to search\nfor objects with certain properties within an image. For the ease of use, the\nquery of this search is described using natural language. To determine the time\nsavings and performance gains, we evaluated our method qualitatively and\nquantitatively on automotive data sets.\n","authors":["Philipp Rigoll","Jacob Langner","Eric Sax"],"pdf_url":"https://arxiv.org/pdf/2312.01860v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01853v1","updated":"2023-12-04T12:35:43Z","published":"2023-12-04T12:35:43Z","title":"Robot Synesthesia: In-Hand Manipulation with Visuotactile Sensing","summary":" Executing contact-rich manipulation tasks necessitates the fusion of tactile\nand visual feedback. However, the distinct nature of these modalities poses\nsignificant challenges. In this paper, we introduce a system that leverages\nvisual and tactile sensory inputs to enable dexterous in-hand manipulation.\nSpecifically, we propose Robot Synesthesia, a novel point cloud-based tactile\nrepresentation inspired by human tactile-visual synesthesia. This approach\nallows for the simultaneous and seamless integration of both sensory inputs,\noffering richer spatial information and facilitating better reasoning about\nrobot actions. 
The method, trained in a simulated environment and then deployed\nto a real robot, is applicable to various in-hand object rotation tasks.\nComprehensive ablations are performed on how the integration of vision and\ntouch can improve reinforcement learning and Sim2Real performance. Our project\npage is available at https://yingyuan0414.github.io/visuotactile/ .\n","authors":["Ying Yuan","Haichuan Che","Yuzhe Qin","Binghao Huang","Zhao-Heng Yin","Kang-Won Lee","Yi Wu","Soo-Chul Lim","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2312.01853v1.pdf","comment":"Project page: https://yingyuan0414.github.io/visuotactile/"},{"id":"http://arxiv.org/abs/2312.01850v1","updated":"2023-12-04T12:31:45Z","published":"2023-12-04T12:31:45Z","title":"Generalization by Adaptation: Diffusion-Based Domain Extension for\n Domain-Generalized Semantic Segmentation","summary":" When models, e.g., for semantic segmentation, are applied to images that are\nvastly different from training data, the performance will drop significantly.\nDomain adaptation methods try to overcome this issue, but need samples from the\ntarget domain. However, this might not always be feasible for various reasons\nand therefore domain generalization methods are useful as they do not require\nany target data. We present a new diffusion-based domain extension (DIDEX)\nmethod and employ a diffusion model to generate a pseudo-target domain with\ndiverse text prompts. In contrast to existing methods, this allows to control\nthe style and content of the generated images and to introduce a high\ndiversity. In a second step, we train a generalizing model by adapting towards\nthis pseudo-target domain. We outperform previous approaches by a large margin\nacross various datasets and architectures without using any real data. For the\ngeneralization from GTA5, we improve state-of-the-art mIoU performance by 3.8%\nabsolute on average and for SYNTHIA by 11.8% absolute, marking a big step for\nthe generalization performance on these benchmarks. Code is available at\nhttps://github.com/JNiemeijer/DIDEX\n","authors":["Joshua Niemeijer","Manuel Schwonberg","Jan-Aike Termöhlen","Nico M. Schmidt","Tim Fingscheidt"],"pdf_url":"https://arxiv.org/pdf/2312.01850v1.pdf","comment":"Accepted to WACV 2024"},{"id":"http://arxiv.org/abs/2312.01841v1","updated":"2023-12-04T12:25:37Z","published":"2023-12-04T12:25:37Z","title":"VividTalk: One-Shot Audio-Driven Talking Head Generation Based on 3D\n Hybrid Prior","summary":" Audio-driven talking head generation has drawn much attention in recent\nyears, and many efforts have been made in lip-sync, expressive facial\nexpressions, natural head pose generation, and high video quality. However, no\nmodel has yet led or tied on all these metrics due to the one-to-many mapping\nbetween audio and motion. In this paper, we propose VividTalk, a two-stage\ngeneric framework that supports generating high-visual quality talking head\nvideos with all the above properties. Specifically, in the first stage, we map\nthe audio to mesh by learning two motions, including non-rigid expression\nmotion and rigid head motion. For expression motion, both blendshape and vertex\nare adopted as the intermediate representation to maximize the representation\nability of the model. For natural head motion, a novel learnable head pose\ncodebook with a two-phase training mechanism is proposed. 
In the second stage,\nwe proposed a dual branch motion-vae and a generator to transform the meshes\ninto dense motion and synthesize high-quality video frame-by-frame. Extensive\nexperiments show that the proposed VividTalk can generate high-visual quality\ntalking head videos with lip-sync and realistic enhanced by a large margin, and\noutperforms previous state-of-the-art works in objective and subjective\ncomparisons.\n","authors":["Xusen Sun","Longhao Zhang","Hao Zhu","Peng Zhang","Bang Zhang","Xinya Ji","Kangneng Zhou","Daiheng Gao","Liefeng Bo","Xun Cao"],"pdf_url":"https://arxiv.org/pdf/2312.01841v1.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2312.01835v1","updated":"2023-12-04T12:16:02Z","published":"2023-12-04T12:16:02Z","title":"Few Clicks Suffice: Active Test-Time Adaptation for Semantic\n Segmentation","summary":" Test-time adaptation (TTA) adapts the pre-trained models during inference\nusing unlabeled test data and has received a lot of research attention due to\nits potential practical value. Unfortunately, without any label supervision,\nexisting TTA methods rely heavily on heuristic or empirical studies. Where to\nupdate the model always falls into suboptimal or brings more computational\nresource consumption. Meanwhile, there is still a significant performance gap\nbetween the TTA approaches and their supervised counterparts. Motivated by\nactive learning, in this work, we propose the active test-time adaptation for\nsemantic segmentation setup. Specifically, we introduce the human-in-the-loop\npattern during the testing phase, which queries very few labels to facilitate\npredictions and model updates in an online manner. To do so, we propose a\nsimple but effective ATASeg framework, which consists of two parts, i.e., model\nadapter and label annotator. Extensive experiments demonstrate that ATASeg\nbridges the performance gap between TTA methods and their supervised\ncounterparts with only extremely few annotations, even one click for labeling\nsurpasses known SOTA TTA methods by 2.6% average mIoU on ACDC benchmark.\nEmpirical results imply that progress in either the model adapter or the label\nannotator will bring improvements to the ATASeg framework, giving it large\nresearch and reality potential.\n","authors":["Longhui Yuan","Shuang Li","Zhuo He","Binhui Xie"],"pdf_url":"https://arxiv.org/pdf/2312.01835v1.pdf","comment":"15 pages, 10 figures"},{"id":"http://arxiv.org/abs/2312.01831v1","updated":"2023-12-04T12:07:39Z","published":"2023-12-04T12:07:39Z","title":"Equivariant plug-and-play image reconstruction","summary":" Plug-and-play algorithms constitute a popular framework for solving inverse\nimaging problems that rely on the implicit definition of an image prior via a\ndenoiser. These algorithms can leverage powerful pre-trained denoisers to solve\na wide range of imaging tasks, circumventing the necessity to train models on a\nper-task basis. Unfortunately, plug-and-play methods often show unstable\nbehaviors, hampering their promise of versatility and leading to suboptimal\nquality of reconstructed images. In this work, we show that enforcing\nequivariance to certain groups of transformations (rotations, reflections,\nand/or translations) on the denoiser strongly improves the stability of the\nalgorithm as well as its reconstruction quality. We provide a theoretical\nanalysis that illustrates the role of equivariance on better performance and\nstability. 
We present a simple algorithm that enforces equivariance on any\nexisting denoiser by simply applying a random transformation to the input of\nthe denoiser and the inverse transformation to the output at each iteration of\nthe algorithm. Experiments on multiple imaging modalities and denoising\nnetworks show that the equivariant plug-and-play algorithm improves both the\nreconstruction performance and the stability compared to their non-equivariant\ncounterparts.\n","authors":["Matthieu Terris","Thomas Moreau","Nelly Pustelnik","Julian Tachella"],"pdf_url":"https://arxiv.org/pdf/2312.01831v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01800v1","updated":"2023-12-04T10:45:12Z","published":"2023-12-04T10:45:12Z","title":"Collaborative Neural Painting","summary":" The process of painting fosters creativity and rational planning. However,\nexisting generative AI mostly focuses on producing visually pleasant artworks,\nwithout emphasizing the painting process. We introduce a novel task,\nCollaborative Neural Painting (CNP), to facilitate collaborative art painting\ngeneration between humans and machines. Given any number of user-input\nbrushstrokes as the context or just the desired object class, CNP should\nproduce a sequence of strokes supporting the completion of a coherent painting.\nImportantly, the process can be gradual and iterative, so allowing users'\nmodifications at any phase until the completion. Moreover, we propose to solve\nthis task using a painting representation based on a sequence of parametrized\nstrokes, which makes it easy both editing and composition operations. These\nparametrized strokes are processed by a Transformer-based architecture with a\nnovel attention mechanism to model the relationship between the input strokes\nand the strokes to complete. We also propose a new masking scheme to reflect\nthe interactive nature of CNP and adopt diffusion models as the basic learning\nprocess for its effectiveness and diversity in the generative field. Finally,\nto develop and validate methods on the novel task, we introduce a new dataset\nof painted objects and an evaluation protocol to benchmark CNP both\nquantitatively and qualitatively. We demonstrate the effectiveness of our\napproach and the potential of the CNP task as a promising avenue for future\nresearch.\n","authors":["Nicola Dall'Asen","Willi Menapace","Elia Peruzzo","Enver Sangineto","Yiming Wang","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2312.01800v1.pdf","comment":"Submitted to Computer Vision and Image Understanding, project website\n at https://fodark.github.io/collaborative-neural-painting/"},{"id":"http://arxiv.org/abs/2311.12553v3","updated":"2023-12-04T10:41:15Z","published":"2023-11-21T12:05:56Z","title":"HoVer-UNet: Accelerating HoVerNet with UNet-based multi-class nuclei\n segmentation via knowledge distillation","summary":" We present HoVer-UNet, an approach to distill the knowledge of the\nmulti-branch HoVerNet framework for nuclei instance segmentation and\nclassification in histopathology. We propose a compact, streamlined single UNet\nnetwork with a Mix Vision Transformer backbone, and equip it with a custom loss\nfunction to optimally encode the distilled knowledge of HoVerNet, reducing\ncomputational requirements without compromising performances. We show that our\nmodel achieved results comparable to HoVerNet on the public PanNuke and Consep\ndatasets with a three-fold reduction in inference time. 
We make the code of our\nmodel publicly available at https://github.com/DIAGNijmegen/HoVer-UNet.\n","authors":["Cristian Tommasino","Cristiano Russo","Antonio Maria Rinaldi","Francesco Ciompi"],"pdf_url":"https://arxiv.org/pdf/2311.12553v3.pdf","comment":"4 pages, 2 figures, submitted to ISBI 2024"},{"id":"http://arxiv.org/abs/2307.06614v2","updated":"2023-12-04T10:36:45Z","published":"2023-07-13T08:27:09Z","title":"Interpretable 2D Vision Models for 3D Medical Images","summary":" Training Artificial Intelligence (AI) models on 3D images presents unique\nchallenges compared to the 2D case: Firstly, the demand for computational\nresources is significantly higher, and secondly, the availability of large\ndatasets for pre-training is often limited, impeding training success. This\nstudy proposes a simple approach of adapting 2D networks with an intermediate\nfeature representation for processing 3D images. Our method employs attention\npooling to learn to assign each slice an importance weight and, by that, obtain\na weighted average of all 2D slices. These weights directly quantify the\ncontribution of each slice to the contribution and thus make the model\nprediction inspectable. We show on all 3D MedMNIST datasets as benchmark and\ntwo real-world datasets consisting of several hundred high-resolution CT or MRI\nscans that our approach performs on par with existing methods. Furthermore, we\ncompare the in-built interpretability of our approach to HiResCam, a\nstate-of-the-art retrospective interpretability approach.\n","authors":["Alexander Ziller","Ayhan Can Erdur","Marwa Trigui","Alp Güvenir","Tamara T. Mueller","Philip Müller","Friederike Jungmann","Johannes Brandt","Jan Peeken","Rickmer Braren","Daniel Rueckert","Georgios Kaissis"],"pdf_url":"https://arxiv.org/pdf/2307.06614v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2004.11968v3","updated":"2023-12-04T10:29:42Z","published":"2020-04-16T09:44:08Z","title":"Visualizing key features in X-ray images of epoxy resins for improved\n material classification using singular value decomposition of deep learning\n features","summary":" Although the process variables of epoxy resins alter their mechanical\nproperties, the visual identification of the characteristic features of X-ray\nimages of samples of these materials is challenging. To facilitate the\nidentification, we approximate the magnitude of the gradient of the intensity\nfield of the X-ray images of different kinds of epoxy resins and then we use\ndeep learning to discover the most representative features of the transformed\nimages. In this solution of the inverse problem to finding characteristic\nfeatures to discriminate samples of heterogeneous materials, we use the\neigenvectors obtained from the singular value decomposition of all the channels\nof the feature maps of the early layers in a convolutional neural network.\nWhile the strongest activated channel gives a visual representation of the\ncharacteristic features, often these are not robust enough in some practical\nsettings. On the other hand, the left singular vectors of the matrix\ndecomposition of the feature maps, barely change when variables such as the\ncapacity of the network or network architecture change. 
High classification\naccuracy and robustness of characteristic features are presented in this work.\n","authors":["Edgar Avalos","Kazuto Akagi","Yasumasa Nishiura"],"pdf_url":"https://arxiv.org/pdf/2004.11968v3.pdf","comment":"43 pages, 16 figures"},{"id":"http://arxiv.org/abs/2312.01790v1","updated":"2023-12-04T10:25:42Z","published":"2023-12-04T10:25:42Z","title":"Exploring Multi-Modal Fusion for Image Manipulation Detection and\n Localization","summary":" Recent image manipulation localization and detection techniques usually\nleverage forensic artifacts and traces that are produced by a noise-sensitive\nfilter, such as SRM and Bayar convolution. In this paper, we showcase that\ndifferent filters commonly used in such approaches excel at unveiling different\ntypes of manipulations and provide complementary forensic traces. Thus, we\nexplore ways of merging the outputs of such filters and aim to leverage the\ncomplementary nature of the artifacts produced to perform image manipulation\nlocalization and detection (IMLD). We propose two distinct methods: one that\nproduces independent features from each forensic filter and then fuses them\n(this is referred to as late fusion) and one that performs early mixing of\ndifferent modal outputs and produces early combined features (this is referred\nto as early fusion). We demonstrate that both approaches achieve competitive\nperformance for both image manipulation localization and detection,\noutperforming state-of-the-art models across several datasets.\n","authors":["Konstantinos Triaridis","Vasileios Mezaris"],"pdf_url":"https://arxiv.org/pdf/2312.01790v1.pdf","comment":"Accepted for publication, 30th Int. Conf. on MultiMedia Modeling (MMM\n 2024), Amsterdam, NL, Jan.-Feb. 2024. This is the \"submitted manuscript\"\n version"},{"id":"http://arxiv.org/abs/2312.01789v1","updated":"2023-12-04T10:25:34Z","published":"2023-12-04T10:25:34Z","title":"Two-stage optimized unified adversarial patch for attacking\n visible-infrared cross-modal detectors in the physical world","summary":" Currently, many studies have addressed security concerns related to visible\nand infrared detectors independently. In practical scenarios, utilizing\ncross-modal detectors for tasks proves more reliable than relying on\nsingle-modal detectors. Despite this, there is a lack of comprehensive security\nevaluations for cross-modal detectors. While existing research has explored the\nfeasibility of attacks against cross-modal detectors, the implementation of a\nrobust attack remains unaddressed. This work introduces the Two-stage Optimized\nUnified Adversarial Patch (TOUAP) designed for performing attacks against\nvisible-infrared cross-modal detectors in real-world, black-box settings. The\nTOUAP employs a two-stage optimization process: firstly, PSO optimizes an\nirregular polygonal infrared patch to attack the infrared detector; secondly,\nthe color QR code is optimized, and the shape information of the infrared patch\nfrom the first stage is used as a mask. The resulting irregular polygon visible\nmodal patch executes an attack on the visible detector. Through extensive\nexperiments conducted in both digital and physical environments, we validate\nthe effectiveness and robustness of the proposed method. 
As the TOUAP surpasses\nbaseline performance, we advocate for its widespread attention.\n","authors":["Chengyin Hu","Weiwen Shi"],"pdf_url":"https://arxiv.org/pdf/2312.01789v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.12484v3","updated":"2023-12-04T10:14:57Z","published":"2023-03-22T11:51:49Z","title":"Label-Efficient Deep Learning in Medical Image Analysis: Challenges and\n Future Directions","summary":" Deep learning has seen rapid growth in recent years and achieved\nstate-of-the-art performance in a wide range of applications. However, training\nmodels typically requires expensive and time-consuming collection of large\nquantities of labeled data. This is particularly true within the scope of\nmedical imaging analysis (MIA), where data are limited and labels are expensive\nto be acquired. Thus, label-efficient deep learning methods are developed to\nmake comprehensive use of the labeled data as well as the abundance of\nunlabeled and weak-labeled data. In this survey, we extensively investigated\nover 300 recent papers to provide a comprehensive overview of recent progress\non label-efficient learning strategies in MIA. We first present the background\nof label-efficient learning and categorize the approaches into different\nschemes. Next, we examine the current state-of-the-art methods in detail\nthrough each scheme. Specifically, we provide an in-depth investigation,\ncovering not only canonical semi-supervised, self-supervised, and\nmulti-instance learning schemes, but also recently emerged active and\nannotation-efficient learning strategies. Moreover, as a comprehensive\ncontribution to the field, this survey not only elucidates the commonalities\nand unique features of the surveyed methods but also presents a detailed\nanalysis of the current challenges in the field and suggests potential avenues\nfor future research.\n","authors":["Cheng Jin","Zhengrui Guo","Yi Lin","Luyang Luo","Hao Chen"],"pdf_url":"https://arxiv.org/pdf/2303.12484v3.pdf","comment":"Update 2023 papers"},{"id":"http://arxiv.org/abs/2305.16746v3","updated":"2023-12-04T09:52:25Z","published":"2023-05-26T08:48:17Z","title":"CNN Feature Map Augmentation for Single-Source Domain Generalization","summary":" In search of robust and generalizable machine learning models, Domain\nGeneralization (DG) has gained significant traction during the past few years.\nThe goal in DG is to produce models which continue to perform well when\npresented with data distributions different from the ones available during\ntraining. While deep convolutional neural networks (CNN) have been able to\nachieve outstanding performance on downstream computer vision tasks, they still\noften fail to generalize on previously unseen data Domains. Therefore, in this\nwork we focus on producing a model which is able to remain robust under data\ndistribution shift and propose an alternative regularization technique for\nconvolutional neural network architectures in the single-source DG image\nclassification setting. To mitigate the problem caused by domain shift between\nsource and target data, we propose augmenting intermediate feature maps of\nCNNs. Specifically, we pass them through a novel Augmentation Layer} to prevent\nmodels from overfitting on the training set and improve their cross-domain\ngeneralization. To the best of our knowledge, this is the first paper proposing\nsuch a setup for the DG image classification setting. 
Experiments on the DG\nbenchmark datasets of PACS, VLCS, Office-Home and TerraIncognita validate the\neffectiveness of our method, in which our model surpasses state-of-the-art\nalgorithms in most cases.\n","authors":["Aristotelis Ballas","Christos Diou"],"pdf_url":"https://arxiv.org/pdf/2305.16746v3.pdf","comment":"In proceedings of IEEE BigDataService2023\n (https://ieeebigdataservice.com/)"},{"id":"http://arxiv.org/abs/2312.01771v1","updated":"2023-12-04T09:48:29Z","published":"2023-12-04T09:48:29Z","title":"IMProv: Inpainting-based Multimodal Prompting for Computer Vision Tasks","summary":" In-context learning allows adapting a model to new tasks given a task\ndescription at test time. In this paper, we present IMProv - a generative model\nthat is able to in-context learn visual tasks from multimodal prompts. Given a\ntextual description of a visual task (e.g. \"Left: input image, Right:\nforeground segmentation\"), a few input-output visual examples, or both, the\nmodel in-context learns to solve it for a new test input. We train a masked\ngenerative transformer on a new dataset of figures from computer vision papers\nand their associated captions, together with a captioned large-scale image-text\ndataset. During inference time, we prompt the model with text and/or image task\nexample(s) and have the model inpaint the corresponding output. We show that\ntraining our model with text conditioning and scaling the dataset size improves\nin-context learning for computer vision tasks by over +10\\% AP for Foreground\nSegmentation, over +5\\% gains in AP for Single Object Detection, and almost\n20\\% lower LPIPS in Colorization. Our empirical results suggest that vision and\nlanguage prompts are complementary and it is advantageous to use both to\nachieve better in-context learning performance. Project page is available at\nhttps://jerryxu.net/IMProv .\n","authors":["Jiarui Xu","Yossi Gandelsman","Amir Bar","Jianwei Yang","Jianfeng Gao","Trevor Darrell","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2312.01771v1.pdf","comment":"Project page: https://jerryxu.net/IMProv"},{"id":"http://arxiv.org/abs/2312.01768v1","updated":"2023-12-04T09:43:05Z","published":"2023-12-04T09:43:05Z","title":"Localizing and Assessing Node Significance in Default Mode Network using\n Sub-Community Detection in Mild Cognitive Impairment","summary":" Our study aims to utilize fMRI to identify the affected brain regions within\nthe Default Mode Network (DMN) in subjects with Mild Cognitive Impairment\n(MCI), using a novel Node Significance Score (NSS). We construct\nsubject-specific DMN graphs by employing partial correlation of Regions of\nInterest (ROIs) that make-up the DMN. For the DMN graph, ROIs are the nodes and\nedges are determined based on partial correlation. Four popular community\ndetection algorithms (Clique Percolation Method (CPM), Louvain algorithm,\nGreedy Modularity and Leading Eigenvectors) are applied to determine the\nlargest sub-community. NSS ratings are derived for each node, considering (I)\nfrequency in the largest sub-community within a class across all subjects and\n(II) occurrence in the largest sub-community according to all four methods.\nAfter computing the NSS of each ROI in both healthy and MCI subjects, we\nquantify the score disparity to identify nodes most impacted by MCI. The\nresults reveal a disparity exceeding 20% for 10 DMN nodes, maximally for PCC\nand Fusiform, showing 45.69% and 43.08% disparity. 
This aligns with existing\nmedical literature, additionally providing a quantitative measure that enables\nthe ordering of the affected ROIs. These findings offer valuable insights and\ncould lead to treatment strategies aggressively targeting the affected nodes.\n","authors":["Ameiy Acharya","Chakka Sai Pradeep","Neelam Sinha"],"pdf_url":"https://arxiv.org/pdf/2312.01768v1.pdf","comment":"4 pages, 2 figures"},{"id":"http://arxiv.org/abs/2312.01764v1","updated":"2023-12-04T09:40:11Z","published":"2023-12-04T09:40:11Z","title":"Dynamic Erasing Network Based on Multi-Scale Temporal Features for\n Weakly Supervised Video Anomaly Detection","summary":" The goal of weakly supervised video anomaly detection is to learn a detection\nmodel using only video-level labeled data. However, prior studies typically\ndivide videos into fixed-length segments without considering the complexity or\nduration of anomalies. Moreover, these studies usually just detect the most\nabnormal segments, potentially overlooking the completeness of anomalies. To\naddress these limitations, we propose a Dynamic Erasing Network (DE-Net) for\nweakly supervised video anomaly detection, which learns multi-scale temporal\nfeatures. Specifically, to handle duration variations of abnormal events, we\nfirst propose a multi-scale temporal modeling module, capable of extracting\nfeatures from segments of varying lengths and capturing both local and global\nvisual information across different temporal scales. Then, we design a dynamic\nerasing strategy, which dynamically assesses the completeness of the detected\nanomalies and erases prominent abnormal segments in order to encourage the\nmodel to discover gentle abnormal segments in a video. The proposed method\nobtains favorable performance compared to several state-of-the-art approaches\non three datasets: XD-Violence, TAD, and UCF-Crime. Code will be made available\nat https://github.com/ArielZc/DE-Net.\n","authors":["Chen Zhang","Guorong Li","Yuankai Qi","Hanhua Ye","Laiyun Qing","Ming-Hsuan Yang","Qingming Huang"],"pdf_url":"https://arxiv.org/pdf/2312.01764v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01761v1","updated":"2023-12-04T09:37:27Z","published":"2023-12-04T09:37:27Z","title":"Light Field Imaging in the Restrictive Object Space based on Flexible\n Angular Plane","summary":" In some applications, the object space of light field imaging system is\nrestrictive, such as industrial and medical endoscopes. If the traditional\nlight field imaging system is used in the restrictive object space (ROS)\ndirectly but without any specific considerations, the ROS will lead to severe\nmicrolens image distortions and then affects light field decoding, calibration\nand 3D reconstruction. The light field imaging in restrictive object space\n(ROS-LF) is complicated but significant. In this paper, we first deduce that\nthe reason of the microlens image deviation is the position variation of the\nangular plane, then we propose the flexible angular plane for ROS-LF, while in\nthe traditional light field the angular plane always coincides with the main\nlens plane. Subsequently, we propose the microlens image non-distortion\nprinciple for ROS-LF and introduce the ROS-LF imaging principle. We demonstrate\nthat the difference is an aperture constant term between the ROS-LF and\ntraditional light field imaging models. 
At last, we design a ROS-LF simulated\nsystem and calibrate it to verify principles proposed in this paper.\n","authors":["Ping Zhou","Nuo Chen","Yuda Xu","Chengcai Xu"],"pdf_url":"https://arxiv.org/pdf/2312.01761v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01758v1","updated":"2023-12-04T09:35:36Z","published":"2023-12-04T09:35:36Z","title":"CZL-CIAE: CLIP-driven Zero-shot Learning for Correcting Inverse Age\n Estimation","summary":" Zero-shot age estimation aims to learn feature information about age from\ninput images and make inferences about a given person's image or video frame\nwithout specific sample data. The development of zero-shot age estimation can\nimprove the efficiency and accuracy of various applications (e.g., age\nverification and secure access control, etc.), while also promoting research on\nmulti-modal and zero-shot learning in the social media field. For example,\nzero-sample age estimation can be used to create social networks focused on\nspecific age groups. However, existing methods mainly focus on supervised,\nlabeled age estimation learning, and the prediction effect of zero-shot\nlearning is very poor. To tackle the above issues, we propose a novel\nCLIP-driven Zero-shot Learning for Correcting Inverse Age Estimation\n(CZL-CIAE). Specifically, we first introduce the CLIP model to extract image\nfeatures and text semantic information respectively, and map them into a highly\nsemantically aligned high-dimensional feature space. Next, we designed a new\nTransformer architecture (i.e., FourierFormer) to achieve channel evolution and\nspatial interaction of images, and to fuse image and text semantic information.\nFinally, we introduce reversible age estimation, which uses end-to-end error\nfeedback to reduce the error rate of age predictions. Through extensive\nexperiments on multiple data sets, CZL-CIAE has achieved better age prediction\nresults.\n","authors":["Yuntao Shou","Wei Ai","Tao Meng","Keqin Li"],"pdf_url":"https://arxiv.org/pdf/2312.01758v1.pdf","comment":"14 pages, 14 figures, 3 tables"},{"id":"http://arxiv.org/abs/2312.01756v1","updated":"2023-12-04T09:35:21Z","published":"2023-12-04T09:35:21Z","title":"A Comprehensive Literature Review on Sweet Orange Leaf Diseases","summary":" Sweet orange leaf diseases are significant to agricultural productivity. Leaf\ndiseases impact fruit quality in the citrus industry. The apparition of machine\nlearning makes the development of disease finder. Early detection and diagnosis\nare necessary for leaf management. Sweet orange leaf disease-predicting\nautomated systems have already been developed using different image-processing\ntechniques. This comprehensive literature review is systematically based on\nleaf disease and machine learning methodologies applied to the detection of\ndamaged leaves via image classification. The benefits and limitations of\ndifferent machine learning models, including Vision Transformer (ViT), Neural\nNetwork (CNN), CNN with SoftMax and RBF SVM, Hybrid CNN-SVM, HLB-ConvMLP,\nEfficientNet-b0, YOLOv5, YOLOv7, Convolutional, Deep CNN. These machine\nlearning models tested on various datasets and detected the disease. This\ncomprehensive review study related to leaf disease compares the performance of\nthe models; those models' accuracy, precision, recall, etc., were used in the\nsubsisting studies\n","authors":["Yousuf Rayhan Emon","Md Golam Rabbani","Dr. Md. 
Taimur Ahad","Faruk Ahmed"],"pdf_url":"https://arxiv.org/pdf/2312.01756v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2202.02149v4","updated":"2023-12-04T09:27:15Z","published":"2022-02-04T14:27:09Z","title":"3D Point Cloud Registration with Learning-based Matching Algorithm","summary":" We present a novel differential matching algorithm for 3D point cloud\nregistration. Instead of only optimizing the feature extractor for a matching\nalgorithm, we propose a learning-based matching module optimized to the\njointly-trained feature extractor. We focused on edge-wise feature-forwarding\narchitectures, which are memory-consuming but can avoid the over-smoothing\neffect that GNNs suffer. We improve its memory efficiency to scale it for point\ncloud registration while investigating the best way of connecting it to the\nfeature extractor. Experimental results show our matching module's significant\nimpact on performance improvement in rigid/non-rigid and whole/partial point\ncloud registration datasets with multiple contemporary feature extractors. For\nexample, our module boosted the current SOTA method, RoITr, by +5.4%, and +7.2%\nin the NFMR metric and +6.1% and +8.5% in the IR metric on the 4DMatch and\n4DLoMatch datasets, respectively.\n","authors":["Rintaro Yanagi","Atsushi Hashimoto","Shusaku Sone","Naoya Chiba","Jiaxin Ma","Yoshitaka Ushiku"],"pdf_url":"https://arxiv.org/pdf/2202.02149v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01753v1","updated":"2023-12-04T09:27:03Z","published":"2023-12-04T09:27:03Z","title":"Long-Tail Learning with Rebalanced Contrastive Loss","summary":" Integrating supervised contrastive loss to cross entropy-based communication\nhas recently been proposed as a solution to address the long-tail learning\nproblem. However, when the class imbalance ratio is high, it requires adjusting\nthe supervised contrastive loss to support the tail classes, as the\nconventional contrastive learning is biased towards head classes by default. To\nthis end, we present Rebalanced Contrastive Learning (RCL), an efficient means\nto increase the long tail classification accuracy by addressing three main\naspects: 1. Feature space balancedness - Equal division of the feature space\namong all the classes, 2. Intra-Class compactness - Reducing the distance\nbetween same-class embeddings, 3. Regularization - Enforcing larger margins for\ntail classes to reduce overfitting. RCL adopts class frequency-based SoftMax\nloss balancing to supervised contrastive learning loss and exploits scalar\nmultiplied features fed to the contrastive learning loss to enforce\ncompactness. We implement RCL on the Balanced Contrastive Learning (BCL)\nFramework, which has the SOTA performance. Our experiments on three benchmark\ndatasets demonstrate the richness of the learnt embeddings and increased top-1\nbalanced accuracy RCL provides to the BCL framework. 
We further demonstrate\nthat the performance of RCL as a standalone loss also achieves state-of-the-art\nlevel accuracy.\n","authors":["Charika De Alvis","Dishanika Denipitiyage","Suranga Seneviratne"],"pdf_url":"https://arxiv.org/pdf/2312.01753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01746v1","updated":"2023-12-04T09:10:25Z","published":"2023-12-04T09:10:25Z","title":"Open-DDVM: A Reproduction and Extension of Diffusion Model for Optical\n Flow Estimation","summary":" Recently, Google proposes DDVM which for the first time demonstrates that a\ngeneral diffusion model for image-to-image translation task works impressively\nwell on optical flow estimation task without any specific designs like RAFT.\nHowever, DDVM is still a closed-source model with the expensive and private\nPalette-style pretraining. In this technical report, we present the first\nopen-source DDVM by reproducing it. We study several design choices and find\nthose important ones. By training on 40k public data with 4 GPUs, our\nreproduction achieves comparable performance to the closed-source DDVM. The\ncode and model have been released in\nhttps://github.com/DQiaole/FlowDiffusion_pytorch.\n","authors":["Qiaole Dong","Bo Zhao","Yanwei Fu"],"pdf_url":"https://arxiv.org/pdf/2312.01746v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2312.01745v1","updated":"2023-12-04T09:10:24Z","published":"2023-12-04T09:10:24Z","title":"Cross-Modal Adaptive Dual Association for Text-to-Image Person Retrieval","summary":" Text-to-image person re-identification (ReID) aims to retrieve images of a\nperson based on a given textual description. The key challenge is to learn the\nrelations between detailed information from visual and textual modalities.\nExisting works focus on learning a latent space to narrow the modality gap and\nfurther build local correspondences between two modalities. However, these\nmethods assume that image-to-text and text-to-image associations are\nmodality-agnostic, resulting in suboptimal associations. In this work, we show\nthe discrepancy between image-to-text association and text-to-image association\nand propose CADA: Cross-Modal Adaptive Dual Association that finely builds\nbidirectional image-text detailed associations. Our approach features a\ndecoder-based adaptive dual association module that enables full interaction\nbetween visual and textual modalities, allowing for bidirectional and adaptive\ncross-modal correspondence associations. Specifically, the paper proposes a\nbidirectional association mechanism: Association of text Tokens to image\nPatches (ATP) and Association of image Regions to text Attributes (ARA). We\nadaptively model the ATP based on the fact that aggregating cross-modal\nfeatures based on mistaken associations will lead to feature distortion. For\nmodeling the ARA, since the attributes are typically the first distinguishing\ncues of a person, we propose to explore the attribute-level association by\npredicting the masked text phrase using the related image region. Finally, we\nlearn the dual associations between texts and images, and the experimental\nresults demonstrate the superiority of our dual formulation. 
Codes will be made\npublicly available.\n","authors":["Dixuan Lin","Yixing Peng","Jingke Meng","Wei-Shi Zheng"],"pdf_url":"https://arxiv.org/pdf/2312.01745v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01742v1","updated":"2023-12-04T09:07:09Z","published":"2023-12-04T09:07:09Z","title":"Fully Spiking Denoising Diffusion Implicit Models","summary":" Spiking neural networks (SNNs) have garnered considerable attention owing to\ntheir ability to run on neuromorphic devices with super-high speeds and\nremarkable energy efficiencies. SNNs can be used in conventional neural\nnetwork-based time- and energy-consuming applications. However, research on\ngenerative models within SNNs remains limited, despite their advantages. In\nparticular, diffusion models are a powerful class of generative models, whose\nimage generation quality surpass that of the other generative models, such as\nGANs. However, diffusion models are characterized by high computational costs\nand long inference times owing to their iterative denoising feature. Therefore,\nwe propose a novel approach fully spiking denoising diffusion implicit model\n(FSDDIM) to construct a diffusion model within SNNs and leverage the high speed\nand low energy consumption features of SNNs via synaptic current learning\n(SCL). SCL fills the gap in that diffusion models use a neural network to\nestimate real-valued parameters of a predefined probabilistic distribution,\nwhereas SNNs output binary spike trains. The SCL enables us to complete the\nentire generative process of diffusion models exclusively using SNNs. We\ndemonstrate that the proposed method outperforms the state-of-the-art fully\nspiking generative model.\n","authors":["Ryo Watanabe","Yusuke Mukuta","Tatsuya Harada"],"pdf_url":"https://arxiv.org/pdf/2312.01742v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01741v1","updated":"2023-12-04T09:06:41Z","published":"2023-12-04T09:06:41Z","title":"SRSNetwork: Siamese Reconstruction-Segmentation Networks based on\n Dynamic-Parameter Convolution","summary":" In this paper, we present a high-performance deep neural network for weak\ntarget image segmentation, including medical image segmentation and infrared\nimage segmentation. To this end, this work analyzes the existing dynamic\nconvolutions and proposes dynamic parameter convolution (DPConv). Furthermore,\nit reevaluates the relationship between reconstruction tasks and segmentation\ntasks from the perspective of DPConv, leading to the proposal of a dual-network\nmodel called the Siamese Reconstruction-Segmentation Network (SRSNet). The\nproposed model is not only a universal network but also enhances the\nsegmentation performance without altering its structure, leveraging the\nreconstruction task. Additionally, as the amount of training data for the\nreconstruction network increases, the performance of the segmentation network\nalso improves synchronously. On seven datasets including five medical datasets\nand two infrared image datasets, our SRSNet consistently achieves the best\nsegmentation results. The code is released at https://github.com/fidshu/SRSNet.\n","authors":["Bingkun Nian","Fenghe Tang","Jianrui Ding","Pingping Zhang","Jie Yang","S. 
Kevin Zhou","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2312.01741v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2312.01740v1","updated":"2023-12-04T09:04:05Z","published":"2023-12-04T09:04:05Z","title":"MobileUtr: Revisiting the relationship between light-weight CNN and\n Transformer for efficient medical image segmentation","summary":" Due to the scarcity and specific imaging characteristics in medical images,\nlight-weighting Vision Transformers (ViTs) for efficient medical image\nsegmentation is a significant challenge, and current studies have not yet paid\nattention to this issue. This work revisits the relationship between CNNs and\nTransformers in lightweight universal networks for medical image segmentation,\naiming to integrate the advantages of both worlds at the infrastructure design\nlevel. In order to leverage the inductive bias inherent in CNNs, we abstract a\nTransformer-like lightweight CNNs block (ConvUtr) as the patch embeddings of\nViTs, feeding Transformer with denoised, non-redundant and highly condensed\nsemantic information. Moreover, an adaptive Local-Global-Local (LGL) block is\nintroduced to facilitate efficient local-to-global information flow exchange,\nmaximizing Transformer's global context information extraction capabilities.\nFinally, we build an efficient medical image segmentation model (MobileUtr)\nbased on CNN and Transformer. Extensive experiments on five public medical\nimage datasets with three different modalities demonstrate the superiority of\nMobileUtr over the state-of-the-art methods, while boasting lighter weights and\nlower computational cost. Code is available at\nhttps://github.com/FengheTan9/MobileUtr.\n","authors":["Fenghe Tang","Bingkun Nian","Jianrui Ding","Quan Quan","Jie Yang","Wei Liu","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.01740v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2307.08596v2","updated":"2023-12-04T09:01:14Z","published":"2023-07-14T07:09:57Z","title":"Omnipotent Adversarial Training in the Wild","summary":" Adversarial training is an important topic in robust deep learning, but the\ncommunity lacks attention to its practical usage. In this paper, we aim to\nresolve a real-world challenge, i.e., training a model on an imbalanced and\nnoisy dataset to achieve high clean accuracy and adversarial robustness, with\nour proposed Omnipotent Adversarial Training (OAT) strategy. OAT consists of\ntwo innovative methodologies to address the imperfection in the training set.\nWe first introduce an oracle into the adversarial training process to help the\nmodel learn a correct data-label conditional distribution. This\ncarefully-designed oracle can provide correct label annotations for adversarial\ntraining. We further propose logits adjustment adversarial training to overcome\nthe data imbalance issue, which can help the model learn a Bayes-optimal\ndistribution. Our comprehensive evaluation results show that OAT outperforms\nother baselines by more than 20% clean accuracy improvement and 10% robust\naccuracy improvement under complex combinations of data imbalance and label\nnoise scenarios. 
The code can be found in https://github.com/GuanlinLee/OAT.\n","authors":["Guanlin Li","Kangjie Chen","Yuan Xu","Han Qiu","Tianwei Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.08596v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01734v1","updated":"2023-12-04T08:55:46Z","published":"2023-12-04T08:55:46Z","title":"Effective Adapter for Face Recognition in the Wild","summary":" In this paper, we tackle the challenge of face recognition in the wild, where\nimages often suffer from low quality and real-world distortions. Traditional\nheuristic approaches-either training models directly on these degraded images\nor their enhanced counterparts using face restoration techniques-have proven\nineffective, primarily due to the degradation of facial features and the\ndiscrepancy in image domains. To overcome these issues, we propose an effective\nadapter for augmenting existing face recognition models trained on high-quality\nfacial datasets. The key of our adapter is to process both the unrefined and\nthe enhanced images by two similar structures where one is fixed and the other\ntrainable. Such design can confer two benefits. First, the dual-input system\nminimizes the domain gap while providing varied perspectives for the face\nrecognition model, where the enhanced image can be regarded as a complex\nnon-linear transformation of the original one by the restoration model. Second,\nboth two similar structures can be initialized by the pre-trained models\nwithout dropping the past knowledge. The extensive experiments in zero-shot\nsettings show the effectiveness of our method by surpassing baselines of about\n3%, 4%, and 7% in three datasets. Our code will be publicly available at\nhttps://github.com/liuyunhaozz/FaceAdapter/.\n","authors":["Yunhao Liu","Lu Qi","Yu-Ju Tsai","Xiangtai Li","Kelvin C. K. Chan","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2312.01734v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01732v1","updated":"2023-12-04T08:53:59Z","published":"2023-12-04T08:53:59Z","title":"Likelihood-Aware Semantic Alignment for Full-Spectrum\n Out-of-Distribution Detection","summary":" Full-spectrum out-of-distribution (F-OOD) detection aims to accurately\nrecognize in-distribution (ID) samples while encountering semantic and\ncovariate shifts simultaneously. However, existing out-of-distribution (OOD)\ndetectors tend to overfit the covariance information and ignore intrinsic\nsemantic correlation, inadequate for adapting to complex domain\ntransformations. To address this issue, we propose a Likelihood-Aware Semantic\nAlignment (LSA) framework to promote the image-text correspondence into\nsemantically high-likelihood regions. LSA consists of an offline Gaussian\nsampling strategy which efficiently samples semantic-relevant visual embeddings\nfrom the class-conditional Gaussian distribution, and a bidirectional prompt\ncustomization mechanism that adjusts both ID-related and negative context for\ndiscriminative ID/OOD boundary. 
Extensive experiments demonstrate the\nremarkable OOD detection performance of our proposed LSA especially on the\nintractable Near-OOD setting, surpassing existing methods by a margin of\n$15.26\\%$ and $18.88\\%$ on two F-OOD benchmarks, respectively.\n","authors":["Fan Lu","Kai Zhu","Kecheng Zheng","Wei Zhai","Yang Cao"],"pdf_url":"https://arxiv.org/pdf/2312.01732v1.pdf","comment":"16 pages, 7 figures"},{"id":"http://arxiv.org/abs/2307.10205v2","updated":"2023-12-04T08:53:58Z","published":"2023-07-14T07:01:48Z","title":"Alleviating the Effect of Data Imbalance on Adversarial Training","summary":" In this paper, we study adversarial training on datasets that obey the\nlong-tailed distribution, which is practical but rarely explored in previous\nworks. Compared with conventional adversarial training on balanced datasets,\nthis process falls into the dilemma of generating uneven adversarial examples\n(AEs) and an unbalanced feature embedding space, causing the resulting model to\nexhibit low robustness and accuracy on tail data. To combat that, we\ntheoretically analyze the lower bound of the robust risk to train a model on a\nlong-tailed dataset to obtain the key challenges in addressing the\naforementioned dilemmas. Based on it, we propose a new adversarial training\nframework -- Re-balancing Adversarial Training (REAT). This framework consists\nof two components: (1) a new training strategy inspired by the effective number\nto guide the model to generate more balanced and informative AEs; (2) a\ncarefully constructed penalty function to force a satisfactory feature space.\nEvaluation results on different datasets and model structures prove that REAT\ncan effectively enhance the model's robustness and preserve the model's clean\naccuracy. The code can be found in https://github.com/GuanlinLee/REAT.\n","authors":["Guanlin Li","Guowen Xu","Tianwei Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.10205v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15984v2","updated":"2023-12-04T08:50:31Z","published":"2023-08-30T12:13:13Z","title":"Learning Structure-from-Motion with Graph Attention Networks","summary":" In this paper we tackle the problem of learning Structure-from-Motion (SfM)\nthrough the use of graph attention networks. SfM is a classic computer vision\nproblem that is solved though iterative minimization of reprojection errors,\nreferred to as Bundle Adjustment (BA), starting from a good initialization. In\norder to obtain a good enough initialization to BA, conventional methods rely\non a sequence of sub-problems (such as pairwise pose estimation, pose averaging\nor triangulation) which provides an initial solution that can then be refined\nusing BA. In this work we replace these sub-problems by learning a model that\ntakes as input the 2D keypoints detected across multiple views, and outputs the\ncorresponding camera poses and 3D keypoint coordinates. Our model takes\nadvantage of graph neural networks to learn SfM-specific primitives, and we\nshow that it can be used for fast inference of the reconstruction for new and\nunseen sequences. 
The experimental results show that the proposed model\noutperforms competing learning-based methods, and challenges COLMAP while\nhaving lower runtime.\n","authors":["Lucas Brynte","José Pedro Iglesias","Carl Olsson","Fredrik Kahl"],"pdf_url":"https://arxiv.org/pdf/2308.15984v2.pdf","comment":"Added additional metrics"},{"id":"http://arxiv.org/abs/2104.13742v2","updated":"2023-12-04T08:33:55Z","published":"2021-04-28T13:10:56Z","title":"MineGAN++: Mining Generative Models for Efficient Knowledge Transfer to\n Limited Data Domains","summary":" GANs largely increases the potential impact of generative models. Therefore,\nwe propose a novel knowledge transfer method for generative models based on\nmining the knowledge that is most beneficial to a specific target domain,\neither from a single or multiple pretrained GANs. This is done using a miner\nnetwork that identifies which part of the generative distribution of each\npretrained GAN outputs samples closest to the target domain. Mining effectively\nsteers GAN sampling towards suitable regions of the latent space, which\nfacilitates the posterior finetuning and avoids pathologies of other methods,\nsuch as mode collapse and lack of flexibility. Furthermore, to prevent\noverfitting on small target domains, we introduce sparse subnetwork selection,\nthat restricts the set of trainable neurons to those that are relevant for the\ntarget dataset. We perform comprehensive experiments on several challenging\ndatasets using various GAN architectures (BigGAN, Progressive GAN, and\nStyleGAN) and show that the proposed method, called MineGAN, effectively\ntransfers knowledge to domains with few target images, outperforming existing\nmethods. In addition, MineGAN can successfully transfer knowledge from multiple\npretrained GANs.\n","authors":["Yaxing Wang","Abel Gonzalez-Garcia","Chenshen Wu","Luis Herranz","Fahad Shahbaz Khan","Shangling Jui","Joost van de Weijer"],"pdf_url":"https://arxiv.org/pdf/2104.13742v2.pdf","comment":"accepted at IJCV. arXiv admin note: substantial text overlap with\n arXiv:1912.05270"},{"id":"http://arxiv.org/abs/2312.01726v1","updated":"2023-12-04T08:32:31Z","published":"2023-12-04T08:32:31Z","title":"Simultaneous Alignment and Surface Regression Using Hybrid 2D-3D\n Networks for 3D Coherent Layer Segmentation of Retinal OCT Images with Full\n and Sparse Annotations","summary":" Layer segmentation is important to quantitative analysis of retinal optical\ncoherence tomography (OCT). Recently, deep learning based methods have been\ndeveloped to automate this task and yield remarkable performance. However, due\nto the large spatial gap and potential mismatch between the B-scans of an OCT\nvolume, all of them were based on 2D segmentation of individual B-scans, which\nmay lose the continuity and diagnostic information of the retinal layers in 3D\nspace. Besides, most of these methods required dense annotation of the OCT\nvolumes, which is labor-intensive and expertise-demanding. This work presents a\nnovel framework based on hybrid 2D-3D convolutional neural networks (CNNs) to\nobtain continuous 3D retinal layer surfaces from OCT volumes, which works well\nwith both full and sparse annotations. The 2D features of individual B-scans\nare extracted by an encoder consisting of 2D convolutions. These 2D features\nare then used to produce the alignment displacement vectors and layer\nsegmentation by two 3D decoders coupled via a spatial transformer module. 
Two\nlosses are proposed to utilize the retinal layers' natural property of being\nsmooth for B-scan alignment and layer segmentation, respectively, and are the\nkey to the semi-supervised learning with sparse annotation. The entire\nframework is trained end-to-end. To the best of our knowledge, this is the\nfirst work that attempts 3D retinal layer segmentation in volumetric OCT images\nbased on CNNs. Experiments on a synthetic dataset and three public clinical\ndatasets show that our framework can effectively align the B-scans for\npotential motion correction, and achieves superior performance to\nstate-of-the-art 2D deep learning methods in terms of both layer segmentation\naccuracy and cross-B-scan 3D continuity in both fully and semi-supervised\nsettings, thus offering more clinical values than previous works.\n","authors":["Hong Liu","Dong Wei","Donghuan Lu","Xiaoying Tang","Liansheng Wang","Yefeng Zheng"],"pdf_url":"https://arxiv.org/pdf/2312.01726v1.pdf","comment":"Accepted by MIA"},{"id":"http://arxiv.org/abs/2312.01725v1","updated":"2023-12-04T08:27:59Z","published":"2023-12-04T08:27:59Z","title":"StableVITON: Learning Semantic Correspondence with Latent Diffusion\n Model for Virtual Try-On","summary":" Given a clothing image and a person image, an image-based virtual try-on aims\nto generate a customized image that appears natural and accurately reflects the\ncharacteristics of the clothing image. In this work, we aim to expand the\napplicability of the pre-trained diffusion model so that it can be utilized\nindependently for the virtual try-on task.The main challenge is to preserve the\nclothing details while effectively utilizing the robust generative capability\nof the pre-trained model. In order to tackle these issues, we propose\nStableVITON, learning the semantic correspondence between the clothing and the\nhuman body within the latent space of the pre-trained diffusion model in an\nend-to-end manner. Our proposed zero cross-attention blocks not only preserve\nthe clothing details by learning the semantic correspondence but also generate\nhigh-fidelity images by utilizing the inherent knowledge of the pre-trained\nmodel in the warping process. Through our proposed novel attention total\nvariation loss and applying augmentation, we achieve the sharp attention map,\nresulting in a more precise representation of clothing details. StableVITON\noutperforms the baselines in qualitative and quantitative evaluation, showing\npromising quality in arbitrary person images. Our code is available at\nhttps://github.com/rlawjdghek/StableVITON.\n","authors":["Jeongho Kim","Gyojung Gu","Minho Park","Sunghyun Park","Jaegul Choo"],"pdf_url":"https://arxiv.org/pdf/2312.01725v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2311.08239v2","updated":"2023-12-04T08:25:58Z","published":"2023-11-14T15:20:42Z","title":"Learning Physics-Inspired Regularization for Medical Image Registration\n with Hypernetworks","summary":" Medical image registration aims at identifying the spatial deformation\nbetween images of the same anatomical region and is fundamental to image-based\ndiagnostics and therapy. To date, the majority of the deep learning-based\nregistration methods employ regularizers that enforce global spatial\nsmoothness, e.g., the diffusion regularizer. However, such regularizers are not\ntailored to the data and might not be capable of reflecting the complex\nunderlying deformation. In contrast, physics-inspired regularizers promote\nphysically plausible deformations. 
One such regularizer is the linear elastic\nregularizer which models the deformation of elastic material. These\nregularizers are driven by parameters that define the material's physical\nproperties. For biological tissue, a wide range of estimations of such\nparameters can be found in the literature and it remains an open challenge to\nidentify suitable parameter values for successful registration. To overcome\nthis problem and to incorporate physical properties into learning-based\nregistration, we propose to use a hypernetwork that learns the effect of the\nphysical parameters of a physics-inspired regularizer on the resulting spatial\ndeformation field. In particular, we adapt the HyperMorph framework to learn\nthe effect of the two elasticity parameters of the linear elastic regularizer.\nOur approach enables the efficient discovery of suitable, data-specific\nphysical parameters at test time.\n","authors":["Anna Reithmeir","Julia A. Schnabel","Veronika A. Zimmer"],"pdf_url":"https://arxiv.org/pdf/2311.08239v2.pdf","comment":"Manuscript accepted at SPIE Medical Imaging 2024"},{"id":"http://arxiv.org/abs/2308.06762v2","updated":"2023-12-04T08:08:05Z","published":"2023-08-13T12:51:15Z","title":"Tissue Segmentation of Thick-Slice Fetal Brain MR Scans with Guidance\n from High-Quality Isotropic Volumes","summary":" Accurate tissue segmentation of thick-slice fetal brain magnetic resonance\n(MR) scans is crucial for both reconstruction of isotropic brain MR volumes and\nthe quantification of fetal brain development. However, this task is\nchallenging due to the use of thick-slice scans in clinically-acquired fetal\nbrain data. To address this issue, we propose to leverage high-quality\nisotropic fetal brain MR volumes (and also their corresponding annotations) as\nguidance for segmentation of thick-slice scans. Due to existence of significant\ndomain gap between high-quality isotropic volume (i.e., source data) and\nthick-slice scans (i.e., target data), we employ a domain adaptation technique\nto achieve the associated knowledge transfer (from high-quality \nvolumes to thick-slice scans). Specifically, we first register the\navailable high-quality isotropic fetal brain MR volumes across different\ngestational weeks to construct longitudinally-complete source data. To capture\ndomain-invariant information, we then perform Fourier decomposition to extract\nimage content and style codes. Finally, we propose a novel Cycle-Consistent\nDomain Adaptation Network (C2DA-Net) to efficiently transfer the knowledge\nlearned from high-quality isotropic volumes for accurate tissue segmentation of\nthick-slice scans. Our C2DA-Net can fully utilize a small set of annotated\nisotropic volumes to guide tissue segmentation on unannotated thick-slice\nscans. 
Extensive experiments on a large-scale dataset of 372 clinically\nacquired thick-slice MR scans demonstrate that our C2DA-Net achieves much\nbetter performance than cutting-edge methods quantitatively and qualitatively.\n","authors":["Shijie Huang","Xukun Zhang","Zhiming Cui","He Zhang","Geng Chen","Dinggang Shen"],"pdf_url":"https://arxiv.org/pdf/2308.06762v2.pdf","comment":"10 pages, 9 figures, 5 tables, Fetal MRI, Brain tissue segmentation,\n Unsupervised domain adaptation, Cycle-consistency"},{"id":"http://arxiv.org/abs/2312.01713v1","updated":"2023-12-04T08:02:59Z","published":"2023-12-04T08:02:59Z","title":"Disentangled Interaction Representation for One-Stage Human-Object\n Interaction Detection","summary":" Human-Object Interaction (HOI) detection is a core task for human-centric\nimage understanding. Recent one-stage methods adopt a transformer decoder to\ncollect image-wide cues that are useful for interaction prediction; however,\nthe interaction representations obtained using this method are entangled and\nlack interpretability. In contrast, traditional two-stage methods benefit\nsignificantly from their ability to compose interaction features in a\ndisentangled and explainable manner. In this paper, we improve the performance\nof one-stage methods by enabling them to extract disentangled interaction\nrepresentations. First, we propose Shunted Cross-Attention (SCA) to extract\nhuman appearance, object appearance, and global context features using\ndifferent cross-attention heads. This is achieved by imposing different masks\non the cross-attention maps produced by the different heads. Second, we\nintroduce the Interaction-aware Pose Estimation (IPE) task to learn\ninteraction-relevant human pose features using a disentangled decoder. This is\nachieved with a novel attention module that accurately captures the human\nkeypoints relevant to the current interaction category. Finally, our approach\nfuses the appearance feature and pose feature via element-wise addition to form\nthe interaction representation. Experimental results show that our approach can\nbe readily applied to existing one-stage HOI detectors. Moreover, we achieve\nstate-of-the-art performance on two benchmarks: HICO-DET and V-COCO.\n","authors":["Xubin Zhong","Changxing Ding","Yupeng Hu","Dacheng Tao"],"pdf_url":"https://arxiv.org/pdf/2312.01713v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16450v2","updated":"2023-12-04T07:59:05Z","published":"2023-11-28T03:11:33Z","title":"Typhoon Intensity Prediction with Vision Transformer","summary":" Predicting typhoon intensity accurately across space and time is crucial for\nissuing timely disaster warnings and facilitating emergency response. This has\nvast potential for minimizing life losses and property damages as well as\nreducing economic and environmental impacts. Leveraging satellite imagery for\nscenario analysis is effective but also introduces additional challenges due to\nthe complex relations among clouds and the highly dynamic context. Existing\ndeep learning methods in this domain rely on convolutional neural networks\n(CNNs), which suffer from limited per-layer receptive fields. This limitation\nhinders their ability to capture long-range dependencies and global contextual\nknowledge during inference. In response, we introduce a novel approach, namely\n\"Typhoon Intensity Transformer\" (Tint), which leverages self-attention\nmechanisms with global receptive fields per layer. 
Tint adopts a\nsequence-to-sequence feature representation learning perspective. It begins by\ncutting a given satellite image into a sequence of patches and recursively\nemploys self-attention operations to extract both local and global contextual\nrelations between all patch pairs simultaneously, thereby enhancing per-patch\nfeature representation learning. Extensive experiments on a publicly available\ntyphoon benchmark validate the efficacy of Tint in comparison with both\nstate-of-the-art deep learning and conventional meteorological methods. Our\ncode is available at https://github.com/chen-huanxin/Tint.\n","authors":["Huanxin Chen","Pengshuai Yin","Huichou Huang","Qingyao Wu","Ruirui Liu","Xiatian Zhu"],"pdf_url":"https://arxiv.org/pdf/2311.16450v2.pdf","comment":"8 pages, 2 figures, accepted by Tackling Climate Change with Machine\n Learning: workshop at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.01711v1","updated":"2023-12-04T07:53:59Z","published":"2023-12-04T07:53:59Z","title":"Regressor-Segmenter Mutual Prompt Learning for Crowd Counting","summary":" Crowd counting has achieved significant progress by training regressors to\npredict instance positions. In heavily crowded scenarios, however, regressors\nare challenged by uncontrollable annotation variance, which causes density map\nbias and context information inaccuracy. In this study, we propose mutual\nprompt learning (mPrompt), which leverages a regressor and a segmenter as\nguidance for each other, solving bias and inaccuracy caused by annotation\nvariance while distinguishing foreground from background. In specific, mPrompt\nleverages point annotations to tune the segmenter and predict pseudo head masks\nin a way of point prompt learning. It then uses the predicted segmentation\nmasks, which serve as spatial constraint, to rectify biased point annotations\nas context prompt learning. mPrompt defines a way of mutual information\nmaximization from prompt learning, mitigating the impact of annotation variance\nwhile improving model accuracy. Experiments show that mPrompt significantly\nreduces the Mean Average Error (MAE), demonstrating the potential to be general\nframework for down-stream vision tasks.\n","authors":["Mingyue Guo","Li Yuan","Zhaoyi Yan","Binghui Chen","Yaowei Wang","Qixiang Ye"],"pdf_url":"https://arxiv.org/pdf/2312.01711v1.pdf","comment":"17 pages, 15 figures"},{"id":"http://arxiv.org/abs/2312.01701v1","updated":"2023-12-04T07:43:02Z","published":"2023-12-04T07:43:02Z","title":"Mitigating Fine-Grained Hallucination by Fine-Tuning Large\n Vision-Language Models with Caption Rewrites","summary":" Large language models (LLMs) have shown remarkable performance in natural\nlanguage processing (NLP) tasks. To comprehend and execute diverse human\ninstructions over image data, instruction-tuned large vision-language models\n(LVLMs) have been introduced. However, LVLMs may suffer from different types of\nobject hallucinations. Nevertheless, LVLMs are evaluated for coarse-grained\nobject hallucinations only (i.e., generated objects non-existent in the input\nimage). The fine-grained object attributes and behaviors non-existent in the\nimage may still be generated but not measured by the current evaluation\nmethods. In this paper, we thus focus on reducing fine-grained hallucinations\nof LVLMs. We propose \\textit{ReCaption}, a framework that consists of two\ncomponents: rewriting captions using ChatGPT and fine-tuning the\ninstruction-tuned LVLMs on the rewritten captions. 
We also propose a\nfine-grained probing-based evaluation method named \\textit{Fine-Grained Object\nHallucination Evaluation} (\\textit{FGHE}). Our experiment results demonstrate\nthat ReCaption effectively reduces fine-grained object hallucination for\ndifferent LVLM options and improves their text generation quality. The code can\nbe found at https://github.com/Anonymousanoy/FOHE.\n","authors":["Lei Wang","Jiabang He","Shenshen Li","Ning Liu","Ee-Peng Lim"],"pdf_url":"https://arxiv.org/pdf/2312.01701v1.pdf","comment":"MMM 2024"},{"id":"http://arxiv.org/abs/2312.01697v1","updated":"2023-12-04T07:36:04Z","published":"2023-12-04T07:36:04Z","title":"Hulk: A Universal Knowledge Translator for Human-Centric Tasks","summary":" Human-centric perception tasks, e.g., human mesh recovery, pedestrian\ndetection, skeleton-based action recognition, and pose estimation, have wide\nindustrial applications, such as metaverse and sports analysis. There is a\nrecent surge to develop human-centric foundation models that can benefit a\nbroad range of human-centric perception tasks. While many human-centric\nfoundation models have achieved success, most of them only excel in 2D vision\ntasks or require extensive fine-tuning for practical deployment in real-world\nscenarios. These limitations severely restrict their usability across various\ndownstream tasks and situations. To tackle these problems, we present Hulk, the\nfirst multimodal human-centric generalist model, capable of addressing most of\nthe mainstream tasks simultaneously without task-specific finetuning, covering\n2D vision, 3D vision, skeleton-based, and vision-language tasks. The key to\nachieving this is condensing various task-specific heads into two general\nheads, one for discrete representations, e.g., languages, and the other for\ncontinuous representations, e.g., location coordinates. The outputs of two\nheads can be further stacked into four distinct input and output modalities.\nThis uniform representation enables Hulk to treat human-centric tasks as\nmodality translation, integrating knowledge across a wide range of tasks. To\nvalidate the effectiveness of our proposed method, we conduct comprehensive\nexperiments on 11 benchmarks across 8 human-centric tasks. Experimental results\nsurpass previous methods substantially, demonstrating the superiority of our\nproposed method. The code will be available on\nhttps://github.com/OpenGVLab/HumanBench.\n","authors":["Yizhou Wang","Yixuan Wu","Shixiang Tang","Weizhen He","Xun Guo","Feng Zhu","Lei Bai","Rui Zhao","Jian Wu","Tong He","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2312.01697v1.pdf","comment":"24 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.01696v1","updated":"2023-12-04T07:35:02Z","published":"2023-12-04T07:35:02Z","title":"BEVNeXt: Reviving Dense BEV Frameworks for 3D Object Detection","summary":" Recently, the rise of query-based Transformer decoders is reshaping\ncamera-based 3D object detection. These query-based decoders are surpassing the\ntraditional dense BEV (Bird's Eye View)-based methods. However, we argue that\ndense BEV frameworks remain important due to their outstanding abilities in\ndepth estimation and object localization, depicting 3D scenes accurately and\ncomprehensively. 
This paper aims to address the drawbacks of the existing dense\nBEV-based 3D object detectors by introducing our proposed enhanced components,\nincluding a CRF-modulated depth estimation module enforcing object-level\nconsistencies, a long-term temporal aggregation module with extended receptive\nfields, and a two-stage object decoder combining perspective techniques with\nCRF-modulated depth embedding. These enhancements lead to a \"modernized\" dense\nBEV framework dubbed BEVNeXt. On the nuScenes benchmark, BEVNeXt outperforms\nboth BEV-based and query-based frameworks under various settings, achieving a\nstate-of-the-art result of 64.2 NDS on the nuScenes test set.\n","authors":["Zhenxin Li","Shiyi Lan","Jose M. Alvarez","Zuxuan Wu"],"pdf_url":"https://arxiv.org/pdf/2312.01696v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17245v2","updated":"2023-12-04T07:30:36Z","published":"2023-11-28T21:39:20Z","title":"LightGaussian: Unbounded 3D Gaussian Compression with 15x Reduction and\n 200+ FPS","summary":" Recent advancements in real-time neural rendering using point-based\ntechniques have paved the way for the widespread adoption of 3D\nrepresentations. However, foundational approaches like 3D Gaussian Splatting\ncome with a substantial storage overhead caused by growing the SfM points to\nmillions, often demanding gigabyte-level disk space for a single unbounded\nscene, posing significant scalability challenges and hindering the splatting\nefficiency.\n To address this challenge, we introduce LightGaussian, a novel method\ndesigned to transform 3D Gaussians into a more efficient and compact format.\nDrawing inspiration from the concept of Network Pruning, LightGaussian\nidentifies Gaussians that are insignificant in contributing to the scene\nreconstruction and adopts a pruning and recovery process, effectively reducing\nredundancy in Gaussian counts while preserving visual effects. Additionally,\nLightGaussian employs distillation and pseudo-view augmentation to distill\nspherical harmonics to a lower degree, allowing knowledge transfer to more\ncompact representations while maintaining reflectance. Furthermore, we propose\na hybrid scheme, VecTree Quantization, to quantize all attributes, resulting in\nlower bitwidth representations with minimal accuracy losses.\n In summary, LightGaussian achieves an averaged compression rate over 15x\nwhile boosting the FPS from 139 to 215, enabling an efficient representation of\ncomplex scenes on Mip-NeRF 360, Tank and Temple datasets.\n Project website: https://lightgaussian.github.io/\n","authors":["Zhiwen Fan","Kevin Wang","Kairun Wen","Zehao Zhu","Dejia Xu","Zhangyang Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17245v2.pdf","comment":"16pages, 8figures"},{"id":"http://arxiv.org/abs/2312.01689v1","updated":"2023-12-04T07:23:44Z","published":"2023-12-04T07:23:44Z","title":"Fast and accurate sparse-view CBCT reconstruction using meta-learned\n neural attenuation field and hash-encoding regularization","summary":" Cone beam computed tomography (CBCT) is an emerging medical imaging technique\nto visualize the internal anatomical structures of patients. During a CBCT\nscan, several projection images of different angles or views are collectively\nutilized to reconstruct a tomographic image. 
However, reducing the number of\nprojections in a CBCT scan while preserving the quality of a reconstructed\nimage is challenging due to the nature of an ill-posed inverse problem.\nRecently, a neural attenuation field (NAF) method was proposed by adopting a\nneural radiance field algorithm as a new way for CBCT reconstruction,\ndemonstrating fast and promising results using only 50 views. However,\ndecreasing the number of projections is still preferable to reduce potential\nradiation exposure, and a faster reconstruction time is required considering a\ntypical scan time. In this work, we propose a fast and accurate sparse-view\nCBCT reconstruction (FACT) method to provide better reconstruction quality and\nfaster optimization speed in the minimal number of view acquisitions ($<$ 50\nviews). In the FACT method, we meta-trained a neural network and a hash-encoder\nusing a few scans (= 15), and a new regularization technique is utilized to\nreconstruct the details of an anatomical structure. In conclusion, we have\nshown that the FACT method produced better, and faster reconstruction results\nover the other conventional algorithms based on CBCT scans of different body\nparts (chest, head, and abdomen) and CT vendors (Siemens, Phillips, and GE).\n","authors":["Heejun Shin","Taehee Kim","Jongho Lee","Seyoung Chun","Seungryung Cho","Dongmyung Shin"],"pdf_url":"https://arxiv.org/pdf/2312.01689v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01682v1","updated":"2023-12-04T07:14:20Z","published":"2023-12-04T07:14:20Z","title":"ResEnsemble-DDPM: Residual Denoising Diffusion Probabilistic Models for\n Ensemble Learning","summary":" Nowadays, denoising diffusion probabilistic models have been adapted for many\nimage segmentation tasks. However, existing end-to-end models have already\ndemonstrated remarkable capabilities. Rather than using denoising diffusion\nprobabilistic models alone, integrating the abilities of both denoising\ndiffusion probabilistic models and existing end-to-end models can better\nimprove the performance of image segmentation. Based on this, we implicitly\nintroduce residual term into the diffusion process and propose\nResEnsemble-DDPM, which seamlessly integrates the diffusion model and the\nend-to-end model through ensemble learning. The output distributions of these\ntwo models are strictly symmetric with respect to the ground truth\ndistribution, allowing us to integrate the two models by reducing the residual\nterm. Experimental results demonstrate that our ResEnsemble-DDPM can further\nimprove the capabilities of existing models. Furthermore, its ensemble learning\nstrategy can be generalized to other downstream tasks in image generation and\nget strong competitiveness.\n","authors":["Shi Zhenning","Dong Changsheng","Xie Xueshuo","Pan Bin","He Along","Li Tao"],"pdf_url":"https://arxiv.org/pdf/2312.01682v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00311v2","updated":"2023-12-04T07:12:48Z","published":"2023-12-01T03:05:21Z","title":"3D Face Reconstruction with the Geometric Guidance of Facial Part\n Segmentation","summary":" 3D Morphable Models (3DMMs) provide promising 3D face reconstructions in\nvarious applications. However, existing methods struggle to reconstruct faces\nwith extreme expressions due to deficiencies in supervisory signals, such as\nsparse or inaccurate landmarks. Segmentation information contains effective\ngeometric contexts for face reconstruction. 
Certain attempts intuitively depend\non differentiable renderers to compare the rendered silhouettes of\nreconstruction with segmentation, which is prone to issues like local optima\nand gradient instability. In this paper, we fully utilize the facial part\nsegmentation geometry by introducing Part Re-projection Distance Loss (PRDL).\nSpecifically, PRDL transforms facial part segmentation into 2D points and\nre-projects the reconstruction onto the image plane. Subsequently, by\nintroducing grid anchors and computing different statistical distances from\nthese anchors to the point sets, PRDL establishes geometry descriptors to\noptimize the distribution of the point sets for face reconstruction. PRDL\nexhibits a clear gradient compared to the renderer-based methods and presents\nstate-of-the-art reconstruction performance in extensive quantitative and\nqualitative experiments. The project will be publicly available.\n","authors":["Zidu Wang","Xiangyu Zhu","Tianshuo Zhang","Baiqin Wang","Zhen Lei"],"pdf_url":"https://arxiv.org/pdf/2312.00311v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01679v1","updated":"2023-12-04T07:04:20Z","published":"2023-12-04T07:04:20Z","title":"Adversarial Medical Image with Hierarchical Feature Hiding","summary":" Deep learning based methods for medical images can be easily compromised by\nadversarial examples (AEs), posing a great security flaw in clinical\ndecision-making. It has been discovered that conventional adversarial attacks\nlike PGD which optimize the classification logits, are easy to distinguish in\nthe feature space, resulting in accurate reactive defenses. To better\nunderstand this phenomenon and reassess the reliability of the reactive\ndefenses for medical AEs, we thoroughly investigate the characteristic of\nconventional medical AEs. Specifically, we first theoretically prove that\nconventional adversarial attacks change the outputs by continuously optimizing\nvulnerable features in a fixed direction, thereby leading to outlier\nrepresentations in the feature space. Then, a stress test is conducted to\nreveal the vulnerability of medical images, by comparing with natural images.\nInterestingly, this vulnerability is a double-edged sword, which can be\nexploited to hide AEs. We then propose a simple-yet-effective hierarchical\nfeature constraint (HFC), a novel add-on to conventional white-box attacks,\nwhich assists to hide the adversarial feature in the target feature\ndistribution. The proposed method is evaluated on three medical datasets, both\n2D and 3D, with different modalities. The experimental results demonstrate the\nsuperiority of HFC, \\emph{i.e.,} it bypasses an array of state-of-the-art\nadversarial medical AE detectors more efficiently than competing adaptive\nattacks, which reveals the deficiencies of medical reactive defense and allows\nto develop more robust defenses in future.\n","authors":["Qingsong Yao","Zecheng He","Yuexiang Li","Yi Lin","Kai Ma","Yefeng Zheng","S. 
Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.01679v1.pdf","comment":"Our code is available at\n \\url{https://github.com/qsyao/Hierarchical_Feature_Constraint}"},{"id":"http://arxiv.org/abs/2312.01677v1","updated":"2023-12-04T06:59:55Z","published":"2023-12-04T06:59:55Z","title":"Multi-task Image Restoration Guided By Robust DINO Features","summary":" Multi-task image restoration has gained significant interest due to its\ninherent versatility and efficiency compared to its single-task counterpart.\nDespite its potential, performance degradation is observed with an increase in\nthe number of tasks, primarily attributed to the distinct nature of each\nrestoration task. Addressing this challenge, we introduce\n\\mbox{\\textbf{DINO-IR}}, a novel multi-task image restoration approach\nleveraging robust features extracted from DINOv2. Our empirical analysis shows\nthat while shallow features of DINOv2 capture rich low-level image\ncharacteristics, the deep features ensure a robust semantic representation\ninsensitive to degradations while preserving high-frequency contour details.\nBuilding on these features, we devise specialized components, including\nmulti-layer semantic fusion module, DINO-Restore adaption and fusion module,\nand DINO perception contrastive loss, to integrate DINOv2 features into the\nrestoration paradigm. Equipped with the aforementioned components, our DINO-IR\nperforms favorably against existing multi-task image restoration approaches in\nvarious tasks by a large margin, indicating the superiority and necessity of\nreinforcing the robust features for multi-task image restoration.\n","authors":["Xin Lin","Chao Ren","Kelvin C. K. Chan","Lu Qi","Jinshan Pan","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2312.01677v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01671v1","updated":"2023-12-04T06:38:23Z","published":"2023-12-04T06:38:23Z","title":"Multimodality-guided Image Style Transfer using Cross-modal GAN\n Inversion","summary":" Image Style Transfer (IST) is an interdisciplinary topic of computer vision\nand art that continuously attracts researchers' interests. Different from\ntraditional Image-guided Image Style Transfer (IIST) methods that require a\nstyle reference image as input to define the desired style, recent works start\nto tackle the problem in a text-guided manner, i.e., Text-guided Image Style\nTransfer (TIST). Compared to IIST, such approaches provide more flexibility\nwith text-specified styles, which are useful in scenarios where the style is\nhard to define with reference images. Unfortunately, many TIST approaches\nproduce undesirable artifacts in the transferred images. To address this issue,\nwe present a novel method to achieve much improved style transfer based on text\nguidance. Meanwhile, to offer more flexibility than IIST and TIST, our method\nallows style inputs from multiple sources and modalities, enabling\nMultiModality-guided Image Style Transfer (MMIST). Specifically, we realize\nMMIST with a novel cross-modal GAN inversion method, which generates style\nrepresentations consistent with specified styles. Such style representations\nfacilitate style transfer and in principle generalize any IIST methods to\nMMIST. Large-scale experiments and user studies demonstrate that our method\nachieves state-of-the-art performance on TIST task. 
Furthermore, comprehensive\nqualitative results confirm the effectiveness of our method on MMIST task and\ncross-modal style interpolation.\n","authors":["Hanyu Wang","Pengxiang Wu","Kevin Dela Rosa","Chen Wang","Abhinav Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2312.01671v1.pdf","comment":"WACV 2024. Project website: https://hywang66.github.io/mmist/"},{"id":"http://arxiv.org/abs/2312.01663v1","updated":"2023-12-04T06:25:06Z","published":"2023-12-04T06:25:06Z","title":"Customize your NeRF: Adaptive Source Driven 3D Scene Editing via\n Local-Global Iterative Training","summary":" In this paper, we target the adaptive source driven 3D scene editing task by\nproposing a CustomNeRF model that unifies a text description or a reference\nimage as the editing prompt. However, obtaining desired editing results\nconformed with the editing prompt is nontrivial since there exist two\nsignificant challenges, including accurate editing of only foreground regions\nand multi-view consistency given a single-view reference image. To tackle the\nfirst challenge, we propose a Local-Global Iterative Editing (LGIE) training\nscheme that alternates between foreground region editing and full-image\nediting, aimed at foreground-only manipulation while preserving the background.\nFor the second challenge, we also design a class-guided regularization that\nexploits class priors within the generation model to alleviate the\ninconsistency problem among different views in image-driven editing. Extensive\nexperiments show that our CustomNeRF produces precise editing results under\nvarious real scenes for both text- and image-driven settings.\n","authors":["Runze He","Shaofei Huang","Xuecheng Nie","Tianrui Hui","Luoqi Liu","Jiao Dai","Jizhong Han","Guanbin Li","Si Liu"],"pdf_url":"https://arxiv.org/pdf/2312.01663v1.pdf","comment":"14 pages, 13 figures, project website: https://customnerf.github.io/"},{"id":"http://arxiv.org/abs/2312.01659v1","updated":"2023-12-04T06:21:22Z","published":"2023-12-04T06:21:22Z","title":"RiskBench: A Scenario-based Benchmark for Risk Identification","summary":" Intelligent driving systems aim to achieve a zero-collision mobility\nexperience, requiring interdisciplinary efforts to enhance safety performance.\nThis work focuses on risk identification, the process of identifying and\nanalyzing risks stemming from dynamic traffic participants and unexpected\nevents. While significant advances have been made in the community, the current\nevaluation of different risk identification algorithms uses independent\ndatasets, leading to difficulty in direct comparison and hindering collective\nprogress toward safety performance enhancement. To address this limitation, we\nintroduce \\textbf{RiskBench}, a large-scale scenario-based benchmark for risk\nidentification. We design a scenario taxonomy and augmentation pipeline to\nenable a systematic collection of ground truth risks under diverse scenarios.\nWe assess the ability of ten algorithms to (1) detect and locate risks, (2)\nanticipate risks, and (3) facilitate decision-making. We conduct extensive\nexperiments and summarize future research on risk identification. 
Our aim is to\nencourage collaborative endeavors in achieving a society with zero collisions.\nWe have made our dataset and benchmark toolkit publicly on the project page:\nhttps://hcis-lab.github.io/RiskBench/\n","authors":["Chi-Hsi Kung","Chieh-Chi Yang","Pang-Yuan Pao","Shu-Wei Lu","Pin-Lun Chen","Hsin-Cheng Lu","Yi-Ting Chen"],"pdf_url":"https://arxiv.org/pdf/2312.01659v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01656v1","updated":"2023-12-04T06:14:25Z","published":"2023-12-04T06:14:25Z","title":"The Contemporary Art of Image Search: Iterative User Intent Expansion\n via Vision-Language Model","summary":" Image search is an essential and user-friendly method to explore vast\ngalleries of digital images. However, existing image search methods heavily\nrely on proximity measurements like tag matching or image similarity, requiring\nprecise user inputs for satisfactory results.To meet the growing demand for a\ncontemporary image search engine that enables accurate comprehension of users'\nsearch intentions, we introduce an innovative user intent expansion framework.\nOur framework leverages visual-language models to parse and compose multi-modal\nuser inputs to provide more accurate and satisfying results. It comprises\ntwo-stage processes: 1) a parsing stage that incorporates a language parsing\nmodule with large language models to enhance the comprehension of textual\ninputs, along with a visual parsing module that integrates an interactive\nsegmentation module to swiftly identify detailed visual elements within images;\nand 2) a logic composition stage that combines multiple user search intents\ninto a unified logic expression for more sophisticated operations in complex\nsearching scenarios. Moreover, the intent expansion framework enables users to\nperform flexible contextualized interactions with the search results to further\nspecify or adjust their detailed search intents iteratively. We implemented the\nframework into an image search system for NFT (non-fungible token) search and\nconducted a user study to evaluate its usability and novel properties. The\nresults indicate that the proposed framework significantly improves users'\nimage search experience. Particularly the parsing and contextualized\ninteractions prove useful in allowing users to express their search intents\nmore accurately and engage in a more enjoyable iterative search experience.\n","authors":["Yilin Ye","Qian Zhu","Shishi Xiao","Kang Zhang","Wei Zeng"],"pdf_url":"https://arxiv.org/pdf/2312.01656v1.pdf","comment":"Accepted by The 2024 ACM SIGCHI Conference on Computer-Supported\n Cooperative Work & Social Computing (CSCW) (Proc. CSCW 2024)"},{"id":"http://arxiv.org/abs/2312.01650v1","updated":"2023-12-04T06:01:46Z","published":"2023-12-04T06:01:46Z","title":"Adaptive Confidence Threshold for ByteTrack in Multi-Object Tracking","summary":" We investigate the application of ByteTrack in the realm of multiple object\ntracking. ByteTrack, a simple tracking algorithm, enables the simultaneous\ntracking of multiple objects by strategically incorporating detections with a\nlow confidence threshold. Conventionally, objects are initially associated with\nhigh confidence threshold detections. When the association between objects and\ndetections becomes ambiguous, ByteTrack extends the association to lower\nconfidence threshold detections. One notable drawback of the existing ByteTrack\napproach is its reliance on a fixed threshold to differentiate between high and\nlow-confidence detections. 
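For reference, a minimal sketch of the fixed high/low confidence split that this drawback refers to, together with one hypothetical way a threshold could instead be derived from a frame's overall detections; the constants and the `adaptive_threshold` heuristic are illustrative assumptions, not the paper's method.

```python
# Fixed-threshold split used by standard ByteTrack-style association (sketch):
# tracks are matched to high-confidence detections first, then to the rest.
def split_detections(detections, high_thresh=0.6, low_thresh=0.1):
    """detections: list of (box, score) tuples for one frame."""
    high = [d for d in detections if d[1] >= high_thresh]
    low = [d for d in detections if low_thresh <= d[1] < high_thresh]
    return high, low

def adaptive_threshold(detections, floor=0.4):
    """One hypothetical way to adapt the split point from overall detections."""
    scores = [score for _, score in detections]
    if not scores:
        return floor
    return max(floor, sum(scores) / len(scores))  # never drop below a safety floor
```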
In response to this limitation, we introduce a novel\nand adaptive approach. Our proposed method entails a dynamic adjustment of the\nconfidence threshold, leveraging insights derived from overall detections.\nThrough experimentation, we demonstrate the effectiveness of our adaptive\nconfidence threshold technique while maintaining running time compared to\nByteTrack.\n","authors":["Linh Van Ma","Muhammad Ishfaq Hussain","JongHyun Park","Jeongbae Kim","Moongu Jeon"],"pdf_url":"https://arxiv.org/pdf/2312.01650v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01644v1","updated":"2023-12-04T05:52:49Z","published":"2023-12-04T05:52:49Z","title":"TMSR: Tiny Multi-path CNNs for Super Resolution","summary":" In this paper, we proposed a tiny multi-path CNN-based Super-Resolution (SR)\nmethod, called TMSR. We mainly refer to some tiny CNN-based SR methods, under\n5k parameters. The main contribution of the proposed method is the improved\nmulti-path learning and self-defined activated function. The experimental\nresults show that TMSR obtains competitive image quality (i.e. PSNR and SSIM)\ncompared to the related works under 5k parameters.\n","authors":["Chia-Hung Liu","Tzu-Hsin Hsieh","Kuan-Yu Huang","Pei-Yin Chen"],"pdf_url":"https://arxiv.org/pdf/2312.01644v1.pdf","comment":"5 pages, 7 figures, published in the IEEE Eurasia Conference on IoT,\n Communication and Engineering proceedings 2023"},{"id":"http://arxiv.org/abs/2312.01640v1","updated":"2023-12-04T05:42:56Z","published":"2023-12-04T05:42:56Z","title":"SequencePAR: Understanding Pedestrian Attributes via A Sequence\n Generation Paradigm","summary":" Current pedestrian attribute recognition (PAR) algorithms are developed based\non multi-label or multi-task learning frameworks, which aim to discriminate the\nattributes using specific classification heads. However, these discriminative\nmodels are easily influenced by imbalanced data or noisy samples. Inspired by\nthe success of generative models, we rethink the pedestrian attribute\nrecognition scheme and believe the generative models may perform better on\nmodeling dependencies and complexity between human attributes. In this paper,\nwe propose a novel sequence generation paradigm for pedestrian attribute\nrecognition, termed SequencePAR. It extracts the pedestrian features using a\npre-trained CLIP model and embeds the attribute set into query tokens under the\nguidance of text prompts. Then, a Transformer decoder is proposed to generate\nthe human attributes by incorporating the visual features and attribute query\ntokens. The masked multi-head attention layer is introduced into the decoder\nmodule to prevent the model from remembering the next attribute while making\nattribute predictions during training. Extensive experiments on multiple widely\nused pedestrian attribute recognition datasets fully validated the\neffectiveness of our proposed SequencePAR. 
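A rough PyTorch sketch of the generation scheme summarized above for SequencePAR: attribute query tokens decoded against frozen CLIP visual features under a causal mask, so a prediction cannot peek at later attributes. Module names, dimensions, and the BOS-token convention are assumptions, not the released implementation.

```python
import torch
import torch.nn as nn

class AttributeSeqDecoder(nn.Module):
    def __init__(self, num_attrs: int, d_model: int = 512, n_layers: int = 3):
        super().__init__()
        self.attr_embed = nn.Embedding(num_attrs + 1, d_model)  # +1 for a BOS token
        layer = nn.TransformerDecoderLayer(d_model, nhead=8, batch_first=True)
        self.decoder = nn.TransformerDecoder(layer, num_layers=n_layers)
        self.head = nn.Linear(d_model, num_attrs)

    def forward(self, attr_ids, visual_tokens):
        # attr_ids: (B, T) attribute token ids generated so far (BOS first)
        # visual_tokens: (B, N, d_model) patch features from a frozen CLIP encoder
        T = attr_ids.size(1)
        causal = torch.triu(
            torch.full((T, T), float("-inf"), device=attr_ids.device), diagonal=1
        )  # masked attention: position t cannot attend to later attributes
        hidden = self.decoder(self.attr_embed(attr_ids), visual_tokens, tgt_mask=causal)
        return self.head(hidden)  # (B, T, num_attrs) logits for the next attribute
```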
The source code and pre-trained\nmodels will be released at https://github.com/Event-AHU/OpenPAR.\n","authors":["Jiandong Jin","Xiao Wang","Chenglong Li","Lili Huang","Jin Tang"],"pdf_url":"https://arxiv.org/pdf/2312.01640v1.pdf","comment":"In Peer Review"},{"id":"http://arxiv.org/abs/2306.07581v2","updated":"2023-12-04T05:40:20Z","published":"2023-06-13T07:11:10Z","title":"Binary Radiance Fields","summary":" In this paper, we propose \\textit{binary radiance fields} (BiRF), a\nstorage-efficient radiance field representation employing binary feature\nencoding that encodes local features using binary encoding parameters in a\nformat of either $+1$ or $-1$. This binarization strategy lets us represent the\nfeature grid with highly compact feature encoding and a dramatic reduction in\nstorage size. Furthermore, our 2D-3D hybrid feature grid design enhances the\ncompactness of feature encoding as the 3D grid includes main components while\n2D grids capture details. In our experiments, binary radiance field\nrepresentation successfully outperforms the reconstruction performance of\nstate-of-the-art (SOTA) efficient radiance field models with lower storage\nallocation. In particular, our model achieves impressive results in static\nscene reconstruction, with a PSNR of 32.03 dB for Synthetic-NeRF scenes, 34.48\ndB for Synthetic-NSVF scenes, 28.20 dB for Tanks and Temples scenes while only\nutilizing 0.5 MB of storage space, respectively. We hope the proposed binary\nradiance field representation will make radiance fields more accessible without\na storage bottleneck.\n","authors":["Seungjoo Shin","Jaesik Park"],"pdf_url":"https://arxiv.org/pdf/2306.07581v2.pdf","comment":"Accepted to NeurIPS 2023. Project page:\n https://seungjooshin.github.io/BiRF"},{"id":"http://arxiv.org/abs/2312.01638v1","updated":"2023-12-04T05:39:51Z","published":"2023-12-04T05:39:51Z","title":"J-Net: Improved U-Net for Terahertz Image Super-Resolution","summary":" Terahertz (THz) waves are electromagnetic waves in the 0.1 to 10 THz\nfrequency range, and THz imaging is utilized in a range of applications,\nincluding security inspections, biomedical fields, and the non-destructive\nexamination of materials. However, THz images have low resolution due to the\nlong wavelength of THz waves. Therefore, improving the resolution of THz images\nis one of the current hot research topics. We propose a novel network\narchitecture called J-Net which is improved version of U-Net to solve the THz\nimage super-resolution. It employs the simple baseline blocks which can extract\nlow resolution (LR) image features and learn the mapping of LR images to\nhighresolution (HR) images efficiently. All training was conducted using the\nDIV2K+Flickr2K dataset, and we employed the peak signal-to-noise ratio (PSNR)\nfor quantitative comparison. In our comparisons with other THz image\nsuper-resolution methods, JNet achieved a PSNR of 32.52 dB, surpassing other\ntechniques by more than 1 dB. J-Net also demonstrates superior performance on\nreal THz images compared to other methods. 
Experiments show that the proposed\nJ-Net achieves better PSNR and visual improvement compared with other THz image\nsuper-resolution methods.\n","authors":["Woon-Ha Yeo","Seung-Hwan Jung","Seung Jae Oh","Inhee Maeng","Eui Su Lee","Han-Cheol Ryu"],"pdf_url":"https://arxiv.org/pdf/2312.01638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01632v1","updated":"2023-12-04T05:24:45Z","published":"2023-12-04T05:24:45Z","title":"GaussianHead: Impressive 3D Gaussian-based Head Avatars with Dynamic\n Hybrid Neural Field","summary":" Previous head avatar methods have mostly relied on fixed explicit primitives\n(mesh, point) or implicit surfaces (Signed Distance Function) and volumetric\nneural radiance fields, making it challenging to strike a balance among high fidelity,\ntraining speed, and resource consumption. The recent popularity of hybrid fields\nhas brought novel representations, but they are limited by relying on parameterization\nfactors obtained through fixed mappings. We propose GaussianHead: a head\navatar algorithm based on anisotropic 3D gaussian primitives. We leverage\ncanonical gaussians to represent dynamic scenes. Using an explicit \"dynamic\"\ntri-plane as an efficient container for parameterized head geometry, well aligned\nwith the factors in the underlying geometry and the tri-plane, we obtain aligned\ncanonical factors for the canonical gaussians. With a tiny MLP, factors are\ndecoded into opacity and spherical harmonic coefficients of 3D gaussian\nprimitives. Finally, we use an efficient differentiable gaussian rasterizer for\nrendering. Our approach benefits significantly from our novel representation\nbased on 3D gaussians, and the proper alignment transformation of underlying\ngeometry structures and factors in the tri-plane eliminates biases introduced by\nfixed mappings. Compared to state-of-the-art techniques, we achieve optimal\nvisual results in tasks such as self-reconstruction, novel view synthesis, and\ncross-identity reenactment while maintaining high rendering efficiency (0.12s\nper frame). Even the pores around the nose are clearly visible in some cases.\nCode and additional video can be found on the project homepage.\n","authors":["Jie Wang","Xianyan Li","Jiucheng Xie","Feng Xu","Hao Gao"],"pdf_url":"https://arxiv.org/pdf/2312.01632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01629v1","updated":"2023-12-04T05:13:59Z","published":"2023-12-04T05:13:59Z","title":"CLAMP: Contrastive LAnguage Model Prompt-tuning","summary":" Large language models (LLMs) have emerged as powerful general-purpose\ninterfaces for many machine learning problems. Recent work has adapted LLMs to\ngenerative visual tasks like image captioning, visual question answering, and\nvisual chat, using a relatively small amount of instruction-tuning data. In\nthis paper, we explore whether modern LLMs can also be adapted to classifying\nan image into a set of categories. First, we evaluate multimodal LLMs that are\ntuned for generative tasks on zero-shot image classification and find that\ntheir performance is far below that of specialized models like CLIP. We then\npropose an approach for light fine-tuning of LLMs using the same contrastive\nimage-caption matching objective as CLIP. Our results show that LLMs can,\nindeed, achieve good image classification performance when adapted this way.\nOur approach beats state-of-the-art mLLMs by 13% and slightly outperforms\ncontrastive learning with a custom text model, while also retaining the LLM's\ngenerative abilities. 
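The "contrastive image-caption matching objective" referenced above is the standard CLIP loss; a self-contained sketch follows, with the temperature value and tensor shapes chosen for illustration (CLAMP's exact training setup may differ).

```python
import torch
import torch.nn.functional as F

def clip_contrastive_loss(image_emb, text_emb, temperature: float = 0.07):
    # image_emb, text_emb: (B, D) pooled embeddings from the vision tower and
    # the (LLM-based) text tower for B matched image-caption pairs.
    image_emb = F.normalize(image_emb, dim=-1)
    text_emb = F.normalize(text_emb, dim=-1)
    logits = image_emb @ text_emb.t() / temperature            # (B, B) similarities
    targets = torch.arange(logits.size(0), device=logits.device)
    # Symmetric cross-entropy: each image matches its own caption and vice versa.
    return 0.5 * (F.cross_entropy(logits, targets) + F.cross_entropy(logits.t(), targets))
```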
LLM initialization appears to particularly help\nclassification in domains under-represented in the visual pre-training data.\n","authors":["Piotr Teterwak","Ximeng Sun","Bryan A. Plummer","Kate Saenko","Ser-Nam Lim"],"pdf_url":"https://arxiv.org/pdf/2312.01629v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18605v2","updated":"2023-12-04T05:05:03Z","published":"2023-11-30T15:02:13Z","title":"Learning Triangular Distribution in Visual World","summary":" Convolution neural network is successful in pervasive vision tasks, including\nlabel distribution learning, which usually takes the form of learning an\ninjection from the non-linear visual features to the well-defined labels.\nHowever, how the discrepancy between features is mapped to the label\ndiscrepancy is ambient, and its correctness is not guaranteed. To address these\nproblems, we study the mathematical connection between feature and its label,\npresenting a general and simple framework for label distribution learning. We\npropose a so-called Triangular Distribution Transform (TDT) to build an\ninjective function between feature and label, guaranteeing that any symmetric\nfeature discrepancy linearly reflects the difference between labels. The\nproposed TDT can be used as a plug-in in mainstream backbone networks to\naddress different label distribution learning tasks. Experiments on Facial Age\nRecognition, Illumination Chromaticity Estimation, and Aesthetics assessment\nshow that TDT achieves on-par or better results than the prior arts.\n","authors":["Ping Chen","Xingpeng Zhang","Chengtao Zhou","Dichao Fan","Peng Tu","Le Zhang","Yanlin Qian"],"pdf_url":"https://arxiv.org/pdf/2311.18605v2.pdf","comment":"11 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.01623v1","updated":"2023-12-04T04:47:48Z","published":"2023-12-04T04:47:48Z","title":"Universal Segmentation at Arbitrary Granularity with Language\n Instruction","summary":" This paper aims to achieve universal segmentation of arbitrary semantic\nlevel. Despite significant progress in recent years, specialist segmentation\napproaches are limited to specific tasks and data distribution. Retraining a\nnew model for adaptation to new scenarios or settings takes expensive\ncomputation and time cost, which raises the demand for versatile and universal\nsegmentation model that can cater to various granularity. Although some\nattempts have been made for unifying different segmentation tasks or\ngeneralization to various scenarios, limitations in the definition of paradigms\nand input-output spaces make it difficult for them to achieve accurate\nunderstanding of content at arbitrary granularity. To this end, we present\nUniLSeg, a universal segmentation model that can perform segmentation at any\nsemantic level with the guidance of language instructions. For training\nUniLSeg, we reorganize a group of tasks from original diverse distributions\ninto a unified data format, where images with texts describing segmentation\ntargets as input and corresponding masks are output. 
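Purely as an illustration of the unified data format just described (an image plus a text describing the segmentation target in, a mask out), one possible record layout is sketched below; the field names are assumptions, not the authors' schema.

```python
from dataclasses import dataclass
import numpy as np

@dataclass
class UnifiedSegSample:
    image: np.ndarray        # (H, W, 3) RGB image
    instruction: str         # language description of what to segment
    mask: np.ndarray         # (H, W) binary target mask
    source_task: str = ""    # e.g., "referring-seg", "semantic-seg", "saliency"
```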
Combined with a automatic\nannotation engine for utilizing numerous unlabeled data, UniLSeg achieves\nexcellent performance on various tasks and settings, surpassing both specialist\nand unified segmentation models.\n","authors":["Yong Liu","Cairong Zhang","Yitong Wang","Jiahao Wang","Yujiu Yang","Yansong Tang"],"pdf_url":"https://arxiv.org/pdf/2312.01623v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15712v2","updated":"2023-12-04T04:38:55Z","published":"2023-05-25T04:49:34Z","title":"Knowledge Diffusion for Distillation","summary":" The representation gap between teacher and student is an emerging topic in\nknowledge distillation (KD). To reduce the gap and improve the performance,\ncurrent methods often resort to complicated training schemes, loss functions,\nand feature alignments, which are task-specific and feature-specific. In this\npaper, we state that the essence of these methods is to discard the noisy\ninformation and distill the valuable information in the feature, and propose a\nnovel KD method dubbed DiffKD, to explicitly denoise and match features using\ndiffusion models. Our approach is based on the observation that student\nfeatures typically contain more noises than teacher features due to the smaller\ncapacity of student model. To address this, we propose to denoise student\nfeatures using a diffusion model trained by teacher features. This allows us to\nperform better distillation between the refined clean feature and teacher\nfeature. Additionally, we introduce a light-weight diffusion model with a\nlinear autoencoder to reduce the computation cost and an adaptive noise\nmatching module to improve the denoising performance. Extensive experiments\ndemonstrate that DiffKD is effective across various types of features and\nachieves state-of-the-art performance consistently on image classification,\nobject detection, and semantic segmentation tasks. Code is available at\nhttps://github.com/hunto/DiffKD.\n","authors":["Tao Huang","Yuan Zhang","Mingkai Zheng","Shan You","Fei Wang","Chen Qian","Chang Xu"],"pdf_url":"https://arxiv.org/pdf/2305.15712v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.00360v2","updated":"2023-12-04T04:38:17Z","published":"2023-12-01T05:50:44Z","title":"Efficient Multimodal Semantic Segmentation via Dual-Prompt Learning","summary":" Multimodal (e.g., RGB-Depth/RGB-Thermal) fusion has shown great potential for\nimproving semantic segmentation in complex scenes (e.g., indoor/low-light\nconditions). Existing approaches often fully fine-tune a dual-branch\nencoder-decoder framework with a complicated feature fusion strategy for\nachieving multimodal semantic segmentation, which is training-costly due to the\nmassive parameter updates in feature extraction and fusion. To address this\nissue, we propose a surprisingly simple yet effective dual-prompt learning\nnetwork (dubbed DPLNet) for training-efficient multimodal (e.g., RGB-D/T)\nsemantic segmentation. The core of DPLNet is to directly adapt a frozen\npre-trained RGB model to multimodal semantic segmentation, reducing parameter\nupdates. For this purpose, we present two prompt learning modules, comprising\nmultimodal prompt generator (MPG) and multimodal feature adapter (MFA). 
MPG\nworks to fuse the features from different modalities in a compact manner and is\ninserted from shallow to deep stages to generate the multi-level multimodal\nprompts that are injected into the frozen backbone, while MFA adapts the prompted\nmultimodal features in the frozen backbone for better multimodal semantic\nsegmentation. Since both the MPG and MFA are lightweight, only a few trainable\nparameters (3.88M, 4.4% of the pre-trained backbone parameters) are introduced\nfor multimodal feature fusion and learning. Using a simple decoder (3.27M\nparameters), DPLNet achieves new state-of-the-art performance or is on a par\nwith other complex approaches on four RGB-D/T semantic segmentation datasets\nwhile satisfying parameter efficiency. Moreover, we show that DPLNet is general\nand applicable to other multimodal tasks such as salient object detection and\nvideo semantic segmentation. Without special design, DPLNet outperforms many\ncomplicated models. Our code will be available at\ngithub.com/ShaohuaDong2021/DPLNet.\n","authors":["Shaohua Dong","Yunhe Feng","Qing Yang","Yan Huang","Dongfang Liu","Heng Fan"],"pdf_url":"https://arxiv.org/pdf/2312.00360v2.pdf","comment":"11 pages, 4 figures, 9 tables"},{"id":"http://arxiv.org/abs/2312.01616v1","updated":"2023-12-04T04:14:09Z","published":"2023-12-04T04:14:09Z","title":"SchurVINS: Schur Complement-Based Lightweight Visual Inertial Navigation\n System","summary":" Accuracy and computational efficiency are the most important metrics for a\nVisual Inertial Navigation System (VINS). Existing VINS algorithms achieve either\nhigh accuracy or low computational complexity, but struggle to provide\nhigh-precision localization on resource-constrained devices. To this end,\nwe propose a novel filter-based VINS framework named SchurVINS, which could\nguarantee both high accuracy, by building a complete residual model, and low\ncomputational complexity, with the Schur complement. Technically, we first formulate\nthe full residual model where Gradient, Hessian and observation covariance are\nexplicitly modeled. Then the Schur complement is employed to decompose the full\nmodel into an ego-motion residual model and a landmark residual model. Finally,\nthe Extended Kalman Filter (EKF) update is implemented in these two models with\nhigh efficiency. Experiments on EuRoC and TUM-VI datasets show that our method\nnotably outperforms state-of-the-art (SOTA) methods in both accuracy and\ncomputational complexity. We will open source our experimental code to benefit\nthe community.\n","authors":["Yunfei Fan","Tianyu Zhao","Guidong Wang"],"pdf_url":"https://arxiv.org/pdf/2312.01616v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.15020v2","updated":"2023-12-04T03:48:06Z","published":"2023-10-23T15:15:19Z","title":"Invariance is Key to Generalization: Examining the Role of\n Representation in Sim-to-Real Transfer for Visual Navigation","summary":" The data-driven approach to robot control has been gathering pace rapidly,\nyet generalization to unseen task domains remains a critical challenge. We\nargue that the key to generalization is representations that are (i) rich\nenough to capture all task-relevant information and (ii) invariant to\nsuperfluous variability between the training and the test domains. 
We\nexperimentally study such a representation -- containing both depth and\nsemantic information -- for visual navigation and show that it enables a\ncontrol policy trained entirely in simulated indoor scenes to generalize to\ndiverse real-world environments, both indoors and outdoors. Further, we show\nthat our representation reduces the A-distance between the training and test\ndomains, improving the generalization error bound as a result. Our proposed\napproach is scalable: the learned policy improves continuously, as the\nfoundation models that it exploits absorb more diverse data during\npre-training.\n","authors":["Bo Ai","Zhanxin Wu","David Hsu"],"pdf_url":"https://arxiv.org/pdf/2310.15020v2.pdf","comment":"11 pages, accepted by the 18th International Symposium on\n Experimental Robotics (ISER 2023) and published within the Springer\n Proceedings in Advanced Robotics (SPAR)"},{"id":"http://arxiv.org/abs/2312.01605v1","updated":"2023-12-04T03:38:04Z","published":"2023-12-04T03:38:04Z","title":"TextAug: Test time Text Augmentation for Multimodal Person\n Re-identification","summary":" Multimodal Person Reidentification is gaining popularity in the research\ncommunity due to its effectiveness compared to counter-part unimodal\nframeworks. However, the bottleneck for multimodal deep learning is the need\nfor a large volume of multimodal training examples. Data augmentation\ntechniques such as cropping, flipping, rotation, etc. are often employed in the\nimage domain to improve the generalization of deep learning models. Augmenting\nin other modalities than images, such as text, is challenging and requires\nsignificant computational resources and external data sources. In this study,\nwe investigate the effectiveness of two computer vision data augmentation\ntechniques: cutout and cutmix, for text augmentation in multi-modal person\nre-identification. Our approach merges these two augmentation strategies into\none strategy called CutMixOut which involves randomly removing words or\nsub-phrases from a sentence (Cutout) and blending parts of two or more\nsentences to create diverse examples (CutMix) with a certain probability\nassigned to each operation. This augmentation was implemented at inference time\nwithout any prior training. Our results demonstrate that the proposed technique\nis simple and effective in improving the performance on multiple multimodal\nperson re-identification benchmarks.\n","authors":["Mulham Fawakherji","Eduard Vazquez","Pasquale Giampa","Binod Bhattarai"],"pdf_url":"https://arxiv.org/pdf/2312.01605v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.01598v1","updated":"2023-12-04T03:18:51Z","published":"2023-12-04T03:18:51Z","title":"Good Questions Help Zero-Shot Image Reasoning","summary":" Aligning the recent large language models (LLMs) with computer vision models\nleads to large vision-language models (LVLMs), which have paved the way for\nzero-shot image reasoning tasks. However, LVLMs are usually trained on short\nhigh-level captions only referring to sparse focus regions in images. Such a\n``tunnel vision'' limits LVLMs to exploring other relevant contexts in complex\nscenes. To address this challenge, we introduce Question-Driven Visual\nExploration (QVix), a novel prompting strategy that enhances the exploratory\ncapabilities of LVLMs in zero-shot reasoning tasks. 
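Looking back at the TextAug abstract above, its CutMixOut augmentation is simple enough to sketch directly: randomly drop words (Cutout) or splice parts of two sentences (CutMix), each applied with some probability at inference time. The probabilities and split-point choices below are illustrative, not the paper's exact settings.

```python
import random

def cutout(sentence: str, drop_prob: float = 0.15) -> str:
    """Randomly remove words from a sentence."""
    words = sentence.split()
    kept = [w for w in words if random.random() > drop_prob]
    return " ".join(kept) if kept else sentence

def cutmix(sentence_a: str, sentence_b: str) -> str:
    """Blend the prefix of one sentence with the suffix of another."""
    a, b = sentence_a.split(), sentence_b.split()
    cut_a = random.randint(1, max(1, len(a) - 1))
    cut_b = random.randint(0, max(0, len(b) - 1))
    return " ".join(a[:cut_a] + b[cut_b:])

def cutmixout(sentence: str, other: str, p_cutout: float = 0.5, p_cutmix: float = 0.5) -> str:
    """Apply Cutout and/or CutMix, each with its own probability."""
    out = sentence
    if random.random() < p_cutout:
        out = cutout(out)
    if random.random() < p_cutmix:
        out = cutmix(out, other)
    return out
```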
QVix leverages LLMs' strong\nlanguage prior to generate input-exploratory questions with more details than\nthe original query, guiding LVLMs to explore visual content more\ncomprehensively and uncover subtle or peripheral details. QVix enables a wider\nexploration of visual scenes, improving the LVLMs' reasoning accuracy and depth\nin tasks such as visual question answering and visual entailment. Our\nevaluations on various challenging zero-shot vision-language benchmarks,\nincluding ScienceQA and fine-grained visual classification, demonstrate that\nQVix significantly outperforms existing methods, highlighting its effectiveness\nin bridging the gap between complex visual data and LVLMs' exploratory\nabilities.\n","authors":["Kaiwen Yang","Tao Shen","Xinmei Tian","Xiubo Geng","Chongyang Tao","Dacheng Tao","Tianyi Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.01598v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01597v1","updated":"2023-12-04T03:18:46Z","published":"2023-12-04T03:18:46Z","title":"SCLIP: Rethinking Self-Attention for Dense Vision-Language Inference","summary":" Recent advances in contrastive language-image pretraining (CLIP) have\ndemonstrated strong capabilities in zero-shot classification by aligning visual\nrepresentations with target text embeddings in an image level. However, in\ndense prediction tasks, CLIP often struggles to localize visual features within\nan image and fails to give accurate pixel-level predictions, which prevents it\nfrom functioning as a generalized visual foundation model. In this work, we aim\nto enhance CLIP's potential for semantic segmentation with minimal\nmodifications to its pretrained models. By rethinking self-attention, we\nsurprisingly find that CLIP can adapt to dense prediction tasks by simply\nintroducing a novel Correlative Self-Attention (CSA) mechanism. Specifically,\nwe replace the traditional self-attention block of CLIP vision encoder's last\nlayer by our CSA module and reuse its pretrained projection matrices of query,\nkey, and value, leading to a training-free adaptation approach for CLIP's\nzero-shot semantic segmentation. Extensive experiments show the advantage of\nCSA: we obtain a 38.2% average zero-shot mIoU across eight semantic\nsegmentation benchmarks highlighted in this paper, significantly outperforming\nthe existing SoTA's 33.9% and the vanilla CLIP's 14.1%.\n","authors":["Feng Wang","Jieru Mei","Alan Yuille"],"pdf_url":"https://arxiv.org/pdf/2312.01597v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.11530v2","updated":"2023-12-04T02:58:37Z","published":"2022-11-21T15:00:04Z","title":"Open-Set Object Detection Using Classification-free Object Proposal and\n Instance-level Contrastive Learning","summary":" Detecting both known and unknown objects is a fundamental skill for robot\nmanipulation in unstructured environments. Open-set object detection (OSOD) is\na promising direction to handle the problem consisting of two subtasks: objects\nand background separation, and open-set object classification. In this paper,\nwe present Openset RCNN to address the challenging OSOD. To disambiguate\nunknown objects and background in the first subtask, we propose to use\nclassification-free region proposal network (CF-RPN) which estimates the\nobjectness score of each region purely using cues from object's location and\nshape preventing overfitting to the training categories. 
To identify unknown\nobjects in the second subtask, we propose to represent them using the\ncomplementary region of known categories in a latent space which is\naccomplished by a prototype learning network (PLN). PLN performs instance-level\ncontrastive learning to encode proposals to a latent space and builds a compact\nregion centering with a prototype for each known category. Further, we note\nthat the detection performance of unknown objects can not be unbiasedly\nevaluated on the situation that commonly used object detection datasets are not\nfully annotated. Thus, a new benchmark is introduced by reorganizing\nGraspNet-1billion, a robotic grasp pose detection dataset with complete\nannotation. Extensive experiments demonstrate the merits of our method. We\nfinally show that our Openset RCNN can endow the robot with an open-set\nperception ability to support robotic rearrangement tasks in cluttered\nenvironments. More details can be found in\nhttps://sites.google.com/view/openset-rcnn/\n","authors":["Zhongxiang Zhou","Yifei Yang","Yue Wang","Rong Xiong"],"pdf_url":"https://arxiv.org/pdf/2211.11530v2.pdf","comment":"IEEE Robotics and Automation Letters"},{"id":"http://arxiv.org/abs/2311.00690v3","updated":"2023-12-04T02:58:02Z","published":"2023-11-01T17:45:52Z","title":"What User Behaviors Make the Differences During the Process of Visual\n Analytics?","summary":" The understanding of visual analytics process can benefit visualization\nresearchers from multiple aspects, including improving visual designs and\ndeveloping advanced interaction functions. However, the log files of user\nbehaviors are still hard to analyze due to the complexity of sensemaking and\nour lack of knowledge on the related user behaviors. This work presents a study\non a comprehensive data collection of user behaviors, and our analysis approach\nwith time-series classification methods. We have chosen a classical\nvisualization application, Covid-19 data analysis, with common analysis tasks\ncovering geo-spatial, time-series and multi-attributes. Our user study collects\nuser behaviors on a diverse set of visualization tasks with two comparable\nsystems, desktop and immersive visualizations. We summarize the classification\nresults with three time-series machine learning algorithms at two scales, and\nexplore the influences of behavior features. Our results reveal that user\nbehaviors can be distinguished during the process of visual analytics and there\nis a potentially strong association between the physical behaviors of users and\nthe visualization tasks they perform. We also demonstrate the usage of our\nmodels by interpreting open sessions of visual analytics, which provides an\nautomatic way to study sensemaking without tedious manual annotations.\n","authors":["Zekun Wu","Shahin Doroudian","Aidong Lu"],"pdf_url":"https://arxiv.org/pdf/2311.00690v3.pdf","comment":"This version corrects the issues of previous versions"},{"id":"http://arxiv.org/abs/2312.01581v1","updated":"2023-12-04T02:33:53Z","published":"2023-12-04T02:33:53Z","title":"Signed Binarization: Unlocking Efficiency Through Repetition-Sparsity\n Trade-Off","summary":" Efficient inference of Deep Neural Networks (DNNs) on resource-constrained\nedge devices is essential. Quantization and sparsity are key algorithmic\ntechniques that translate to repetition and sparsity within tensors at the\nhardware-software interface. This paper introduces the concept of\nrepetition-sparsity trade-off that helps explain computational efficiency\nduring inference. 
We propose Signed Binarization, a unified co-design framework\nthat synergistically integrates hardware-software systems, quantization\nfunctions, and representation learning techniques to address this trade-off.\nOur results demonstrate that Signed Binarization is more accurate than\nbinarization with the same number of non-zero weights. Detailed analysis\nindicates that signed binarization generates a smaller distribution of\neffectual (non-zero) parameters nested within a larger distribution of total\nparameters, both of the same type, for a DNN block. Finally, our approach\nachieves a 26% speedup on real hardware, doubles energy efficiency, and reduces\ndensity by 2.8x compared to binary methods for ResNet 18, presenting an\nalternative solution for deploying efficient models in resource-limited\nenvironments.\n","authors":["Sachit Kuhar","Yash Jain","Alexey Tumanov"],"pdf_url":"https://arxiv.org/pdf/2312.01581v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01576v1","updated":"2023-12-04T02:20:35Z","published":"2023-12-04T02:20:35Z","title":"Learning Efficient Unsupervised Satellite Image-based Building Damage\n Detection","summary":" Existing Building Damage Detection (BDD) methods always require\nlabour-intensive pixel-level annotations of buildings and their conditions,\nhence largely limiting their applications. In this paper, we investigate a\nchallenging yet practical scenario of BDD, Unsupervised Building Damage\nDetection (U-BDD), where only unlabelled pre- and post-disaster satellite image\npairs are provided. As a pilot study, we have first proposed an advanced U-BDD\nbaseline that leverages pre-trained vision-language foundation models (i.e.,\nGrounding DINO, SAM and CLIP) to address the U-BDD task. However, the apparent\ndomain gap between satellite and generic images causes low confidence in the\nfoundation models used to identify buildings and their damages. In response, we\nfurther present a novel self-supervised framework, U-BDD++, which improves upon\nthe U-BDD baseline by addressing domain-specific issues associated with\nsatellite imagery. Furthermore, the new Building Proposal Generation (BPG)\nmodule and the CLIP-enabled noisy Building Proposal Selection (CLIP-BPS) module\nin U-BDD++ ensure high-quality self-training. Extensive experiments on the\nwidely used building damage assessment benchmark demonstrate the effectiveness\nof the proposed method for unsupervised building damage detection. The\npresented annotation-free and foundation model-based paradigm ensures an\nefficient learning phase. This study opens a new direction for real-world BDD\nand sets a strong baseline for future research.\n","authors":["Yiyun Zhang","Zijian Wang","Yadan Luo","Xin Yu","Zi Huang"],"pdf_url":"https://arxiv.org/pdf/2312.01576v1.pdf","comment":"ICDM 2023"},{"id":"http://arxiv.org/abs/2312.01575v1","updated":"2023-12-04T02:17:14Z","published":"2023-12-04T02:17:14Z","title":"A Challenging Multimodal Video Summary: Simultaneously Extracting and\n Generating Keyframe-Caption Pairs from Video","summary":" This paper proposes a practical multimodal video summarization task setting\nand a dataset to train and evaluate the task. The target task involves\nsummarizing a given video into a predefined number of keyframe-caption pairs\nand displaying them in a listable format to grasp the video content quickly.\nThis task aims to extract crucial scenes from the video in the form of images\n(keyframes) and generate corresponding captions explaining each keyframe's\nsituation. 
This task is useful as a practical application and presents a highly\nchallenging problem worthy of study. Specifically, achieving simultaneous\noptimization of the keyframe selection performance and caption quality\nnecessitates careful consideration of the mutual dependence on both preceding\nand subsequent keyframes and captions. To facilitate subsequent research in\nthis field, we also construct a dataset by expanding upon existing datasets and\npropose an evaluation framework. Furthermore, we develop two baseline systems\nand report their respective performance.\n","authors":["Keito Kudo","Haruki Nagasawa","Jun Suzuki","Nobuyuki Shimizu"],"pdf_url":"https://arxiv.org/pdf/2312.01575v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01573v1","updated":"2023-12-04T02:07:47Z","published":"2023-12-04T02:07:47Z","title":"Survey on deep learning in multimodal medical imaging for cancer\n detection","summary":" The task of multimodal cancer detection is to determine the locations and\ncategories of lesions by using different imaging techniques, which is one of\nthe key research methods for cancer diagnosis. Recently, deep learning-based\nobject detection has made significant developments due to its strength in\nsemantic feature extraction and nonlinear function fitting. However, multimodal\ncancer detection remains challenging due to morphological differences in\nlesions, interpatient variability, difficulty in annotation, and imaging\nartifacts. In this survey, we mainly investigate over 150 papers in recent\nyears with respect to multimodal cancer detection using deep learning, with a\nfocus on datasets and solutions to various challenges such as data annotation,\nvariance between classes, small-scale lesions, and occlusion. We also provide\nan overview of the advantages and drawbacks of each approach. Finally, we\ndiscuss the current scope of work and provide directions for the future\ndevelopment of multimodal cancer detection.\n","authors":["Yan Tian","Zhaocheng Xu","Yujun Ma","Weiping Ding","Ruili Wang","Zhihong Gao","Guohua Cheng","Linyang He","Xuran Zhao"],"pdf_url":"https://arxiv.org/pdf/2312.01573v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01571v1","updated":"2023-12-04T02:03:23Z","published":"2023-12-04T02:03:23Z","title":"How to Configure Good In-Context Sequence for Visual Question Answering","summary":" Inspired by the success of Large Language Models in dealing with new tasks\nvia In-Context Learning (ICL) in NLP, researchers have also developed Large\nVision-Language Models (LVLMs) with ICL capabilities. However, when\nimplementing ICL using these LVLMs, researchers usually resort to the simplest\nway like random sampling to configure the in-context sequence, thus leading to\nsub-optimal results. To enhance the ICL performance, in this study, we use\nVisual Question Answering (VQA) as case study to explore diverse in-context\nconfigurations to find the powerful ones. Additionally, through observing the\nchanges of the LVLM outputs by altering the in-context sequence, we gain\ninsights into the inner properties of LVLMs, improving our understanding of\nthem. Specifically, to explore in-context configurations, we design diverse\nretrieval methods and employ different strategies to manipulate the retrieved\ndemonstrations. Through exhaustive experiments on three VQA datasets: VQAv2,\nVizWiz, and OK-VQA, we uncover three important inner properties of the applied\nLVLM and demonstrate which strategies can consistently improve the ICL VQA\nperformance. 
Our code is provided in:\nhttps://github.com/GaryJiajia/OFv2_ICL_VQA.\n","authors":["Li Li","Jiawei Peng","Huiyi Chen","Chongyang Gao","Xu Yang"],"pdf_url":"https://arxiv.org/pdf/2312.01571v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2312.00761v2","updated":"2023-12-04T01:57:38Z","published":"2023-12-01T18:29:08Z","title":"Deep Unlearning: Fast and Efficient Training-free Approach to Controlled\n Forgetting","summary":" Machine unlearning has emerged as a prominent and challenging area of\ninterest, driven in large part by the rising regulatory demands for industries\nto delete user data upon request and the heightened awareness of privacy.\nExisting approaches either retrain models from scratch or use several\nfinetuning steps for every deletion request, often constrained by computational\nresource limitations and restricted access to the original training data. In\nthis work, we introduce a novel class unlearning algorithm designed to\nstrategically eliminate an entire class or a group of classes from the learned\nmodel. To that end, our algorithm first estimates the Retain Space and the\nForget Space, representing the feature or activation spaces for samples from\nclasses to be retained and unlearned, respectively. To obtain these spaces, we\npropose a novel singular value decomposition-based technique that requires\nlayer wise collection of network activations from a few forward passes through\nthe network. We then compute the shared information between these spaces and\nremove it from the forget space to isolate class-discriminatory feature space\nfor unlearning. Finally, we project the model weights in the orthogonal\ndirection of the class-discriminatory space to obtain the unlearned model. We\ndemonstrate our algorithm's efficacy on ImageNet using a Vision Transformer\nwith only $\\sim$1.5% drop in retain accuracy compared to the original model\nwhile maintaining under 1% accuracy on the unlearned class samples. Further,\nour algorithm consistently performs well when subject to Membership Inference\nAttacks showing 7.8% improvement on average across a variety of image\nclassification datasets and network architectures, as compared to other\nbaselines while being $\\sim$6x more computationally efficient.\n","authors":["Sangamesh Kodge","Gobinda Saha","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2312.00761v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01564v1","updated":"2023-12-04T01:42:09Z","published":"2023-12-04T01:42:09Z","title":"APoLLo: Unified Adapter and Prompt Learning for Vision Language Models","summary":" The choice of input text prompt plays a critical role in the performance of\nVision-Language Pretrained (VLP) models such as CLIP. We present APoLLo, a\nunified multi-modal approach that combines Adapter and Prompt learning for\nVision-Language models. Our method is designed to substantially improve the\ngeneralization capabilities of VLP models when they are fine-tuned in a\nfew-shot setting. We introduce trainable cross-attention-based adapter layers\nin conjunction with vision and language encoders to strengthen the alignment\nbetween the two modalities. We enforce consistency between the respective\nencoder branches (receiving augmented inputs) to prevent overfitting in\ndownstream tasks. Our method is evaluated on three representative tasks:\ngeneralization to novel classes, cross-dataset evaluation, and unseen domain\nshifts. 
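As a sketch of the "trainable cross-attention-based adapter layers" described above for APoLLo, the module below lets tokens from one frozen encoder attend to the other's through a small bottleneck; the dimensions, bottleneck size, and residual wiring are assumptions rather than the released design.

```python
import torch
import torch.nn as nn

class CrossAttentionAdapter(nn.Module):
    def __init__(self, dim: int = 512, n_heads: int = 8, bottleneck: int = 64):
        super().__init__()
        self.attn = nn.MultiheadAttention(dim, n_heads, batch_first=True)
        self.down = nn.Linear(dim, bottleneck)   # bottleneck keeps the adapter lightweight
        self.up = nn.Linear(bottleneck, dim)
        self.norm = nn.LayerNorm(dim)

    def forward(self, queries, context):
        # queries: (B, Nq, dim) tokens from one encoder (e.g., text)
        # context: (B, Nc, dim) tokens from the other encoder (e.g., vision)
        attended, _ = self.attn(queries, context, context)
        return self.norm(queries + self.up(torch.relu(self.down(attended))))
```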
In practice, APoLLo achieves a relative gain up to 6.03% over MaPLe\n(SOTA) on novel classes for 10 diverse image recognition datasets.\n","authors":["Sanjoy Chowdhury","Sayan Nag","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2312.01564v1.pdf","comment":"Accepted at EMNLP 2023 (Main track)"},{"id":"http://arxiv.org/abs/2312.01561v1","updated":"2023-12-04T01:28:38Z","published":"2023-12-04T01:28:38Z","title":"Multi-View Person Matching and 3D Pose Estimation with Arbitrary\n Uncalibrated Camera Networks","summary":" Cross-view person matching and 3D human pose estimation in multi-camera\nnetworks are particularly difficult when the cameras are extrinsically\nuncalibrated. Existing efforts generally require large amounts of 3D data for\ntraining neural networks or known camera poses for geometric constraints to\nsolve the problem. However, camera poses and 3D data annotation are usually\nexpensive and not always available. We present a method, PME, that solves the\ntwo tasks without requiring either information. Our idea is to address\ncross-view person matching as a clustering problem using each person as a\ncluster center, then obtain correspondences from person matches, and estimate\n3D human poses through multi-view triangulation and bundle adjustment. We solve\nthe clustering problem by introducing a \"size constraint\" using the number of\ncameras and a \"source constraint\" using the fact that two people from the same\ncamera view should not match, to narrow the solution space to a small feasible\nregion. The 2D human poses used in clustering are obtained through a\npre-trained 2D pose detector, so our method does not require expensive 3D\ntraining data for each new scene. We extensively evaluate our method on three\nopen datasets and two indoor and outdoor datasets collected using arbitrarily\nset cameras. Our method outperforms other methods by a large margin on\ncross-view person matching, reaches SOTA performance on 3D human pose\nestimation without using either camera poses or 3D training data, and shows\ngood generalization ability across five datasets of various environment\nsettings.\n","authors":["Yan Xu","Kris Kitani"],"pdf_url":"https://arxiv.org/pdf/2312.01561v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09718v2","updated":"2023-12-04T01:15:56Z","published":"2023-10-15T03:08:25Z","title":"Efficient and Effective Deep Multi-view Subspace Clustering","summary":" Recent multi-view subspace clustering achieves impressive results utilizing\ndeep networks, where the self-expressive correlation is typically modeled by a\nfully connected (FC) layer. However, they still suffer from two limitations. i)\nThe parameter scale of the FC layer is quadratic to sample numbers, resulting\nin high time and memory costs that significantly degrade their feasibility in\nlarge-scale datasets. ii) It is under-explored to extract a unified\nrepresentation that simultaneously satisfies minimal sufficiency and\ndiscriminability. To this end, we propose a novel deep framework, termed\nEfficient and Effective deep Multi-View Subspace Clustering (E$^2$MVSC).\nInstead of a parameterized FC layer, we design a Relation-Metric Net that\ndecouples network parameter scale from sample numbers for greater computational\nefficiency. Most importantly, the proposed method devises a multi-type\nauto-encoder to explicitly decouple consistent, complementary, and superfluous\ninformation from every view, which is supervised by a soft clustering\nassignment similarity constraint. 
Following information bottleneck theory and\nthe maximal coding rate reduction principle, a sufficient yet minimal unified\nrepresentation can be obtained, as well as pursuing intra-cluster aggregation\nand inter-cluster separability within it. Extensive experiments show that\nE$^2$MVSC yields comparable results to existing methods and achieves\nstate-of-the-art performance in various types of multi-view datasets.\n","authors":["Yuxiu Lin","Hui Liu","Ren Wang","Qiang Guo","Caiming Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.09718v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01558v1","updated":"2023-12-04T01:10:04Z","published":"2023-12-04T01:10:04Z","title":"Hyperspectral Image Compression Using Sampling and Implicit Neural\n Representations","summary":" Hyperspectral images, which record the electromagnetic spectrum for a pixel\nin the image of a scene, often store hundreds of channels per pixel and contain\nan order of magnitude more information than a similarly-sized RGB color image.\nConsequently, concomitant with the decreasing cost of capturing these images,\nthere is a need to develop efficient techniques for storing, transmitting, and\nanalyzing hyperspectral images. This paper develops a method for hyperspectral\nimage compression using implicit neural representations where a multilayer\nperceptron network F with sinusoidal activation functions \"learns\" to map pixel\nlocations to pixel intensities for a given hyperspectral image I. F thus acts\nas a compressed encoding of this image, and the original image is reconstructed\nby evaluating F at each pixel location. We use a sampling method with two\nfactors, window size and sampling rate, to reduce the compression time. We have\nevaluated our method on four benchmarks -- Indian Pines, Jasper Ridge, Pavia\nUniversity, and Cuprite -- using PSNR and SSIM, and we show that the proposed\nmethod achieves better compression than JPEG, JPEG2000, and PCA-DCT at low\nbitrates. In addition, we compare our results with learning-based methods such as\nPCA+JPEG2000, FPCA+JPEG2000, 3D DCT, 3D DWT+SVR, and WSRC and show the\ncorresponding results in the \"Compression Results\" section. We also show that\nour method with sampling achieves better speed and performance than our method\nwithout sampling.\n","authors":["Shima Rezasoltani","Faisal Z. Qureshi"],"pdf_url":"https://arxiv.org/pdf/2312.01558v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02396v1","updated":"2023-12-04T23:26:12Z","published":"2023-12-04T23:26:12Z","title":"Unsupervised Change Detection for Space Habitats Using 3D Point Clouds","summary":" This work presents an algorithm for scene change detection from point clouds\nto enable autonomous robotic caretaking in future space habitats. Autonomous\nrobotic systems will help maintain future deep-space habitats, such as the\nGateway space station, which will be uncrewed for extended periods. Existing\nscene analysis software used on the International Space Station (ISS) relies on\nmanually-labeled images for detecting changes. In contrast, the algorithm\npresented in this work uses raw, unlabeled point clouds as inputs. The\nalgorithm first applies modified Expectation-Maximization Gaussian Mixture\nModel (GMM) clustering to two input point clouds. It then performs change\ndetection by comparing the GMMs using the Earth Mover's Distance. 
The algorithm\nis validated quantitatively and qualitatively using a test dataset collected by\nan Astrobee robot in the NASA Ames Granite Lab comprising single frame depth\nimages taken directly by Astrobee and full-scene reconstructed maps built with\nRGB-D and pose data from Astrobee. The runtimes of the approach are also\nanalyzed in depth. The source code is publicly released to promote further\ndevelopment.\n","authors":["Jamie Santos","Holly Dinkel","Julia Di","Paulo V. K. Borges","Marina Moreira","Oleg Alexandrov","Brian Coltin","Trey Smith"],"pdf_url":"https://arxiv.org/pdf/2312.02396v1.pdf","comment":"15 pages, 7 figures, Manuscript will be presented at the AIAA SciTech\n Forum in Orlando, FL, USA, 8 - 12 January 2024"},{"id":"http://arxiv.org/abs/2306.04037v2","updated":"2023-12-04T23:18:54Z","published":"2023-06-06T22:04:45Z","title":"Quantitative Analysis of Primary Attribution Explainable Artificial\n Intelligence Methods for Remote Sensing Image Classification","summary":" We present a comprehensive analysis of quantitatively evaluating explainable\nartificial intelligence (XAI) techniques for remote sensing image\nclassification. Our approach leverages state-of-the-art machine learning\napproaches to perform remote sensing image classification across multiple\nmodalities. We investigate the results of the models qualitatively through XAI\nmethods. Additionally, we compare the XAI methods quantitatively through\nvarious categories of desired properties. Through our analysis, we offer\ninsights and recommendations for selecting the most appropriate XAI method(s)\nto gain a deeper understanding of the models' decision-making processes. The\ncode for this work is publicly available.\n","authors":["Akshatha Mohan","Joshua Peeples"],"pdf_url":"https://arxiv.org/pdf/2306.04037v2.pdf","comment":"4 pages, 3 figures, Accepted to 2023 IGARSS Community-Contributed\n Sessions - Opening the Black Box: Explainable AI/ML in Remote Sensing\n Analysis"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2312.02031v1","updated":"2023-12-04T16:51:28Z","published":"2023-12-04T16:51:28Z","title":"Virtual Quantum Markov Chains","summary":" Quantum Markov chains generalize classical Markov chains for random variables\nto the quantum realm and exhibit unique inherent properties, making them an\nimportant feature in quantum information theory. In this work, we propose the\nconcept of virtual quantum Markov chains (VQMCs), focusing on scenarios where\nsubsystems retain classical information about global systems from measurement\nstatistics. As a generalization of quantum Markov chains, VQMCs characterize\nstates where arbitrary global shadow information can be recovered from\nsubsystems through local quantum operations and measurements. We present an\nalgebraic characterization for virtual quantum Markov chains and show that the\nvirtual quantum recovery is fully determined by the block matrices of a quantum\nstate on its subsystems. Notably, we find a distinction between two classes of\ntripartite entanglement by showing that the W state is a VQMC while the GHZ\nstate is not. Furthermore, we establish semidefinite programs to determine the\noptimal sampling overhead and the robustness of virtual quantum Markov chains.\nWe demonstrate the optimal sampling overhead is additive, indicating no free\nlunch to further reduce the sampling cost of recovery from parallel calls of\nthe VQMC states. 
Our findings elucidate distinctions between quantum Markov\nchains and virtual quantum Markov chains, extending our understanding of\nquantum recovery to scenarios prioritizing classical information from\nmeasurement statistics.\n","authors":["Yu-Ao Chen","Chengkai Zhu","Keming He","Mingrui Jing","Xin Wang"],"pdf_url":"https://arxiv.org/pdf/2312.02031v1.pdf","comment":"19 pages including appendix"},{"id":"http://arxiv.org/abs/2312.01998v1","updated":"2023-12-04T16:22:06Z","published":"2023-12-04T16:22:06Z","title":"Language-only Efficient Training of Zero-shot Composed Image Retrieval","summary":" The composed image retrieval (CIR) task takes a composed query of image and text,\naiming to retrieve images relevant to both conditions. Conventional CIR\napproaches need a training dataset composed of triplets of query image, query\ntext, and target image, which is very expensive to collect. Several recent\nworks have worked on the zero-shot (ZS) CIR paradigm to tackle the issue\nwithout using pre-collected triplets. However, the existing ZS-CIR methods show\nlimited backbone scalability and generalizability due to the lack of diversity\nof the input texts during training. We propose a novel CIR framework, only\nusing language for its training. Our LinCIR (Language-only training for CIR)\ncan be trained only with text datasets by a novel self-supervision named\nself-masking projection (SMP). We project the text latent embedding to the\ntoken embedding space and construct a new text by replacing the keyword tokens\nof the original text. Then, we let the new and original texts have the same\nlatent embedding vector. With this simple strategy, LinCIR is surprisingly\nefficient and highly effective; LinCIR with CLIP ViT-G backbone is trained in\n48 minutes and shows the best ZS-CIR performances on four different CIR\nbenchmarks, CIRCO, GeneCIS, FashionIQ, and CIRR, even outperforming a supervised\nmethod on FashionIQ. Code is available at https://github.com/navervision/lincir\n","authors":["Geonmo Gu","Sanghyuk Chun","Wonjae Kim","Yoohoon Kang","Sangdoo Yun"],"pdf_url":"https://arxiv.org/pdf/2312.01998v1.pdf","comment":"First two authors contributed equally; 16 pages, 2.9MB"},{"id":"http://arxiv.org/abs/2312.01916v1","updated":"2023-12-04T14:20:16Z","published":"2023-12-04T14:20:16Z","title":"PEACE: Prototype lEarning Augmented transferable framework for\n Cross-domain rEcommendation","summary":" To help merchants/customers provide/access a variety of services through\nminiapps, online service platforms have occupied a critical position in\neffective content delivery, where recommending items in a new domain\nlaunched by the service provider has become an increasingly urgent problem. However,\nthe non-negligible gap between the source and diversified target domains poses\na considerable challenge to cross-domain recommendation systems, which often\nleads to performance bottlenecks in industrial settings. While entity graphs\nhave the potential to serve as a bridge between domains, rudimentary\nutilization still fails to distill useful knowledge and can even induce the negative\ntransfer issue. To this end, we propose PEACE, a Prototype lEarning Augmented\ntransferable framework for Cross-domain rEcommendation. For domain gap\nbridging, PEACE is built upon a multi-interest and entity-oriented pre-training\narchitecture which could not only benefit the learning of generalized knowledge\nin a multi-granularity manner, but also help leverage more structural\ninformation in the entity graph. 
Then, we bring prototype learning into the\npre-training over source domains, so that representations of users and items\nare greatly improved by the contrastive prototype learning module and the\nprototype enhanced attention mechanism for adaptive knowledge utilization. To\nease the pressure of online serving, PEACE is carefully deployed in a\nlightweight manner, and significant performance improvements are observed in\nboth online and offline environments.\n","authors":["Chunjing Gan","Bo Huang","Binbin Hu","Jian Ma","Ziqi Liu","Zhiqiang Zhang","Jun Zhou","Guannan Zhang","Wenliang Zhong"],"pdf_url":"https://arxiv.org/pdf/2312.01916v1.pdf","comment":"Accepted by WSDM 2024"},{"id":"http://arxiv.org/abs/2306.10933v4","updated":"2023-12-04T13:10:14Z","published":"2023-06-19T13:44:48Z","title":"Towards Open-World Recommendation with Knowledge Augmentation from Large\n Language Models","summary":" Recommender systems play a vital role in various online services. However,\nthe insulated nature of training and deploying separately within a specific\ndomain limits their access to open-world knowledge. Recently, the emergence of\nlarge language models (LLMs) has shown promise in bridging this gap by encoding\nextensive world knowledge and demonstrating reasoning capability. Nevertheless,\nprevious attempts to directly use LLMs as recommenders have not achieved\nsatisfactory results. In this work, we propose an Open-World Knowledge\nAugmented Recommendation Framework with Large Language Models, dubbed KAR, to\nacquire two types of external knowledge from LLMs -- the reasoning knowledge on\nuser preferences and the factual knowledge on items. We introduce factorization\nprompting to elicit accurate reasoning on user preferences. The generated\nreasoning and factual knowledge are effectively transformed and condensed into\naugmented vectors by a hybrid-expert adaptor in order to be compatible with the\nrecommendation task. The obtained vectors can then be directly used to enhance\nthe performance of any recommendation model. We also ensure efficient inference\nby preprocessing and prestoring the knowledge from the LLM. Extensive\nexperiments show that KAR significantly outperforms the state-of-the-art\nbaselines and is compatible with a wide range of recommendation algorithms. We\ndeploy KAR to Huawei's news and music recommendation platforms and gain a 7\\%\nand 1.7\\% improvement in the online A/B test, respectively.\n","authors":["Yunjia Xi","Weiwen Liu","Jianghao Lin","Xiaoling Cai","Hong Zhu","Jieming Zhu","Bo Chen","Ruiming Tang","Weinan Zhang","Rui Zhang","Yong Yu"],"pdf_url":"https://arxiv.org/pdf/2306.10933v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01840v1","updated":"2023-12-04T12:25:05Z","published":"2023-12-04T12:25:05Z","title":"An AI-based solution for the cold start and data sparsity problems in\n the recommendation systems","summary":" In recent years, the amount of data available on the internet and the number\nof users who utilize the Internet have increased at an unparalleled pace. The\nexponential development in the quantity of digital information accessible and\nthe number of Internet users has created the possibility of information\noverload, impeding fast access to items of interest on the Internet.\nInformation retrieval systems such as Google, DevilFinder, and Altavista have\npartly overcome this challenge, but prioritization and customization of\ninformation (where a system maps accessible material to a user's interests and\npreferences) were lacking. 
This has resulted in a higher-than-ever need for\nrecommender systems. Recommender systems are information filtering systems that\naddress the issue of information overload by filtering important information\nfragments from a huge volume of dynamically produced data based on the user's\ninterests, favorite things, preferences and ratings on the desired item.\nRecommender systems can figure out if a person would like an item or not based\non their profile.\n","authors":["Shahriar Shakir Sumit"],"pdf_url":"https://arxiv.org/pdf/2312.01840v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00372v2","updated":"2023-12-04T11:42:35Z","published":"2023-12-01T06:30:31Z","title":"Event-driven Real-time Retrieval in Web Search","summary":" Information retrieval in real-time search presents unique challenges distinct\nfrom those encountered in classical web search. These challenges are\nparticularly pronounced due to the rapid change of user search intent, which is\ninfluenced by the occurrence and evolution of breaking news events, such as\nearthquakes, elections, and wars. Previous dense retrieval methods, which\nprimarily focused on static semantic representation, lack the capacity to\ncapture immediate search intent, leading to inferior performance in retrieving\nthe most recent event-related documents in time-sensitive scenarios. To address\nthis issue, this paper expands the query with event information that represents\nreal-time search intent. The Event information is then integrated with the\nquery through a cross-attention mechanism, resulting in a time-context query\nrepresentation. We further enhance the model's capacity for event\nrepresentation through multi-task training. Since publicly available datasets\nsuch as MS-MARCO do not contain any event information on the query side and\nhave few time-sensitive queries, we design an automatic data collection and\nannotation pipeline to address this issue, which includes ModelZoo-based Coarse\nAnnotation and LLM-driven Fine Annotation processes. In addition, we share the\ntraining tricks such as two-stage training and hard negative sampling. Finally,\nwe conduct a set of offline experiments on a million-scale production dataset\nto evaluate our approach and deploy an A/B testing in a real online system to\nverify the performance. Extensive experimental results demonstrate that our\nproposed approach significantly outperforms existing state-of-the-art baseline\nmethods.\n","authors":["Nan Yang","Shusen Zhang","Yannan Zhang","Xiaoling Bai","Hualong Deng","Tianhua Zhou","Jin Ma"],"pdf_url":"https://arxiv.org/pdf/2312.00372v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01760v1","updated":"2023-12-04T09:36:21Z","published":"2023-12-04T09:36:21Z","title":"On Gradient Boosted Decision Trees and Neural Rankers: A Case-Study on\n Short-Video Recommendations at ShareChat","summary":" Practitioners who wish to build real-world applications that rely on ranking\nmodels, need to decide which modelling paradigm to follow. This is not an easy\nchoice to make, as the research literature on this topic has been shifting in\nrecent years. In particular, whilst Gradient Boosted Decision Trees (GBDTs)\nhave reigned supreme for more than a decade, the flexibility of neural networks\nhas allowed them to catch up, and recent works report accuracy metrics that are\non par. 
Nevertheless, practical systems require considerations beyond mere\naccuracy metrics to decide on a modelling approach.\n This work describes our experiences in balancing some of the trade-offs that\narise, presenting a case study on a short-video recommendation application. We\nhighlight that (1) neural networks' ability to handle large training data sizes and\nuser- and item-embeddings allows for more accurate models than GBDTs in this\nsetting, and (2) because GBDTs are less reliant on specialised hardware, they\ncan provide an equally accurate model at a lower cost. We believe these\nfindings are of relevance to researchers in both academia and industry, and\nhope they can inspire practitioners who need to make similar modelling choices\nin the future.\n","authors":["Olivier Jeunen","Hitesh Sagtani","Himanshu Doi","Rasul Karimov","Neeti Pokharna","Danish Kalim","Aleksei Ustimenko","Christopher Green","Wenzhe Shi","Rishabh Mehrotra"],"pdf_url":"https://arxiv.org/pdf/2312.01760v1.pdf","comment":"Appearing in the Industry Track Proceedings of the Forum for\n Information Retrieval Evaluation (FIRE '23)"},{"id":"http://arxiv.org/abs/2206.05654v2","updated":"2023-12-04T09:31:30Z","published":"2022-06-12T04:20:05Z","title":"A Matrix Decomposition Model Based on Feature Factors in Movie\n Recommendation System","summary":" Currently, matrix decomposition is one of the most widely used collaborative\nfiltering algorithms, using factor decomposition to deal effectively with\nlarge-scale rating matrices. It mainly uses the interaction records between users\nand items to predict ratings. Based on the characteristic attributes of items\nand users, this paper proposes a new UISVD++ model that fuses the type\nattributes of movies and the age attributes of users into the SVD++ framework. By\nprojecting the age attribute into the user's implicit space and the type\nattribute into the item's implicit space, the model enriches the side\ninformation of the users and items. Finally, we conduct comparative experiments\non two public data sets, Movielens-100K and Movielens-1M. Experimental results\nshow that the prediction accuracy of this model is better than that of other\nbaselines in the rating prediction task. In addition, these results also\nshow that UISVD++ can effectively alleviate the cold-start problem.\n","authors":["Dan Liu","Hou-biao Li"],"pdf_url":"https://arxiv.org/pdf/2206.05654v2.pdf","comment":"11 pages, 8 figures, 9 tables"},{"id":"http://arxiv.org/abs/2312.01656v1","updated":"2023-12-04T06:14:25Z","published":"2023-12-04T06:14:25Z","title":"The Contemporary Art of Image Search: Iterative User Intent Expansion\n via Vision-Language Model","summary":" Image search is an essential and user-friendly method to explore vast\ngalleries of digital images. However, existing image search methods heavily\nrely on proximity measurements like tag matching or image similarity, requiring\nprecise user inputs for satisfactory results. To meet the growing demand for a\ncontemporary image search engine that enables accurate comprehension of users'\nsearch intentions, we introduce an innovative user intent expansion framework.\nOur framework leverages visual-language models to parse and compose multi-modal\nuser inputs to provide more accurate and satisfying results. 
It comprises\ntwo-stage processes: 1) a parsing stage that incorporates a language parsing\nmodule with large language models to enhance the comprehension of textual\ninputs, along with a visual parsing module that integrates an interactive\nsegmentation module to swiftly identify detailed visual elements within images;\nand 2) a logic composition stage that combines multiple user search intents\ninto a unified logic expression for more sophisticated operations in complex\nsearching scenarios. Moreover, the intent expansion framework enables users to\nperform flexible contextualized interactions with the search results to further\nspecify or adjust their detailed search intents iteratively. We implemented the\nframework into an image search system for NFT (non-fungible token) search and\nconducted a user study to evaluate its usability and novel properties. The\nresults indicate that the proposed framework significantly improves users'\nimage search experience. Particularly the parsing and contextualized\ninteractions prove useful in allowing users to express their search intents\nmore accurately and engage in a more enjoyable iterative search experience.\n","authors":["Yilin Ye","Qian Zhu","Shishi Xiao","Kang Zhang","Wei Zeng"],"pdf_url":"https://arxiv.org/pdf/2312.01656v1.pdf","comment":"Accepted by The 2024 ACM SIGCHI Conference on Computer-Supported\n Cooperative Work & Social Computing (CSCW) (Proc. CSCW 2024)"},{"id":"http://arxiv.org/abs/2312.01556v1","updated":"2023-12-04T00:56:50Z","published":"2023-12-04T00:56:50Z","title":"Searching Dense Representations with Inverted Indexes","summary":" Nearly all implementations of top-$k$ retrieval with dense vector\nrepresentations today take advantage of hierarchical navigable small-world\nnetwork (HNSW) indexes. However, the generation of vector representations and\nefficiently searching large collections of vectors are distinct challenges that\ncan be decoupled. In this work, we explore the contrarian approach of\nperforming top-$k$ retrieval on dense vector representations using inverted\nindexes. We present experiments on the MS MARCO passage ranking dataset,\nevaluating three dimensions of interest: output quality, speed, and index size.\nResults show that searching dense representations using inverted indexes is\npossible. Our approach exhibits reasonable effectiveness with compact indexes,\nbut is impractically slow. Thus, while workable, our solution does not provide\na compelling tradeoff and is perhaps best characterized today as a \"technical\ncuriosity\".\n","authors":["Jimmy Lin","Tommaso Teofili"],"pdf_url":"https://arxiv.org/pdf/2312.01556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.17082v2","updated":"2023-12-04T23:09:40Z","published":"2023-06-29T16:36:05Z","title":"Adaptive Latent Entity Expansion for Document Retrieval","summary":" Despite considerable progress in neural relevance ranking techniques, search\nengines still struggle to process complex queries effectively - both in terms\nof precision and recall. Sparse and dense Pseudo-Relevance Feedback (PRF)\napproaches have the potential to overcome limitations in recall, but are only\neffective with high precision in the top ranks. In this work, we tackle the\nproblem of search over complex queries using three complementary techniques.\nFirst, we demonstrate that applying a strong neural re-ranker before sparse or\ndense PRF can improve the retrieval effectiveness by 5-8%. 
This improvement in\nPRF effectiveness can be attributed directly to improving the precision of the\nfeedback set. Second, we propose an enhanced expansion model, Latent Entity\nExpansion (LEE), which applies fine-grained word and entity-based relevance\nmodelling incorporating localized features. Specifically, we find that\nincluding both words and entities for expansion achieves a further 2-8%\nimprovement in NDCG. Our analysis also demonstrates that LEE is largely robust\nto its parameters across datasets and performs well on entity-centric queries.\nThird, we include an 'adaptive' component in the retrieval process, which\niteratively refines the re-ranking pool during scoring using the expansion\nmodel and avoids re-ranking additional documents. We find that this combination\nof techniques achieves the best NDCG, MAP and R@1000 results on the TREC Robust\n2004 and CODEC document datasets, demonstrating a significant advancement in\nexpansion effectiveness.\n","authors":["Iain Mackie","Shubham Chatterjee","Sean MacAvaney","Jeffrey Dalton"],"pdf_url":"https://arxiv.org/pdf/2306.17082v2.pdf","comment":null}],"Machine Learning":[{"id":"http://arxiv.org/abs/2312.02151v1","updated":"2023-12-04T18:59:36Z","published":"2023-12-04T18:59:36Z","title":"Guarding Barlow Twins Against Overfitting with Mixed Samples","summary":" Self-supervised Learning (SSL) aims to learn transferable feature\nrepresentations for downstream applications without relying on labeled data.\nThe Barlow Twins algorithm, renowned for its widespread adoption and\nstraightforward implementation compared to its counterparts like contrastive\nlearning methods, minimizes feature redundancy while maximizing invariance to\ncommon corruptions. Optimizing for the above objective forces the network to\nlearn useful representations, while avoiding noisy or constant features,\nresulting in improved downstream task performance with limited adaptation.\nDespite Barlow Twins' proven effectiveness in pre-training, the underlying SSL\nobjective can inadvertently cause feature overfitting due to the lack of strong\ninteraction between samples, unlike contrastive learning approaches.\nFrom our experiments, we observe that optimizing for the Barlow Twins objective\ndoesn't necessarily guarantee sustained improvements in representation quality\nbeyond a certain pre-training phase, and can potentially degrade downstream\nperformance on some datasets. To address this challenge, we introduce Mixed\nBarlow Twins, which aims to improve sample interaction during Barlow Twins\ntraining via linearly interpolated samples. This results in an additional\nregularization term to the original Barlow Twins objective, assuming linear\ninterpolation in the input space translates to linearly interpolated features\nin the feature space. Pre-training with this regularization effectively\nmitigates feature overfitting and further enhances the downstream performance\non CIFAR-10, CIFAR-100, TinyImageNet, STL-10, and ImageNet datasets. The code\nand checkpoints are available at: https://github.com/wgcban/mix-bt.git\n","authors":["Wele Gedara Chaminda Bandara","Celso M. De Melo","Vishal M. 
Patel"],"pdf_url":"https://arxiv.org/pdf/2312.02151v1.pdf","comment":"Code and checkpoints are available at:\n https://github.com/wgcban/mix-bt.git"},{"id":"http://arxiv.org/abs/2312.02146v1","updated":"2023-12-04T18:59:19Z","published":"2023-12-04T18:59:19Z","title":"Learning Polynomial Problems with $SL(2,\\mathbb{R})$ Equivariance","summary":" Optimizing and certifying the positivity of polynomials are fundamental\nprimitives across mathematics and engineering applications, from dynamical\nsystems to operations research. However, solving these problems in practice\nrequires large semidefinite programs, with poor scaling in dimension and\ndegree. In this work, we demonstrate for the first time that neural networks\ncan effectively solve such problems in a data-driven fashion, achieving tenfold\nspeedups while retaining high accuracy. Moreover, we observe that these\npolynomial learning problems are equivariant to the non-compact group\n$SL(2,\\mathbb{R})$, which consists of area-preserving linear transformations.\nWe therefore adapt our learning pipelines to accommodate this structure,\nincluding data augmentation, a new $SL(2,\\mathbb{R})$-equivariant architecture,\nand an architecture equivariant with respect to its maximal compact subgroup,\n$SO(2, \\mathbb{R})$. Surprisingly, the most successful approaches in practice\ndo not enforce equivariance to the entire group, which we prove arises from an\nunusual lack of architecture universality for $SL(2,\\mathbb{R})$ in particular.\nA consequence of this result, which is of independent interest, is that there\nexists an equivariant function for which there is no sequence of equivariant\npolynomials multiplied by arbitrary invariants that approximates the original\nfunction. This is a rare example of a symmetric problem where data augmentation\noutperforms a fully equivariant architecture, and provides interesting lessons\nin both theory and practice for other problems with non-compact symmetries.\n","authors":["Hannah Lawrence","Mitchell Tong Harris"],"pdf_url":"https://arxiv.org/pdf/2312.02146v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02139v1","updated":"2023-12-04T18:57:01Z","published":"2023-12-04T18:57:01Z","title":"DiffiT: Diffusion Vision Transformers for Image Generation","summary":" Diffusion models with their powerful expressivity and high sample quality\nhave enabled many new applications and use-cases in various domains. For sample\ngeneration, these models rely on a denoising neural network that generates\nimages by iterative denoising. Yet, the role of denoising network architecture\nis not well-studied with most efforts relying on convolutional residual U-Nets.\nIn this paper, we study the effectiveness of vision transformers in\ndiffusion-based generative learning. Specifically, we propose a new model,\ndenoted as Diffusion Vision Transformers (DiffiT), which consists of a hybrid\nhierarchical architecture with a U-shaped encoder and decoder. We introduce a\nnovel time-dependent self-attention module that allows attention layers to\nadapt their behavior at different stages of the denoising process in an\nefficient manner. We also introduce latent DiffiT which consists of transformer\nmodel with the proposed self-attention layers, for high-resolution image\ngeneration. 
Our results show that DiffiT is surprisingly effective in\ngenerating high-fidelity images, and it achieves state-of-the-art (SOTA)\nbenchmarks on a variety of class-conditional and unconditional synthesis tasks.\nIn the latent space, DiffiT achieves a new SOTA FID score of 1.73 on\nImageNet-256 dataset. Repository: https://github.com/NVlabs/DiffiT\n","authors":["Ali Hatamizadeh","Jiaming Song","Guilin Liu","Jan Kautz","Arash Vahdat"],"pdf_url":"https://arxiv.org/pdf/2312.02139v1.pdf","comment":"Tech report"},{"id":"http://arxiv.org/abs/2312.02133v1","updated":"2023-12-04T18:55:35Z","published":"2023-12-04T18:55:35Z","title":"Style Aligned Image Generation via Shared Attention","summary":" Large-scale Text-to-Image (T2I) models have rapidly gained prominence across\ncreative fields, generating visually compelling outputs from textual prompts.\nHowever, controlling these models to ensure consistent style remains\nchallenging, with existing methods necessitating fine-tuning and manual\nintervention to disentangle content and style. In this paper, we introduce\nStyleAligned, a novel technique designed to establish style alignment among a\nseries of generated images. By employing minimal `attention sharing' during the\ndiffusion process, our method maintains style consistency across images within\nT2I models. This approach allows for the creation of style-consistent images\nusing a reference style through a straightforward inversion operation. Our\nmethod's evaluation across diverse styles and text prompts demonstrates\nhigh-quality synthesis and fidelity, underscoring its efficacy in achieving\nconsistent style across various inputs.\n","authors":["Amir Hertz","Andrey Voynov","Shlomi Fruchter","Daniel Cohen-Or"],"pdf_url":"https://arxiv.org/pdf/2312.02133v1.pdf","comment":"Project page at style-aligned-gen.github.io"},{"id":"http://arxiv.org/abs/2312.02132v1","updated":"2023-12-04T18:54:34Z","published":"2023-12-04T18:54:34Z","title":"Hot PATE: Private Aggregation of Distributions for Diverse Task","summary":" The Private Aggregation of Teacher Ensembles (PATE)\nframework~\\cite{PapernotAEGT:ICLR2017} is a versatile approach to\nprivacy-preserving machine learning. In PATE, teacher models are trained on\ndistinct portions of sensitive data, and their predictions are privately\naggregated to label new training examples for a student model.\n Until now, PATE has primarily been explored with classification-like tasks,\nwhere each example possesses a ground-truth label, and knowledge is transferred\nto the student by labeling public examples. Generative AI models, however,\nexcel in open ended \\emph{diverse} tasks with multiple valid responses and\nscenarios that may not align with traditional labeled examples. Furthermore,\nthe knowledge of models is often encapsulated in the response distribution\nitself and may be transferred from teachers to student in a more fluid way. We\npropose \\emph{hot PATE}, tailored for the diverse setting. In hot PATE, each\nteacher model produces a response distribution and the aggregation method must\npreserve both privacy and diversity of responses. 
We demonstrate, analytically\nand empirically, that hot PATE achieves privacy-utility tradeoffs that are\ncomparable to, and in diverse settings, significantly surpass, the baseline\n``cold'' PATE.\n","authors":["Edith Cohen","Xin Lyu","Jelani Nelson","Tamas Sarlos","Uri Stemmer"],"pdf_url":"https://arxiv.org/pdf/2312.02132v1.pdf","comment":"19 pages"},{"id":"http://arxiv.org/abs/2312.02125v1","updated":"2023-12-04T18:52:26Z","published":"2023-12-04T18:52:26Z","title":"TPPoet: Transformer-Based Persian Poem Generation using Minimal Data and\n Advanced Decoding Techniques","summary":" Recent advances in language models (LMs) have demonstrated significant\nefficacy in tasks related to the arts and humanities. While LMs have exhibited\nexceptional performance across a wide range of natural language processing\ntasks, there are notable challenges associated with their utilization on small\ndatasets and their ability to replicate more creative human capacities. In this\nstudy, we aim to address these challenges by training a Persian classical\npoetry generation model using a transformer architecture on a specialized\ndataset with no pretraining. Additionally, we propose a novel decoding method\nto enhance coherence and meaningfulness in the generated poetry, effectively\nmanaging the tradeoff between diversity and quality. Furthermore, the results\nof our training approach and the proposed decoding method are evaluated through a\ncomprehensive set of automatic and human evaluations, which demonstrates their superior\ncapability to generate coherent and meaningful poetry compared to other\ndecoding methods and an existing Persian large language model (LLM).\n","authors":["Amir Panahandeh","Hanie Asemi","Esmail Nourani"],"pdf_url":"https://arxiv.org/pdf/2312.02125v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09971v3","updated":"2023-12-04T18:51:54Z","published":"2023-10-15T22:20:39Z","title":"AMAGO: Scalable In-Context Reinforcement Learning for Adaptive Agents","summary":" We introduce AMAGO, an in-context Reinforcement Learning (RL) agent that uses\nsequence models to tackle the challenges of generalization, long-term memory,\nand meta-learning. Recent works have shown that off-policy learning can make\nin-context RL with recurrent policies viable. Nonetheless, these approaches\nrequire extensive tuning and limit scalability by creating key bottlenecks in\nagents' memory capacity, planning horizon, and model size. AMAGO revisits and\nredesigns the off-policy in-context approach to successfully train\nlong-sequence Transformers over entire rollouts in parallel with end-to-end RL.\nOur agent is uniquely scalable and applicable to a wide range of problems. We\ndemonstrate its strong performance empirically in meta-RL and long-term memory\ndomains. AMAGO's focus on sparse rewards and off-policy data also allows\nin-context learning to extend to goal-conditioned problems with challenging\nexploration. When combined with a novel hindsight relabeling scheme, AMAGO can\nsolve a previously difficult category of open-world domains, where agents\ncomplete many possible instructions in procedurally generated environments. 
We\nevaluate our agent on three goal-conditioned domains and study how its\nindividual improvements connect to create a generalist policy.\n","authors":["Jake Grigsby","Linxi Fan","Yuke Zhu"],"pdf_url":"https://arxiv.org/pdf/2310.09971v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02124v1","updated":"2023-12-04T18:51:44Z","published":"2023-12-04T18:51:44Z","title":"VerA: Versatile Anonymization Fit for Clinical Facial Images","summary":" The escalating legislative demand for data privacy in facial image\ndissemination has underscored the significance of image anonymization. Recent\nadvancements in the field surpass traditional pixelation or blur methods, yet\nthey predominantly address regular single images. This leaves clinical image\nanonymization -- a necessity for illustrating medical interventions -- largely\nunaddressed. We present VerA, a versatile facial image anonymization that is\nfit for clinical facial images where: (1) certain semantic areas must be\npreserved to show medical intervention results, and (2) anonymizing image pairs\nis crucial for showing before-and-after results. VerA outperforms or is on par\nwith state-of-the-art methods in de-identification and photorealism for regular\nimages. In addition, we validate our results on paired anonymization, and on\nthe anonymization of both single and paired clinical images with extensive\nquantitative and qualitative evaluation.\n","authors":["Majed El Helou","Doruk Cetin","Petar Stamenkovic","Fabio Zund"],"pdf_url":"https://arxiv.org/pdf/2312.02124v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02119v1","updated":"2023-12-04T18:49:23Z","published":"2023-12-04T18:49:23Z","title":"Tree of Attacks: Jailbreaking Black-Box LLMs Automatically","summary":" While Large Language Models (LLMs) display versatile functionality, they\ncontinue to generate harmful, biased, and toxic content, as demonstrated by the\nprevalence of human-designed jailbreaks. In this work, we present Tree of\nAttacks with Pruning (TAP), an automated method for generating jailbreaks that\nonly requires black-box access to the target LLM. TAP utilizes an LLM to\niteratively refine candidate (attack) prompts using tree-of-thoughts reasoning\nuntil one of the generated prompts jailbreaks the target. Crucially, before\nsending prompts to the target, TAP assesses them and prunes the ones unlikely\nto result in jailbreaks. Using tree-of-thought reasoning allows TAP to navigate\na large search space of prompts and pruning reduces the total number of queries\nsent to the target. In empirical evaluations, we observe that TAP generates\nprompts that jailbreak state-of-the-art LLMs (including GPT4 and GPT4-Turbo)\nfor more than 80% of the prompts using only a small number of queries. This\nsignificantly improves upon the previous state-of-the-art black-box method for\ngenerating jailbreaks.\n","authors":["Anay Mehrotra","Manolis Zampetakis","Paul Kassianik","Blaine Nelson","Hyrum Anderson","Yaron Singer","Amin Karbasi"],"pdf_url":"https://arxiv.org/pdf/2312.02119v1.pdf","comment":"An implementation of the presented method is available at\n https://github.com/RICommunity/TAP"},{"id":"http://arxiv.org/abs/2312.02111v1","updated":"2023-12-04T18:43:45Z","published":"2023-12-04T18:43:45Z","title":"TriDeNT: Triple Deep Network Training for Privileged Knowledge\n Distillation in Histopathology","summary":" Computational pathology models rarely utilise data that will not be available\nfor inference. 
This means most models cannot learn from highly informative data\nsuch as additional immunohistochemical (IHC) stains and spatial\ntranscriptomics. We present TriDeNT, a novel self-supervised method for\nutilising privileged data that is not available during inference to improve\nperformance. We demonstrate the efficacy of this method for a range of\ndifferent paired data including immunohistochemistry, spatial transcriptomics\nand expert nuclei annotations. In all settings, TriDeNT outperforms other\nstate-of-the-art methods in downstream tasks, with observed improvements of up\nto 101%. Furthermore, we provide qualitative and quantitative measurements of\nthe features learned by these models and how they differ from baselines.\nTriDeNT offers a novel method to distil knowledge from scarce or costly data\nduring training, to create significantly better models for routine inputs.\n","authors":["Lucas Farndale","Robert Insall","Ke Yuan"],"pdf_url":"https://arxiv.org/pdf/2312.02111v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04527v3","updated":"2023-12-04T18:42:25Z","published":"2023-06-07T15:36:26Z","title":"ContriMix: Unsupervised disentanglement of content and attribute for\n domain generalization in microscopy image analysis","summary":" Domain generalization is critical for real-world applications of machine\nlearning to microscopy images, including histopathology and fluorescence\nimaging. Artifacts in these modalities arise through a complex combination of\nfactors relating to tissue collection and laboratory processing, as well as\nfactors intrinsic to patient samples. In fluorescence imaging, these artifacts\nstem from variations across experimental batches. The complexity and subtlety\nof these artifacts make the enumeration of data domains intractable. Therefore,\naugmentation-based methods of domain generalization that require domain\nidentifiers and manual fine-tuning are inadequate in this setting. To overcome\nthis challenge, we introduce ContriMix, a domain generalization technique that\nlearns to generate synthetic images by disentangling and permuting the\nbiological content (\"content\") and technical variations (\"attributes\") in\nmicroscopy images. ContriMix does not rely on domain identifiers or handcrafted\naugmentations and makes no assumptions about the input characteristics of\nimages. We assess the performance of ContriMix on two pathology datasets\ndealing with patch classification and Whole Slide Image label prediction tasks\nrespectively (Camelyon17-WILDS and RCC subtyping), and one fluorescence\nmicroscopy dataset (RxRx1-WILDS). Without any access to domain identifiers at\ntrain or test time, ContriMix performs similar or better than current\nstate-of-the-art methods in all these datasets, motivating its usage for\nmicroscopy image analysis in real-world settings where domain information is\nhard to come by. The code for ContriMix can be found at\nhttps://gitlab.com/huutan86/contrimix\n","authors":["Tan H. 
Nguyen","Dinkar Juyal","Jin Li","Aaditya Prakash","Shima Nofallah","Chintan Shah","Sai Chowdary Gullapally","Limin Yu","Michael Griffin","Anand Sampat","John Abel","Justin Lee","Amaro Taylor-Weiner"],"pdf_url":"https://arxiv.org/pdf/2306.04527v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18206v2","updated":"2023-12-04T18:42:03Z","published":"2023-11-30T02:56:43Z","title":"SCOPE-RL: A Python Library for Offline Reinforcement Learning and\n Off-Policy Evaluation","summary":" This paper introduces SCOPE-RL, a comprehensive open-source Python software\ndesigned for offline reinforcement learning (offline RL), off-policy evaluation\n(OPE), and selection (OPS). Unlike most existing libraries that focus solely on\neither policy learning or evaluation, SCOPE-RL seamlessly integrates these two\nkey aspects, facilitating flexible and complete implementations of both offline\nRL and OPE processes. SCOPE-RL put particular emphasis on its OPE modules,\noffering a range of OPE estimators and robust evaluation-of-OPE protocols. This\napproach enables more in-depth and reliable OPE compared to other packages. For\ninstance, SCOPE-RL enhances OPE by estimating the entire reward distribution\nunder a policy rather than its mere point-wise expected value. Additionally,\nSCOPE-RL provides a more thorough evaluation-of-OPE by presenting the\nrisk-return tradeoff in OPE results, extending beyond mere accuracy evaluations\nin existing OPE literature. SCOPE-RL is designed with user accessibility in\nmind. Its user-friendly APIs, comprehensive documentation, and a variety of\neasy-to-follow examples assist researchers and practitioners in efficiently\nimplementing and experimenting with various offline RL methods and OPE\nestimators, tailored to their specific problem contexts. The documentation of\nSCOPE-RL is available at https://scope-rl.readthedocs.io/en/latest/.\n","authors":["Haruka Kiyohara","Ren Kishimoto","Kosuke Kawakami","Ken Kobayashi","Kazuhide Nakata","Yuta Saito"],"pdf_url":"https://arxiv.org/pdf/2311.18206v2.pdf","comment":"preprint, open-source software:\n https://github.com/hakuhodo-technologies/scope-rl"},{"id":"http://arxiv.org/abs/2311.18207v2","updated":"2023-12-04T18:37:30Z","published":"2023-11-30T02:56:49Z","title":"Towards Assessing and Benchmarking Risk-Return Tradeoff of Off-Policy\n Evaluation","summary":" Off-Policy Evaluation (OPE) aims to assess the effectiveness of\ncounterfactual policies using only offline logged data and is often used to\nidentify the top-k promising policies for deployment in online A/B tests.\nExisting evaluation metrics for OPE estimators primarily focus on the\n\"accuracy\" of OPE or that of downstream policy selection, neglecting\nrisk-return tradeoff in the subsequent online policy deployment. To address\nthis issue, we draw inspiration from portfolio evaluation in finance and\ndevelop a new metric, called SharpeRatio@k, which measures the risk-return\ntradeoff of policy portfolios formed by an OPE estimator under varying online\nevaluation budgets (k). We validate our metric in two example scenarios,\ndemonstrating its ability to effectively distinguish between low-risk and\nhigh-risk estimators and to accurately identify the most efficient estimator.\nThis efficient estimator is characterized by its capability to form the most\nadvantageous policy portfolios, maximizing returns while minimizing risks\nduring online deployment, a nuance that existing metrics typically overlook. 
To\nfacilitate a quick, accurate, and consistent evaluation of OPE via\nSharpeRatio@k, we have also integrated this metric into the open-source\nsoftware SCOPE-RL. Employing SharpeRatio@k and SCOPE-RL, we conduct\ncomprehensive benchmarking experiments on various estimators and RL tasks,\nfocusing on their risk-return tradeoff. These experiments offer several\ninteresting directions and suggestions for future OPE research.\n","authors":["Haruka Kiyohara","Ren Kishimoto","Kosuke Kawakami","Ken Kobayashi","Kazuhide Nakata","Yuta Saito"],"pdf_url":"https://arxiv.org/pdf/2311.18207v2.pdf","comment":"preprint, under review"},{"id":"http://arxiv.org/abs/2312.02102v1","updated":"2023-12-04T18:26:31Z","published":"2023-12-04T18:26:31Z","title":"Mitigating Data Injection Attacks on Federated Learning","summary":" Federated learning is a technique that allows multiple entities to\ncollaboratively train models using their data without compromising data\nprivacy. However, despite its advantages, federated learning can be susceptible\nto false data injection attacks. In these scenarios, a malicious entity with\ncontrol over specific agents in the network can manipulate the learning\nprocess, leading to a suboptimal model. Consequently, addressing these data\ninjection attacks presents a significant research challenge in federated\nlearning systems. In this paper, we propose a novel technique to detect and\nmitigate data injection attacks on federated learning systems. Our mitigation\nmethod is a local scheme, performed during a single instance of training by the\ncoordinating node, allowing mitigation during the convergence of the\nalgorithm. Whenever an agent is suspected of being an attacker, its data will be\nignored for a certain period, and this decision will be re-evaluated periodically. We\nprove that with probability 1, after a finite time, all attackers will be\nignored while the probability of ignoring a truthful agent becomes 0, provided\nthat there is a majority of truthful agents. Simulations show that when the\ncoordinating node detects and isolates all the attackers, the model recovers\nand converges to the truthful model.\n","authors":["Or Shalom","Amir Leshem","Waheed U. Bajwa"],"pdf_url":"https://arxiv.org/pdf/2312.02102v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13771v2","updated":"2023-12-04T18:23:59Z","published":"2023-07-25T19:07:03Z","title":"Accuracy Improvement in Differentially Private Logistic Regression: A\n Pre-training Approach","summary":" Machine learning (ML) models can memorize training datasets. As a result,\ntraining ML models over private datasets can lead to the violation of\nindividuals' privacy. Differential privacy (DP) is a rigorous privacy notion to\npreserve the privacy of underlying training datasets. Yet, training ML models\nin a DP framework usually degrades the accuracy of ML models. This paper aims\nto boost the accuracy of a DP logistic regression (LR) via a pre-training\nmodule. In more detail, we initially pre-train our LR model on a public\ntraining dataset that raises no privacy concerns. Then, we fine-tune\nour DP-LR model with the private dataset. 
In the numerical results, we show\nthat adding a pre-training module significantly improves the accuracy of the\nDP-LR model.\n","authors":["Mohammad Hoseinpour","Milad Hoseinpour","Ali Aghagolzadeh"],"pdf_url":"https://arxiv.org/pdf/2307.13771v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.07902v3","updated":"2023-12-04T18:19:03Z","published":"2023-01-19T06:08:01Z","title":"A Nonstochastic Control Approach to Optimization","summary":" Selecting the best hyperparameters for a particular optimization instance,\nsuch as the learning rate and momentum, is an important but nonconvex problem.\nAs a result, iterative optimization methods such as hypergradient descent lack\nglobal optimality guarantees in general.\n We propose an online nonstochastic control methodology for mathematical\noptimization. First, we formalize the setting of meta-optimization, an online\nlearning formulation of learning the best optimization algorithm from a class\nof methods. The meta-optimization problem over gradient-based methods can be\nframed as a feedback control problem over the choice of hyperparameters,\nincluding the learning rate, momentum, and the preconditioner.\n Although the original optimal control problem is nonconvex, we show how\nrecent methods from online nonstochastic control using convex relaxations can\nbe used to overcome the challenge of nonconvexity, and obtain regret guarantees\nagainst the best offline solution. This guarantees that in meta-optimization,\ngiven a sequence of optimization problems, we can learn a method that attains\nconvergence comparable to that of the best optimization method in hindsight\nfrom a class of methods.\n","authors":["Xinyi Chen","Elad Hazan"],"pdf_url":"https://arxiv.org/pdf/2301.07902v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02095v1","updated":"2023-12-04T18:13:58Z","published":"2023-12-04T18:13:58Z","title":"Single-sample versus case-control sampling scheme for Positive Unlabeled\n data: the story of two scenarios","summary":" In this paper we argue that the performance of classifiers based on Empirical\nRisk Minimization (ERM) for positive unlabeled data, which are designed for the\ncase-control sampling scheme, may significantly deteriorate when applied to a\nsingle-sample scenario. We reveal why their behavior depends, in all but very\nspecific cases, on the scenario. Also, we introduce a single-sample case\nanalogue of the popular non-negative risk classifier designed for case-control\ndata and compare its performance with the original proposal. We show that\nsignificant differences occur between them, especially when half or more of the\npositive observations are labeled. The opposite case, when an ERM minimizer\ndesigned for the case-control scheme is applied to single-sample data, is also\nconsidered and similar conclusions are drawn. Taking into account the difference of\nscenarios requires a single, but crucial, change in the definition of the\nEmpirical Risk.\n","authors":["Jan Mielniczuk","Adam Wawrzeńczyk"],"pdf_url":"https://arxiv.org/pdf/2312.02095v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.01432v2","updated":"2023-12-04T17:51:53Z","published":"2022-09-03T14:17:58Z","title":"From Monte Carlo to neural networks approximations of boundary value\n problems","summary":" In this paper we study probabilistic and neural network approximations for\nsolutions to the Poisson equation subject to H\\\"older data in general bounded\ndomains of $\\mathbb{R}^d$. 
We aim at two fundamental goals.\n First, and most importantly, we show that the solution to the Poisson\nequation can be numerically approximated in the sup-norm by Monte Carlo\nmethods, and that this can be done highly efficiently if we use a modified\nversion of the walk on spheres algorithm as an acceleration method. This\nprovides estimates which are efficient with respect to the prescribed\napproximation error and with polynomial complexity in the dimension and the\nreciprocal of the error. A crucial feature is that the overall number of\nsamples does not depend on the point at which the approximation is\nperformed.\n As a second goal, we show that the obtained Monte Carlo solver renders in a\nconstructive way ReLU deep neural network (DNN) solutions to the Poisson problem,\nwhose sizes depend at most polynomially in the dimension $d$ and in the desired\nerror. In fact we show that the random DNN provides with high probability a\nsmall approximation error and low polynomial complexity in the dimension.\n","authors":["Lucian Beznea","Iulian Cimpean","Oana Lupascu-Stamate","Ionel Popescu","Arghir Zarnescu"],"pdf_url":"https://arxiv.org/pdf/2209.01432v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02079v1","updated":"2023-12-04T17:46:57Z","published":"2023-12-04T17:46:57Z","title":"Deep Set Neural Networks for forecasting asynchronous bioprocess\n timeseries","summary":" Cultivation experiments often produce sparse and irregular time series.\nClassical approaches based on mechanistic models, like Maximum Likelihood\nfitting or Monte-Carlo Markov chain sampling, can easily account for sparsity\nand time-grid irregularities, but most statistical and Machine Learning tools\nare not designed for handling sparse data out-of-the-box. Among popular\napproaches there are various schemes for filling missing values (imputation)\nand interpolation into a regular grid (alignment). However, such methods\ntransfer the biases of the interpolation or imputation models to the target\nmodel. We show that Deep Set Neural Networks equipped with triplet encoding of\nthe input data can successfully handle bio-process data without any need for\nimputation or alignment procedures. The method is agnostic to the particular\nnature of the time series and can be adapted for any task, for example, online\nmonitoring, predictive control, design of experiments, etc. In this work, we\nfocus on forecasting. We argue that such an approach is especially suitable for\ntypical cultivation processes, demonstrate the performance of the method on\nseveral forecasting tasks using data generated from macrokinetic growth models\nunder realistic conditions, and compare the method to a conventional fitting\nprocedure and methods based on imputation and alignment.\n","authors":["Maxim Borisyak","Stefan Born","Peter Neubauer","Nicolás Cruz-Bournazou"],"pdf_url":"https://arxiv.org/pdf/2312.02079v1.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2312.02078v1","updated":"2023-12-04T17:41:52Z","published":"2023-12-04T17:41:52Z","title":"Integrating AI into CCTV Systems: A Comprehensive Evaluation of Smart\n Video Surveillance in Community Space","summary":" This article presents an AI-enabled Smart Video Surveillance (SVS) designed\nto enhance safety in community spaces such as educational and recreational\nareas, and small businesses. 
The proposed system innovatively integrates with\nexisting CCTV and wired camera networks, simplifying its adoption across\nvarious community cases to leverage recent AI advancements. Our SVS system,\nfocusing on privacy, uses metadata instead of pixel data for activity\nrecognition, aligning with ethical standards. It features cloud-based\ninfrastructure and a mobile app for real-time, privacy-conscious alerts in\ncommunities.\n This article notably pioneers a comprehensive real-world evaluation of the\nSVS system, covering AI-driven visual processing, statistical analysis,\ndatabase management, cloud communication, and user notifications. It's also the\nfirst to assess an end-to-end anomaly detection system's performance, vital for\nidentifying potential public safety incidents.\n For our evaluation, we implemented the system in a community college, serving\nas an ideal model to exemplify the proposed system's capabilities. Our findings\nin this setting demonstrate the system's robustness, with throughput, latency,\nand scalability effectively managing 16 CCTV cameras. The system maintained a\nconsistent 16.5 frames per second (FPS) over a 21-hour operation. The average\nend-to-end latency for detecting behavioral anomalies and alerting users was\n26.76 seconds.\n","authors":["Shanle Yao","Babak Rahimi Ardabili","Armin Danesh Pazho","Ghazal Alinezhad Noghre","Christopher Neff","Hamed Tabkhi"],"pdf_url":"https://arxiv.org/pdf/2312.02078v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02074v1","updated":"2023-12-04T17:37:41Z","published":"2023-12-04T17:37:41Z","title":"Federated Learning is Better with Non-Homomorphic Encryption","summary":" Traditional AI methodologies necessitate centralized data collection, which\nbecomes impractical when facing problems with network communication, data\nprivacy, or storage capacity. Federated Learning (FL) offers a paradigm that\nempowers distributed AI model training without collecting raw data. There are\ndifferent choices for providing privacy during FL training. One of the popular\nmethodologies is employing Homomorphic Encryption (HE) - a breakthrough in\nprivacy-preserving computation from Cryptography. However, these methods have a\nprice in the form of extra computation and memory footprint. To resolve these\nissues, we propose an innovative framework that synergizes permutation-based\ncompressors with Classical Cryptography, even though employing Classical\nCryptography was assumed to be impossible in the past in the context of FL. Our\nframework offers a way to replace HE with cheaper Classical Cryptography\nprimitives which provides security for the training process. It fosters\nasynchronous communication and provides flexible deployment options in various\ncommunication topologies.\n","authors":["Konstantin Burlachenko","Abdulmajeed Alrowithi","Fahad Ali Albalawi","Peter Richtarik"],"pdf_url":"https://arxiv.org/pdf/2312.02074v1.pdf","comment":"56 pages, 10 figures, Accepted to presentation and proceedings to 4th\n ACM International Workshop on Distributed Machine Learning"},{"id":"http://arxiv.org/abs/2310.18144v3","updated":"2023-12-04T17:32:31Z","published":"2023-10-27T13:51:18Z","title":"Improving Intrinsic Exploration by Creating Stationary Objectives","summary":" Exploration bonuses in reinforcement learning guide long-horizon exploration\nby defining custom intrinsic objectives. 
Several exploration objectives like\ncount-based bonuses, pseudo-counts, and state-entropy maximization are\nnon-stationary and hence are difficult to optimize for the agent. While this\nissue is generally known, it is usually omitted and solutions remain\nunder-explored. The key contribution of our work lies in transforming the\noriginal non-stationary rewards into stationary rewards through an augmented\nstate representation. For this purpose, we introduce the Stationary Objectives\nFor Exploration (SOFE) framework. SOFE requires identifying sufficient\nstatistics for different exploration bonuses and finding an efficient encoding\nof these statistics to use as input to a deep network. SOFE is based on\nproposing state augmentations that expand the state space but hold the promise\nof simplifying the optimization of the agent's objective. We show that SOFE\nimproves the performance of several exploration objectives, including\ncount-based bonuses, pseudo-counts, and state-entropy maximization. Moreover,\nSOFE outperforms prior methods that attempt to stabilize the optimization of\nintrinsic objectives. We demonstrate the efficacy of SOFE in hard-exploration\nproblems, including sparse-reward tasks, pixel-based observations, 3D\nnavigation, and procedurally generated environments.\n","authors":["Roger Creus Castanyer","Joshua Romoff","Glen Berseth"],"pdf_url":"https://arxiv.org/pdf/2310.18144v3.pdf","comment":"Accepted at the ALOE 2023 Workshop at NeurIPS. Under Review at ICLR"},{"id":"http://arxiv.org/abs/2312.02063v1","updated":"2023-12-04T17:19:37Z","published":"2023-12-04T17:19:37Z","title":"The GPU Phase Folding and Deep Learning Method for Detecting Exoplanet\n Transits","summary":" This paper presents GPFC, a novel Graphics Processing Unit (GPU) Phase\nFolding and Convolutional Neural Network (CNN) system to detect exoplanets\nusing the transit method. We devise a fast folding algorithm parallelized on a\nGPU to amplify low signal-to-noise ratio transit signals, allowing a search at\nhigh precision and speed. A CNN trained on two million synthetic light curves\nreports a score indicating the likelihood of a planetary signal at each period.\nGPFC improves on speed by three orders of magnitude over the predominant\nBox-fitting Least Squares (BLS) method. Our simulation results show GPFC\nachieves 97% training accuracy, higher true positive rate at the same false\npositive rate of detection, and higher precision at the same recall rate when\ncompared to BLS. GPFC recovers 100% of known ultra-short-period planets in\nKepler light curves from a blind search. These results highlight the promise of\nGPFC as an alternative approach to the traditional BLS algorithm for finding\nnew transiting exoplanets in data taken with Kepler and other space transit\nmissions such as K2, TESS and future PLATO and Earth 2.0.\n","authors":["Kaitlyn Wang","Kevin Wang","Jian Ge","Yinan Zhao","Kevin Willis"],"pdf_url":"https://arxiv.org/pdf/2312.02063v1.pdf","comment":"12 pages, 17 figures; To be published in the Monthly Notices of the\n Royal Astronomical Society (MNRAS)"},{"id":"http://arxiv.org/abs/2312.02052v1","updated":"2023-12-04T17:10:25Z","published":"2023-12-04T17:10:25Z","title":"DUCK: Distance-based Unlearning via Centroid Kinematics","summary":" Machine Unlearning is rising as a new field, driven by the pressing necessity\nof ensuring privacy in modern artificial intelligence models. 
This technique\nprimarily aims to eradicate any residual influence of a specific subset of data\nfrom the knowledge acquired by a neural model during its training. This work\nintroduces a novel unlearning algorithm, denoted as Distance-based Unlearning\nvia Centroid Kinematics (DUCK), which employs metric learning to guide the\nremoval of samples matching the nearest incorrect centroid in the embedding\nspace. Evaluation of the algorithm's performance is conducted across various\nbenchmark datasets in two distinct scenarios, class removal, and homogeneous\nsampling removal, obtaining state-of-the-art performance. We introduce a novel\nmetric, called Adaptive Unlearning Score (AUS), encompassing not only the\nefficacy of the unlearning process in forgetting target data but also\nquantifying the performance loss relative to the original model. Moreover, we\npropose a novel membership inference attack to assess the algorithm's capacity\nto erase previously acquired knowledge, designed to be adaptable to future\nmethodologies.\n","authors":["Marco Cotogni","Jacopo Bonato","Luigi Sabetta","Francesco Pelosin","Alessandro Nicolosi"],"pdf_url":"https://arxiv.org/pdf/2312.02052v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02129v3","updated":"2023-12-04T17:06:12Z","published":"2023-07-05T09:11:09Z","title":"How Deep Neural Networks Learn Compositional Data: The Random Hierarchy\n Model","summary":" Deep learning algorithms demonstrate a surprising ability to learn\nhigh-dimensional tasks from limited examples. This is commonly attributed to\nthe depth of neural networks, enabling them to build a hierarchy of abstract,\nlow-dimensional data representations. However, how many training examples are\nrequired to learn such representations remains unknown. To quantitatively study\nthis question, we introduce the Random Hierarchy Model: a family of synthetic\ntasks inspired by the hierarchical structure of language and images. The model\nis a classification task where each class corresponds to a group of high-level\nfeatures, chosen among several equivalent groups associated with the same\nclass. In turn, each feature corresponds to a group of sub-features chosen\namong several equivalent ones and so on, following a hierarchy of composition\nrules. We find that deep networks learn the task by developing internal\nrepresentations invariant to exchanging equivalent groups. Moreover, the number\nof data required corresponds to the point where correlations between low-level\nfeatures and classes become detectable. Overall, our results indicate how deep\nnetworks overcome the curse of dimensionality by building invariant\nrepresentations, and provide an estimate of the number of data required to\nlearn a hierarchical task.\n","authors":["Francesco Cagnetta","Leonardo Petrini","Umberto M. Tomasini","Alessandro Favero","Matthieu Wyart"],"pdf_url":"https://arxiv.org/pdf/2307.02129v3.pdf","comment":"9 pages, 8 figures"},{"id":"http://arxiv.org/abs/2311.11629v2","updated":"2023-12-04T17:01:20Z","published":"2023-11-20T09:28:04Z","title":"Generating Realistic Counterfactuals for Retinal Fundus and OCT Images\n using Diffusion Models","summary":" Counterfactual reasoning is often used in clinical settings to explain\ndecisions or weigh alternatives. 
Therefore, for imaging based specialties such\nas ophthalmology, it would be beneficial to be able to create counterfactual\nimages, illustrating answers to questions like \"If the subject had had diabetic\nretinopathy, how would the fundus image have looked?\". Here, we demonstrate\nthat using a diffusion model in combination with an adversarially robust\nclassifier trained on retinal disease classification tasks enables the\ngeneration of highly realistic counterfactuals of retinal fundus images and\noptical coherence tomography (OCT) B-scans. The key to the realism of\ncounterfactuals is that these classifiers encode salient features indicative\nfor each disease class and can steer the diffusion model to depict disease\nsigns or remove disease-related lesions in a realistic way. In a user study,\ndomain experts also found the counterfactuals generated using our method\nsignificantly more realistic than counterfactuals generated from a previous\nmethod, and even indistinguishable from real images.\n","authors":["Indu Ilanchezian","Valentyn Boreiko","Laura Kühlewein","Ziwei Huang","Murat Seçkin Ayhan","Matthias Hein","Lisa Koch","Philipp Berens"],"pdf_url":"https://arxiv.org/pdf/2311.11629v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04848v2","updated":"2023-12-04T17:00:18Z","published":"2023-06-08T00:56:33Z","title":"Interpreting and Improving Diffusion Models Using the Euclidean Distance\n Function","summary":" Denoising is intuitively related to projection. Indeed, under the manifold\nhypothesis, adding random noise is approximately equivalent to orthogonal\nperturbation. Hence, learning to denoise is approximately learning to project.\nIn this paper, we use this observation to reinterpret denoising diffusion\nmodels as approximate gradient descent applied to the Euclidean distance\nfunction. We then provide straight-forward convergence analysis of the DDIM\nsampler under simple assumptions on the projection-error of the denoiser.\nFinally, we propose a new sampler based on two simple modifications to DDIM\nusing insights from our theoretical results. In as few as 5-10 function\nevaluations, our sampler achieves state-of-the-art FID scores on pretrained\nCIFAR-10 and CelebA models and can generate high quality samples on latent\ndiffusion models.\n","authors":["Frank Permenter","Chenyang Yuan"],"pdf_url":"https://arxiv.org/pdf/2306.04848v2.pdf","comment":"19 pages, 8 figures, 2 tables"},{"id":"http://arxiv.org/abs/2312.02037v1","updated":"2023-12-04T16:54:40Z","published":"2023-12-04T16:54:40Z","title":"GFS: Graph-based Feature Synthesis for Prediction over Relational\n Databases","summary":" Relational databases are extensively utilized in a variety of modern\ninformation system applications, and they always carry valuable data patterns.\nThere are a huge number of data mining or machine learning tasks conducted on\nrelational databases. However, it is worth noting that there are limited\nmachine learning models specifically designed for relational databases, as most\nmodels are primarily tailored for single table settings. Consequently, the\nprevalent approach for training machine learning models on data stored in\nrelational databases involves performing feature engineering to merge the data\nfrom multiple tables into a single table and subsequently applying single table\nmodels. This approach not only requires significant effort in feature\nengineering but also destroys the inherent relational structure present in the\ndata. 
To address these challenges, we propose a novel framework called\nGraph-based Feature Synthesis (GFS). GFS formulates the relational database as\na heterogeneous graph, thereby preserving the relational structure within the\ndata. By leveraging the inductive bias from single table models, GFS\neffectively captures the intricate relationships inherent in each table.\nAdditionally, the whole framework eliminates the need for manual feature\nengineering. In the extensive experiment over four real-world multi-table\nrelational databases, GFS outperforms previous methods designed for relational\ndatabases, demonstrating its superior performance.\n","authors":["Han Zhang","Quan Gan","David Wipf","Weinan Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.02037v1.pdf","comment":"13 pages, 5 figures, VLDB 2024 under review"},{"id":"http://arxiv.org/abs/2312.02027v1","updated":"2023-12-04T16:49:43Z","published":"2023-12-04T16:49:43Z","title":"Stochastic Optimal Control Matching","summary":" Stochastic optimal control, which has the goal of driving the behavior of\nnoisy systems, is broadly applicable in science, engineering and artificial\nintelligence. Our work introduces Stochastic Optimal Control Matching (SOCM), a\nnovel Iterative Diffusion Optimization (IDO) technique for stochastic optimal\ncontrol that stems from the same philosophy as the conditional score matching\nloss for diffusion models. That is, the control is learned via a least squares\nproblem by trying to fit a matching vector field. The training loss, which is\nclosely connected to the cross-entropy loss, is optimized with respect to both\nthe control function and a family of reparameterization matrices which appear\nin the matching vector field. The optimization with respect to the\nreparameterization matrices aims at minimizing the variance of the matching\nvector field. Experimentally, our algorithm achieves lower error than all the\nexisting IDO techniques for stochastic optimal control for four different\ncontrol settings. The key idea underlying SOCM is the path-wise\nreparameterization trick, a novel technique that is of independent interest,\ne.g., for generative modeling.\n","authors":["Carles Domingo-Enrich","Jiequn Han","Brandon Amos","Joan Bruna","Ricky T. Q. Chen"],"pdf_url":"https://arxiv.org/pdf/2312.02027v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02021v1","updated":"2023-12-04T16:46:38Z","published":"2023-12-04T16:46:38Z","title":"VLTSeg: Simple Transfer of CLIP-Based Vision-Language Representations\n for Domain Generalized Semantic Segmentation","summary":" Domain generalization (DG) remains a significant challenge for perception\nbased on deep neural networks (DNN), where domain shifts occur due to lighting,\nweather, or geolocation changes. In this work, we propose VLTSeg to enhance\ndomain generalization in semantic segmentation, where the network is solely\ntrained on the source domain and evaluated on unseen target domains. Our method\nleverages the inherent semantic robustness of vision-language models. First, by\nsubstituting traditional vision-only backbones with pre-trained encoders from\nCLIP and EVA-CLIP as transfer learning setting we find that in the field of DG,\nvision-language pre-training significantly outperforms supervised and\nself-supervised vision pre-training. 
We thus propose a new vision-language\napproach for domain generalized segmentation, which improves the domain\ngeneralization SOTA by 7.6% mIoU when training on the synthetic GTA5 dataset.\nWe further show the superior generalization capabilities of vision-language\nsegmentation models by reaching 76.48% mIoU on the popular Cityscapes-to-ACDC\nbenchmark, outperforming the previous SOTA approach by 6.9% mIoU on the test\nset at the time of writing. Additionally, our approach shows strong in-domain\ngeneralization capabilities indicated by 86.1% mIoU on the Cityscapes test set,\nresulting in a shared first place with the previous SOTA on the current\nleaderboard at the time of submission.\n","authors":["Christoph Hümmer","Manuel Schwonberg","Liangwei Zhong","Hu Cao","Alois Knoll","Hanno Gottschalk"],"pdf_url":"https://arxiv.org/pdf/2312.02021v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16849v2","updated":"2023-12-04T16:44:45Z","published":"2023-09-28T20:59:51Z","title":"Space-Time Attention with Shifted Non-Local Search","summary":" Efficiently computing attention maps for videos is challenging due to the\nmotion of objects between frames. While a standard non-local search is\nhigh-quality for a window surrounding each query point, the window's small size\ncannot accommodate motion. Methods for long-range motion use an auxiliary\nnetwork to predict the most similar key coordinates as offsets from each query\nlocation. However, accurately predicting this flow field of offsets remains\nchallenging, even for large-scale networks. Small spatial inaccuracies\nsignificantly impact the attention module's quality. This paper proposes a\nsearch strategy that combines the quality of a non-local search with the range\nof predicted offsets. The method, named Shifted Non-Local Search, executes a\nsmall grid search surrounding the predicted offsets to correct small spatial\nerrors. Our method's in-place computation consumes 10 times less memory and is\nover 3 times faster than previous work. Experimentally, correcting the small\nspatial errors improves the video frame alignment quality by over 3 dB PSNR.\nOur search upgrades existing space-time attention modules, which improves video\ndenoising results by 0.30 dB PSNR for a 7.5% increase in overall runtime. We\nintegrate our space-time attention module into a UNet-like architecture to\nachieve state-of-the-art results on video denoising.\n","authors":["Kent Gauen","Stanley Chan"],"pdf_url":"https://arxiv.org/pdf/2309.16849v2.pdf","comment":"15 pages, 12 figures"},{"id":"http://arxiv.org/abs/2312.02019v1","updated":"2023-12-04T16:43:36Z","published":"2023-12-04T16:43:36Z","title":"Action Inference by Maximising Evidence: Zero-Shot Imitation from\n Observation with World Models","summary":" Unlike most reinforcement learning agents which require an unrealistic amount\nof environment interactions to learn a new behaviour, humans excel at learning\nquickly by merely observing and imitating others. This ability highly depends\non the fact that humans have a model of their own embodiment that allows them\nto infer the most likely actions that led to the observed behaviour. In this\npaper, we propose Action Inference by Maximising Evidence (AIME) to replicate\nthis behaviour using world models. AIME consists of two distinct phases. In the\nfirst phase, the agent learns a world model from its past experience to\nunderstand its own body by maximising the ELBO. 
While in the second phase, the\nagent is given some observation-only demonstrations of an expert performing a\nnovel task and tries to imitate the expert's behaviour. AIME achieves this by\ndefining a policy as an inference model and maximising the evidence of the\ndemonstration under the policy and world model. Our method is \"zero-shot\" in\nthe sense that it does not require further training for the world model or\nonline interactions with the environment after given the demonstration. We\nempirically validate the zero-shot imitation performance of our method on the\nWalker and Cheetah embodiment of the DeepMind Control Suite and find it\noutperforms the state-of-the-art baselines. Code is available at:\nhttps://github.com/argmax-ai/aime.\n","authors":["Xingyuan Zhang","Philip Becker-Ehmck","Patrick van der Smagt","Maximilian Karl"],"pdf_url":"https://arxiv.org/pdf/2312.02019v1.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.02012v1","updated":"2023-12-04T16:36:29Z","published":"2023-12-04T16:36:29Z","title":"Optimal Data Generation in Multi-Dimensional Parameter Spaces, using\n Bayesian Optimization","summary":" Acquiring a substantial number of data points for training accurate machine\nlearning (ML) models is a big challenge in scientific fields where data\ncollection is resource-intensive. Here, we propose a novel approach for\nconstructing a minimal yet highly informative database for training ML models\nin complex multi-dimensional parameter spaces. To achieve this, we mimic the\nunderlying relation between the output and input parameters using Gaussian\nprocess regression (GPR). Using a set of known data, GPR provides predictive\nmeans and standard deviation for the unknown data. Given the predicted standard\ndeviation by GPR, we select data points using Bayesian optimization to obtain\nan efficient database for training ML models. We compare the performance of ML\nmodels trained on databases obtained through this method, with databases\nobtained using traditional approaches. Our results demonstrate that the ML\nmodels trained on the database obtained using Bayesian optimization approach\nconsistently outperform the other two databases, achieving high accuracy with a\nsignificantly smaller number of data points. Our work contributes to the\nresource-efficient collection of data in high-dimensional complex parameter\nspaces, to achieve high precision machine learning predictions.\n","authors":["M. R. Mahani","Igor A. Nechepurenko","Yasmin Rahimof","Andreas Wicht"],"pdf_url":"https://arxiv.org/pdf/2312.02012v1.pdf","comment":"9 pages, 3 figures"},{"id":"http://arxiv.org/abs/2311.14743v4","updated":"2023-12-04T16:31:30Z","published":"2023-11-21T18:41:26Z","title":"A Baseline Analysis of Reward Models' Ability To Accurately Analyze\n Foundation Models Under Distribution Shift","summary":" Foundation models, specifically Large Language Models (LLM's), have lately\ngained wide-spread attention and adoption. Reinforcement Learning with Human\nFeedback (RLHF) involves training a reward model to capture desired behaviors,\nwhich is then used to align LLM's. These reward models are additionally used at\ninference-time to estimate LLM responses' adherence to those desired behaviors.\nHowever, there is little work measuring how robust these reward models are to\ndistribution shifts. In this work, we evaluate how reward model performance -\nmeasured via accuracy and calibration (i.e. alignment between accuracy and\nconfidence) - is affected by distribution shift. 
We show novel calibration\npatterns and accuracy drops due to OOD prompts and responses, and that the\nreward model is more sensitive to shifts in responses than prompts.\nAdditionally, we adapt an OOD detection technique commonly used in\nclassification to the reward model setting to detect these distribution shifts\nin prompts and responses.\n","authors":["Ben Pikus","Will LeVine","Tony Chen","Sean Hendryx"],"pdf_url":"https://arxiv.org/pdf/2311.14743v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13998v4","updated":"2023-12-04T16:27:04Z","published":"2023-05-23T12:27:56Z","title":"SMT 2.0: A Surrogate Modeling Toolbox with a focus on Hierarchical and\n Mixed Variables Gaussian Processes","summary":" The Surrogate Modeling Toolbox (SMT) is an open-source Python package that\noffers a collection of surrogate modeling methods, sampling techniques, and a\nset of sample problems. This paper presents SMT 2.0, a major new release of SMT\nthat introduces significant upgrades and new features to the toolbox. This\nrelease adds the capability to handle mixed-variable surrogate models and\nhierarchical variables. These types of variables are becoming increasingly\nimportant in several surrogate modeling applications. SMT 2.0 also improves SMT\nby extending sampling methods, adding new surrogate models, and computing\nvariance and kernel derivatives for Kriging. This release also includes new\nfunctions to handle noisy and use multifidelity data. To the best of our\nknowledge, SMT 2.0 is the first open-source surrogate library to propose\nsurrogate models for hierarchical and mixed inputs. This open-source software\nis distributed under the New BSD license.\n","authors":["Paul Saves","Remi Lafage","Nathalie Bartoli","Youssef Diouane","Jasper Bussemaker","Thierry Lefebvre","John T. Hwang","Joseph Morlier","Joaquim R. R. A. Martins"],"pdf_url":"https://arxiv.org/pdf/2305.13998v4.pdf","comment":"version 4"},{"id":"http://arxiv.org/abs/2311.06650v2","updated":"2023-12-04T16:18:37Z","published":"2023-11-11T19:39:50Z","title":"Heuristic Optimal Transport in Branching Networks","summary":" Optimal transport aims to learn a mapping of sources to targets by minimizing\nthe cost, which is typically defined as a function of distance. The solution to\nthis problem consists of straight line segments optimally connecting sources to\ntargets, and it does not exhibit branching. These optimal solutions are in\nstark contrast with both natural, and man-made transportation networks, where\nbranching structures are prevalent. Here we discuss a fast heuristic branching\nmethod for optimal transport in networks, and we provide several applications.\n","authors":["M. Andrecut"],"pdf_url":"https://arxiv.org/pdf/2311.06650v2.pdf","comment":"11 pages, 6 figures"},{"id":"http://arxiv.org/abs/2212.09928v2","updated":"2023-12-04T16:18:33Z","published":"2022-12-20T00:33:11Z","title":"Improving the Robustness of Summarization Models by Detecting and\n Removing Input Noise","summary":" The evaluation of abstractive summarization models typically uses test data\nthat is identically distributed as training data. In real-world practice,\ndocuments to be summarized may contain input noise caused by text extraction\nartifacts or data pipeline bugs. The robustness of model performance under\ndistribution shift caused by such noise is relatively under-studied. 
We present\na large empirical study quantifying the sometimes severe loss in performance\n(up to 12 ROUGE-1 points) from different types of input noise for a range of\ndatasets and model sizes. We then propose a light-weight method for detecting\nand removing such noise in the input during model inference without requiring\nany extra training, auxiliary models, or even prior knowledge of the type of\nnoise. Our proposed approach effectively mitigates the loss in performance,\nrecovering a large fraction of the performance drop, sometimes as large as 11\nROUGE-1 points.\n","authors":["Kundan Krishna","Yao Zhao","Jie Ren","Balaji Lakshminarayanan","Jiaming Luo","Mohammad Saleh","Peter J. Liu"],"pdf_url":"https://arxiv.org/pdf/2212.09928v2.pdf","comment":"EMNLP Findings 2023 Camera Ready"},{"id":"http://arxiv.org/abs/2312.01994v1","updated":"2023-12-04T16:14:43Z","published":"2023-12-04T16:14:43Z","title":"A Generative Self-Supervised Framework using Functional Connectivity in\n fMRI Data","summary":" Deep neural networks trained on Functional Connectivity (FC) networks\nextracted from functional Magnetic Resonance Imaging (fMRI) data have gained\npopularity due to the increasing availability of data and advances in model\narchitectures, including Graph Neural Network (GNN). Recent research on the\napplication of GNN to FC suggests that exploiting the time-varying properties\nof the FC could significantly improve the accuracy and interpretability of the\nmodel prediction. However, the high cost of acquiring high-quality fMRI data\nand corresponding phenotypic labels poses a hurdle to their application in\nreal-world settings, such that a model na\\\"ively trained in a supervised\nfashion can suffer from insufficient performance or a lack of generalization on\na small number of data. In addition, most Self-Supervised Learning (SSL)\napproaches for GNNs to date adopt a contrastive strategy, which tends to lose\nappropriate semantic information when the graph structure is perturbed or does\nnot leverage both spatial and temporal information simultaneously. In light of\nthese challenges, we propose a generative SSL approach that is tailored to\neffectively harness spatio-temporal information within dynamic FC. Our\nempirical results, experimented with large-scale (>50,000) fMRI datasets,\ndemonstrate that our approach learns valuable representations and enables the\nconstruction of accurate and robust models when fine-tuned for downstream\ntasks.\n","authors":["Jungwon Choi","Seongho Keum","EungGu Yun","Byung-Hoon Kim","Juho Lee"],"pdf_url":"https://arxiv.org/pdf/2312.01994v1.pdf","comment":"NeurIPS 2023 Temporal Graph Learning Workshop"},{"id":"http://arxiv.org/abs/2311.18803v2","updated":"2023-12-04T16:13:21Z","published":"2023-11-30T18:49:43Z","title":"BioCLIP: A Vision Foundation Model for the Tree of Life","summary":" Images of the natural world, collected by a variety of cameras, from drones\nto individual phones, are increasingly abundant sources of biological\ninformation. There is an explosion of computational methods and tools,\nparticularly computer vision, for extracting biologically relevant information\nfrom images for science and conservation. Yet most of these are bespoke\napproaches designed for a specific task and are not easily adaptable or\nextendable to new questions, contexts, and datasets. A vision model for general\norganismal biology questions on images is of timely need. 
To approach this, we\ncurate and release TreeOfLife-10M, the largest and most diverse ML-ready\ndataset of biology images. We then develop BioCLIP, a foundation model for the\ntree of life, leveraging the unique properties of biology captured by\nTreeOfLife-10M, namely the abundance and variety of images of plants, animals,\nand fungi, together with the availability of rich structured biological\nknowledge. We rigorously benchmark our approach on diverse fine-grained biology\nclassification tasks, and find that BioCLIP consistently and substantially\noutperforms existing baselines (by 17% to 20% absolute). Intrinsic evaluation\nreveals that BioCLIP has learned a hierarchical representation conforming to\nthe tree of life, shedding light on its strong generalizability. Our code,\nmodels and data will be made available at\nhttps://github.com/Imageomics/bioclip.\n","authors":["Samuel Stevens","Jiaman Wu","Matthew J Thompson","Elizabeth G Campolongo","Chan Hee Song","David Edward Carlyn","Li Dong","Wasila M Dahdul","Charles Stewart","Tanya Berger-Wolf","Wei-Lun Chao","Yu Su"],"pdf_url":"https://arxiv.org/pdf/2311.18803v2.pdf","comment":"18 pages; updated title"},{"id":"http://arxiv.org/abs/2312.01991v1","updated":"2023-12-04T16:10:34Z","published":"2023-12-04T16:10:34Z","title":"Information Modified K-Nearest Neighbor","summary":" In this research paper, we introduce a novel classification method aimed at\nimproving the performance of the K-Nearest Neighbors (KNN) algorithm. Our\napproach leverages Mutual Information (MI) to enhance the significance of\nweights and draws inspiration from Shapley values, a concept originating from\ncooperative game theory, to refine value allocation. The fundamental concept\nunderlying KNN is the classification of samples based on the majority vote of\ntheir k-nearest neighbors. While both the distances and labels of these\nneighbors are crucial, traditional KNN assigns equal weight to all samples and\ndoes not consider the varying importance of each neighbor based on their\ndistances and labels.\n In the proposed method, known as Information-Modified KNN (IMKNN), we address\nthis issue by introducing a straightforward algorithm. To evaluate the\neffectiveness of our approach, it is compared with 7 contemporary variants of\nKNN, as well as the traditional KNN. Each of these variants exhibits its unique\nadvantages and limitations. We conduct experiments on 12 widely-used datasets,\nassessing the methods' performance in terms of accuracy, precision and recall.\n Our study demonstrates that IMKNN consistently outperforms other methods\nacross different datasets and criteria, highlighting its superior performance\nin various classification tasks. These findings underscore the potential of\nIMKNN as a valuable tool for enhancing the capabilities of the KNN algorithm in\ndiverse applications.\n","authors":["Mohammad Ali Vahedifar","Azim Akhtarshenas","Mariam Sabbaghian","Mohammad Rafatpanah"],"pdf_url":"https://arxiv.org/pdf/2312.01991v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02980v2","updated":"2023-12-04T16:01:13Z","published":"2023-10-04T17:17:06Z","title":"Never Train from Scratch: Fair Comparison of Long-Sequence Models\n Requires Data-Driven Priors","summary":" Modeling long-range dependencies across sequences is a longstanding goal in\nmachine learning and has led to architectures, such as state space models, that\ndramatically outperform Transformers on long sequences. 
However, these\nimpressive empirical gains have been by and large demonstrated on benchmarks\n(e.g. Long Range Arena), where models are randomly initialized and trained to\npredict a target label from an input sequence. In this work, we show that\nrandom initialization leads to gross overestimation of the differences between\narchitectures and that pretraining with standard denoising objectives, using\n$\\textit{only the downstream task data}$, leads to dramatic gains across\nmultiple architectures and to very small gaps between Transformers and state\nspace models (SSMs). In stark contrast to prior works, we find vanilla\nTransformers to match the performance of S4 on Long Range Arena when properly\npretrained, and we improve the best reported results of SSMs on the PathX-256\ntask by 20 absolute points. Subsequently, we analyze the utility of\npreviously-proposed structured parameterizations for SSMs and show they become\nmostly redundant in the presence of data-driven initialization obtained through\npretraining. Our work shows that, when evaluating different architectures on\nsupervised tasks, incorporation of data-driven priors via pretraining is\nessential for reliable performance estimation, and can be done efficiently.\n","authors":["Ido Amos","Jonathan Berant","Ankit Gupta"],"pdf_url":"https://arxiv.org/pdf/2310.02980v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.17378v2","updated":"2023-12-04T15:57:40Z","published":"2023-10-26T13:14:13Z","title":"Optimization dependent generalization bound for ReLU networks based on\n sensitivity in the tangent bundle","summary":" Recent advances in deep learning have given us some very promising results on\nthe generalization ability of deep neural networks, however literature still\nlacks a comprehensive theory explaining why heavily over-parametrized models\nare able to generalize well while fitting the training data. In this paper we\npropose a PAC type bound on the generalization error of feedforward ReLU\nnetworks via estimating the Rademacher complexity of the set of networks\navailable from an initial parameter vector via gradient descent. The key idea\nis to bound the sensitivity of the network's gradient to perturbation of the\ninput data along the optimization trajectory. The obtained bound does not\nexplicitly depend on the depth of the network. Our results are experimentally\nverified on the MNIST and CIFAR-10 datasets.\n","authors":["Dániel Rácz","Mihály Petreczky","András Csertán","Bálint Daróczy"],"pdf_url":"https://arxiv.org/pdf/2310.17378v2.pdf","comment":"17 pages, 5 figures, OPT2023: 15th Annual Workshop on Optimization\n for Machine Learning at the 37th NeurIPS 2023, New Orleans, LA, USA"},{"id":"http://arxiv.org/abs/2305.14296v2","updated":"2023-12-04T15:53:50Z","published":"2023-05-23T17:39:54Z","title":"USB: A Unified Summarization Benchmark Across Tasks and Domains","summary":" While the NLP community has produced numerous summarization benchmarks, none\nprovide the rich annotations required to simultaneously address many important\nproblems related to control and reliability. 
We introduce a Wikipedia-derived\nbenchmark, complemented by a rich set of crowd-sourced annotations, that\nsupports $8$ interrelated tasks: (i) extractive summarization; (ii) abstractive\nsummarization; (iii) topic-based summarization; (iv) compressing selected\nsentences into a one-line summary; (v) surfacing evidence for a summary\nsentence; (vi) predicting the factual accuracy of a summary sentence; (vii)\nidentifying unsubstantiated spans in a summary sentence; (viii) correcting\nfactual errors in summaries. We compare various methods on this benchmark and\ndiscover that on multiple tasks, moderately-sized fine-tuned models\nconsistently outperform much larger few-shot prompted language models. For\nfactuality-related tasks, we also evaluate existing heuristics to create\ntraining data and find that training on them results in worse performance than\ntraining on $20\\times$ less human-labeled data. Our articles draw from $6$\ndomains, facilitating cross-domain analysis. On some tasks, the amount of\ntraining data matters more than the domain where it comes from, while for other\ntasks training specifically on data from the target domain, even if limited, is\nmore beneficial.\n","authors":["Kundan Krishna","Prakhar Gupta","Sanjana Ramprasad","Byron C. Wallace","Jeffrey P. Bigham","Zachary C. Lipton"],"pdf_url":"https://arxiv.org/pdf/2305.14296v2.pdf","comment":"EMNLP Findings 2023 Camera Ready"},{"id":"http://arxiv.org/abs/2303.04788v3","updated":"2023-12-04T15:40:19Z","published":"2023-03-08T18:39:43Z","title":"Enabling Non-Linear Quantum Operations through Variational Quantum\n Splines","summary":" The postulates of quantum mechanics impose only unitary transformations on\nquantum states, which is a severe limitation for quantum machine learning\nalgorithms. Quantum Splines (QSplines) have recently been proposed to\napproximate quantum activation functions to introduce non-linearity in quantum\nalgorithms. However, QSplines make use of the HHL as a subroutine and require a\nfault-tolerant quantum computer to be correctly implemented. This work proposes\nthe Generalised Hybrid Quantum Splines (GHQSplines), a novel method for\napproximating non-linear quantum activation functions using hybrid\nquantum-classical computation. The GHQSplines overcome the highly demanding\nrequirements of the original QSplines in terms of quantum hardware and can be\nimplemented using near-term quantum computers. Furthermore, the proposed method\nrelies on a flexible problem representation for non-linear approximation and it\nis suitable to be embedded in existing quantum neural network architectures. In\naddition, we provide a practical implementation of the GHQSplines using\nPennylane and show that our model outperforms the original QSplines in terms of\nquality of fitting.\n","authors":["Matteo Antonio Inajetovic","Filippo Orazi","Antonio Macaluso","Stefano Lodi","Claudio Sartori"],"pdf_url":"https://arxiv.org/pdf/2303.04788v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17065v2","updated":"2023-12-04T15:37:57Z","published":"2023-11-22T17:14:18Z","title":"Efficient Deep Speech Understanding at the Edge","summary":" In contemporary speech understanding (SU), a sophisticated pipeline is\nemployed, encompassing the ingestion of streaming voice input. 
The pipeline\nexecutes beam search iteratively, invoking a deep neural network to generate\ntentative outputs (referred to as hypotheses) in an autoregressive manner.\nPeriodically, the pipeline assesses attention and Connectionist Temporal\nClassification (CTC) scores.\n This paper aims to enhance SU performance on edge devices with limited\nresources. Adopting a hybrid strategy, our approach focuses on accelerating\non-device execution and offloading inputs surpassing the device's capacity.\nWhile this approach is established, we tackle SU's distinctive challenges\nthrough innovative techniques: (1) Late Contextualization: This involves the\nparallel execution of a model's attentive encoder during input ingestion. (2)\nPilot Inference: Addressing temporal load imbalances in the SU pipeline, this\ntechnique aims to mitigate them effectively. (3) Autoregression Offramps:\nDecisions regarding offloading are made solely based on hypotheses, presenting\na novel approach.\n These techniques are designed to seamlessly integrate with existing speech\nmodels, pipelines, and frameworks, offering flexibility for independent or\ncombined application. Collectively, they form a hybrid solution for edge SU.\nOur prototype, named XYZ, has undergone testing on Arm platforms featuring 6 to\n8 cores, demonstrating state-of-the-art accuracy. Notably, it achieves a 2x\nreduction in end-to-end latency and a corresponding 2x decrease in offloading\nrequirements.\n","authors":["Rongxiang Wang","Felix Xiaozhu Lin"],"pdf_url":"https://arxiv.org/pdf/2311.17065v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.05371v4","updated":"2023-12-04T15:37:47Z","published":"2022-10-10T06:22:46Z","title":"On skip connections and normalisation layers in deep optimisation","summary":" We introduce a general theoretical framework, designed for the study of\ngradient optimisation of deep neural networks, that encompasses ubiquitous\narchitecture choices including batch normalisation, weight normalisation and\nskip connections. Our framework determines the curvature and regularity\nproperties of multilayer loss landscapes in terms of their constituent layers,\nthereby elucidating the roles played by normalisation layers and skip\nconnections in globalising these properties. We then demonstrate the utility of\nthis framework in two respects. First, we give the only proof of which we are\naware that a class of deep neural networks can be trained using gradient\ndescent to global optima even when such optima only exist at infinity, as is\nthe case for the cross-entropy cost. Second, we identify a novel causal\nmechanism by which skip connections accelerate training, which we verify\npredictively with ResNets on MNIST, CIFAR10, CIFAR100 and ImageNet.\n","authors":["Lachlan Ewen MacDonald","Jack Valmadre","Hemanth Saratchandran","Simon Lucey"],"pdf_url":"https://arxiv.org/pdf/2210.05371v4.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2306.13440v2","updated":"2023-12-04T15:27:00Z","published":"2023-06-23T11:09:43Z","title":"Trading-off price for data quality to achieve fair online allocation","summary":" We consider the problem of online allocation subject to a long-term fairness\npenalty. Contrary to existing works, however, we do not assume that the\ndecision-maker observes the protected attributes -- which is often unrealistic\nin practice. Instead they can purchase data that help estimate them from\nsources of different quality; and hence reduce the fairness penalty at some\ncost. 
We model this problem as a multi-armed bandit problem where each arm\ncorresponds to the choice of a data source, coupled with the online allocation\nproblem. We propose an algorithm that jointly solves both problems and show\nthat it has a regret bounded by $\\mathcal{O}(\\sqrt{T})$. A key difficulty is\nthat the rewards received by selecting a source are correlated by the fairness\npenalty, which leads to a need for randomization (despite a stochastic\nsetting). Our algorithm takes into account contextual information available\nbefore the source selection, and can adapt to many different fairness notions.\nWe also show that in some instances, the estimates used can be learned on the\nfly.\n","authors":["Mathieu Molina","Nicolas Gast","Patrick Loiseau","Vianney Perchet"],"pdf_url":"https://arxiv.org/pdf/2306.13440v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01959v1","updated":"2023-12-04T15:16:42Z","published":"2023-12-04T15:16:42Z","title":"Learning-Based Approaches to Predictive Monitoring with Conformal\n Statistical Guarantees","summary":" This tutorial focuses on efficient methods to predictive monitoring (PM), the\nproblem of detecting at runtime future violations of a given requirement from\nthe current state of a system. While performing model checking at runtime would\noffer a precise solution to the PM problem, it is generally computationally\nexpensive. To address this scalability issue, several lightweight approaches\nbased on machine learning have recently been proposed. These approaches work by\nlearning an approximate yet efficient surrogate (deep learning) model of the\nexpensive model checker. A key challenge remains to ensure reliable\npredictions, especially in safety-critical applications. We review our recent\nwork on predictive monitoring, one of the first to propose learning-based\napproximations for CPS verification of temporal logic specifications and the\nfirst in this context to apply conformal prediction (CP) for rigorous\nuncertainty quantification. These CP-based uncertainty estimators offer\nstatistical guarantees regarding the generalization error of the learning\nmodel, and they can be used to determine unreliable predictions that should be\nrejected. In this tutorial, we present a general and comprehensive framework\nsummarizing our approach to the predictive monitoring of CPSs, examining in\ndetail several variants determined by three main dimensions: system dynamics\n(deterministic, non-deterministic, stochastic), state observability, and\nsemantics of requirements' satisfaction (Boolean or quantitative).\n","authors":["Francesca Cairoli","Luca Bortolussi","Nicola Paoletti"],"pdf_url":"https://arxiv.org/pdf/2312.01959v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01957v1","updated":"2023-12-04T15:16:12Z","published":"2023-12-04T15:16:12Z","title":"Distilled Self-Critique of LLMs with Synthetic Data: a Bayesian\n Perspective","summary":" This paper proposes an interpretation of RLAIF as Bayesian inference by\nintroducing distilled Self-Critique (dSC), which refines the outputs of a LLM\nthrough a Gibbs sampler that is later distilled into a fine-tuned model. Only\nrequiring synthetic data, dSC is exercised in experiments regarding safety,\nsentiment, and privacy control, showing it can be a viable and cheap\nalternative to align LLMs. 
Code released at\n\\url{https://github.com/vicgalle/distilled-self-critique}.\n","authors":["Victor Gallego"],"pdf_url":"https://arxiv.org/pdf/2312.01957v1.pdf","comment":"Submitted to ICLR 2024 (TinyPapers track)"},{"id":"http://arxiv.org/abs/2306.12509v2","updated":"2023-12-04T15:07:13Z","published":"2023-06-21T18:45:56Z","title":"Joint Prompt Optimization of Stacked LLMs using Variational Inference","summary":" Large language models (LLMs) can be seen as atomic units of computation\nmapping sequences to a distribution over sequences. Thus, they can be seen as\nstochastic language layers in a language network, where the learnable\nparameters are the natural language prompts at each layer. By stacking two such\nlayers and feeding the output of one layer to the next, we obtain a Deep\nLanguage Network (DLN). We first show how to effectively perform prompt\noptimization for a 1-Layer language network (DLN-1). Then, we present an\nextension that applies to 2-layer DLNs (DLN-2), where two prompts must be\nlearned. The key idea is to consider the output of the first layer as a latent\nvariable, which requires inference, and prompts to be learned as the parameters\nof the generative distribution. We first test the effectiveness of DLN-1 in\nmultiple reasoning and natural language understanding tasks. Then, we show that\nDLN-2 can reach higher performance than a single layer, showing promise that we\nmight reach comparable performance to GPT-4, even when each LLM in the network\nis smaller and less powerful.\n","authors":["Alessandro Sordoni","Xingdi Yuan","Marc-Alexandre Côté","Matheus Pereira","Adam Trischler","Ziang Xiao","Arian Hosseini","Friederike Niedtner","Nicolas Le Roux"],"pdf_url":"https://arxiv.org/pdf/2306.12509v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.01947v1","updated":"2023-12-04T15:04:42Z","published":"2023-12-04T15:04:42Z","title":"Maximising Quantum-Computing Expressive Power through Randomised\n Circuits","summary":" In the noisy intermediate-scale quantum era, variational quantum algorithms\n(VQAs) have emerged as a promising avenue to obtain quantum advantage. However,\nthe success of VQAs depends on the expressive power of parameterised quantum\ncircuits, which is constrained by the limited gate number and the presence of\nbarren plateaus. In this work, we propose and numerically demonstrate a novel\napproach for VQAs, utilizing randomised quantum circuits to generate the\nvariational wavefunction. We parameterize the distribution function of these\nrandom circuits using artificial neural networks and optimize it to find the\nsolution. This random-circuit approach presents a trade-off between the\nexpressive power of the variational wavefunction and time cost, in terms of the\nsampling cost of quantum circuits. Given a fixed gate number, we can\nsystematically increase the expressive power by extending the quantum-computing\ntime. With a sufficiently large permissible time cost, the variational\nwavefunction can approximate any quantum state with arbitrary accuracy.\nFurthermore, we establish explicit relationships between expressive power, time\ncost, and gate number for variational quantum eigensolvers. 
These results\nhighlight the promising potential of the random-circuit approach in achieving a\nhigh expressive power in quantum computing.\n","authors":["Yingli Yang","Zongkang Zhang","Anbang Wang","Xiaosi Xu","Xiaoting Wang","Ying Li"],"pdf_url":"https://arxiv.org/pdf/2312.01947v1.pdf","comment":"19 pages, 10 figures"},{"id":"http://arxiv.org/abs/2312.01941v1","updated":"2023-12-04T14:58:19Z","published":"2023-12-04T14:58:19Z","title":"Intrusion Detection System with Machine Learning and Multiple Datasets","summary":" As Artificial Intelligence (AI) technologies continue to gain traction in the\nmodern-day world, they ultimately pose an immediate threat to current\ncybersecurity systems via exploitative methods. Prompt engineering is a\nrelatively new field that explores various prompt designs that can hijack large\nlanguage models (LLMs). If used by an unethical attacker, it can enable an AI\nsystem to offer malicious insights and code to them. In this paper, an enhanced\nintrusion detection system (IDS) that utilizes machine learning (ML) and\nhyperparameter tuning is explored, which can improve a model's performance in\nterms of accuracy and efficacy. Ultimately, this improved system can be used to\ncombat the attacks made by unethical hackers. A standard IDS is solely\nconfigured with pre-configured rules and patterns; however, with the\nutilization of machine learning, implicit and different patterns can be\ngenerated through the models' hyperparameter settings and parameters. In\naddition, the IDS will be equipped with multiple datasets so that the accuracy\nof the models improves. We evaluate the performance of multiple ML models and\ntheir respective hyperparameter settings through various metrics to compare\ntheir results to other models and past research work. The results of the\nproposed multi-dataset integration method yielded an accuracy score of 99.9%\nwhen equipped with the XGBoost and random forest classifiers and\nRandomizedSearchCV hyperparameter technique.\n","authors":["Haiyan Xuan","Mohith Manohar"],"pdf_url":"https://arxiv.org/pdf/2312.01941v1.pdf","comment":"12 pages, 2 figures, 2 tables"},{"id":"http://arxiv.org/abs/2312.01939v1","updated":"2023-12-04T14:55:58Z","published":"2023-12-04T14:55:58Z","title":"Foundations for Transfer in Reinforcement Learning: A Taxonomy of\n Knowledge Modalities","summary":" Contemporary artificial intelligence systems exhibit rapidly growing\nabilities accompanied by the growth of required resources, expansive datasets\nand corresponding investments into computing infrastructure. Although earlier\nsuccesses predominantly focus on constrained settings, recent strides in\nfundamental research and applications aspire to create increasingly general\nsystems. This evolving landscape presents a dual panorama of opportunities and\nchallenges in refining the generalisation and transfer of knowledge - the\nextraction from existing sources and adaptation as a comprehensive foundation\nfor tackling new problems. Within the domain of reinforcement learning (RL),\nthe representation of knowledge manifests through various modalities, including\ndynamics and reward models, value functions, policies, and the original data.\nThis taxonomy systematically targets these modalities and frames its discussion\nbased on their inherent properties and alignment with different objectives and\nmechanisms for transfer. 
Where possible, we aim to provide coarse guidance\ndelineating approaches which address requirements such as limiting environment\ninteractions, maximising computational efficiency, and enhancing generalisation\nacross varying axes of change. Finally, we analyse reasons contributing to the\nprevalence or scarcity of specific forms of transfer, the inherent potential\nbehind pushing these frontiers, and underscore the significance of\ntransitioning from designed to learned transfer.\n","authors":["Markus Wulfmeier","Arunkumar Byravan","Sarah Bechtle","Karol Hausman","Nicolas Heess"],"pdf_url":"https://arxiv.org/pdf/2312.01939v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.16563v2","updated":"2023-12-04T14:53:15Z","published":"2023-03-29T09:45:50Z","title":"Skill Reinforcement Learning and Planning for Open-World Long-Horizon\n Tasks","summary":" We study building multi-task agents in open-world environments. Without human\ndemonstrations, learning to accomplish long-horizon tasks in a large open-world\nenvironment with reinforcement learning (RL) is extremely inefficient. To\ntackle this challenge, we convert the multi-task learning problem into learning\nbasic skills and planning over the skills. Using the popular open-world game\nMinecraft as the testbed, we propose three types of fine-grained basic skills,\nand use RL with intrinsic rewards to acquire skills. A novel Finding-skill that\nperforms exploration to find diverse items provides better initialization for\nother skills, improving the sample efficiency for skill learning. In skill\nplanning, we leverage the prior knowledge in Large Language Models to find the\nrelationships between skills and build a skill graph. When the agent is solving\na task, our skill search algorithm walks on the skill graph and generates the\nproper skill plans for the agent. In experiments, our method accomplishes 40\ndiverse Minecraft tasks, where many tasks require sequentially executing more\nthan 10 skills. Our method outperforms baselines by a large margin and is\nthe most sample-efficient demonstration-free RL method to solve Minecraft Tech\nTree tasks. The project's website and code can be found at\nhttps://sites.google.com/view/plan4mc.\n","authors":["Haoqi Yuan","Chi Zhang","Hongcheng Wang","Feiyang Xie","Penglin Cai","Hao Dong","Zongqing Lu"],"pdf_url":"https://arxiv.org/pdf/2303.16563v2.pdf","comment":"24 pages, presented in Foundation Models for Decision Making Workshop\n at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2308.14831v2","updated":"2023-12-04T14:52:08Z","published":"2023-08-28T18:31:09Z","title":"Continual Learning with Dynamic Sparse Training: Exploring Algorithms\n for Effective Model Updates","summary":" Continual learning (CL) refers to the ability of an intelligent system to\nsequentially acquire and retain knowledge from a stream of data with as little\ncomputational overhead as possible. To this end, regularization, replay,\narchitecture, and parameter isolation approaches were introduced to the\nliterature. Parameter isolation uses a sparse network, which enables allocating\ndistinct parts of the neural network to different tasks and also allows\nparameters to be shared between tasks if they are similar. Dynamic Sparse\nTraining (DST) is a prominent way to find these sparse networks and isolate\nthem for each task. 
This paper is the first empirical study investigating the\neffect of different DST components under the CL paradigm to fill a critical\nresearch gap and shed light on the optimal configuration of DST for CL if it\nexists. Therefore, we perform a comprehensive study in which we investigate\nvarious DST components to find the best topology per task on well-known\nCIFAR100 and miniImageNet benchmarks in a task-incremental CL setup since our\nprimary focus is to evaluate the performance of various DST criteria, rather\nthan the process of mask selection. We found that, at a low sparsity level,\nErdos-R\\'enyi Kernel (ERK) initialization utilizes the backbone more\nefficiently and allows increments of tasks to be learned effectively. At a high\nsparsity level, unless it is extreme, uniform initialization demonstrates a\nmore reliable and robust performance. In terms of growth strategy, performance\nis dependent on the defined initialization strategy and the extent of sparsity.\nFinally, adaptivity within DST components is a promising way for better\ncontinual learners.\n","authors":["Murat Onur Yildirim","Elif Ceren Gok Yildirim","Ghada Sokar","Decebal Constantin Mocanu","Joaquin Vanschoren"],"pdf_url":"https://arxiv.org/pdf/2308.14831v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00102v2","updated":"2023-12-04T14:27:37Z","published":"2023-11-30T16:01:51Z","title":"FedEmb: A Vertical and Hybrid Federated Learning Algorithm using Network\n And Feature Embedding Aggregation","summary":" Federated learning (FL) is an emerging paradigm for decentralized training of\nmachine learning models on distributed clients, without revealing the data to\nthe central server. The learning scheme may be horizontal, vertical or hybrid\n(both vertical and horizontal). Most existing research work with deep neural\nnetwork (DNN) modelling is focused on horizontal data distributions, while\nvertical and hybrid schemes are much less studied. In this paper, we propose a\ngeneralized algorithm, FedEmb, for modelling vertical and hybrid DNN-based\nlearning. Our algorithm is characterised by higher inference\naccuracy, stronger privacy-preserving properties, and lower client-server\ncommunication bandwidth demands as compared with existing work. The\nexperimental results show that FedEmb is an effective method to tackle both\nsplit feature & subject space decentralized problems, shows 0.3% to 4.2%\ninference accuracy improvement with limited privacy disclosure for datasets\nstored in local clients, and reduces time complexity by 88.9% over the vertical\nbaseline method.\n","authors":["Fanfei Meng","Lele Zhang","Yu Chen","Yuxin Wang"],"pdf_url":"https://arxiv.org/pdf/2312.00102v2.pdf","comment":"Accepted by Proceedings on Engineering Sciences"},{"id":"http://arxiv.org/abs/2305.14912v4","updated":"2023-12-04T14:20:17Z","published":"2023-05-24T09:02:01Z","title":"SVDinsTN: A Tensor Network Paradigm for Efficient Structure Search from\n Regularized Modeling Perspective","summary":" Tensor network (TN) representation is a powerful technique for computer\nvision and machine learning. TN structure search (TN-SS) aims to search for a\ncustomized structure to achieve a compact representation, which is a\nchallenging NP-hard problem. Recent \"sampling-evaluation-based\" methods require\nsampling an extensive collection of structures and evaluating them one by one,\nresulting in prohibitively high computational costs. 
To address this issue, we\npropose a novel TN paradigm, named SVD-inspired TN decomposition (SVDinsTN),\nwhich allows us to efficiently solve the TN-SS problem from a regularized\nmodeling perspective, eliminating the repeated structure evaluations. To be\nspecific, by inserting a diagonal factor for each edge of the fully-connected\nTN, SVDinsTN allows us to calculate TN cores and diagonal factors\nsimultaneously, with the factor sparsity revealing a compact TN structure. In\ntheory, we prove a convergence guarantee for the proposed method. Experimental\nresults demonstrate that the proposed method achieves approximately 100 to 1000\ntimes acceleration compared to the state-of-the-art TN-SS methods while\nmaintaining a comparable representation ability.\n","authors":["Yu-Bang Zheng","Xi-Le Zhao","Junhua Zeng","Chao Li","Qibin Zhao","Heng-Chao Li","Ting-Zhu Huang"],"pdf_url":"https://arxiv.org/pdf/2305.14912v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01904v1","updated":"2023-12-04T14:02:56Z","published":"2023-12-04T14:02:56Z","title":"Unsupervised Anomaly Detection using Aggregated Normative Diffusion","summary":" Early detection of anomalies in medical images such as brain MRI is highly\nrelevant for diagnosis and treatment of many conditions. Supervised machine\nlearning methods are limited to a small number of pathologies where there is\ngood availability of labeled data. In contrast, unsupervised anomaly detection\n(UAD) has the potential to identify a broader spectrum of anomalies by spotting\ndeviations from normal patterns. Our research demonstrates that existing\nstate-of-the-art UAD approaches do not generalise well to diverse types of\nanomalies in realistic multi-modal MR data. To overcome this, we introduce a\nnew UAD method named Aggregated Normative Diffusion (ANDi). ANDi operates by\naggregating differences between predicted denoising steps and ground truth\nbackwards transitions in Denoising Diffusion Probabilistic Models (DDPMs) that\nhave been trained on pyramidal Gaussian noise. We validate ANDi against three\nrecent UAD baselines, and across three diverse brain MRI datasets. We show that\nANDi, in some cases, substantially surpasses these baselines and shows\nincreased robustness to varying types of anomalies. Particularly in detecting\nmultiple sclerosis (MS) lesions, ANDi achieves improvements of up to 178% in\nterms of AUPRC.\n","authors":["Alexander Frotscher","Jaivardhan Kapoor","Thomas Wolfers","Christian F. Baumgartner"],"pdf_url":"https://arxiv.org/pdf/2312.01904v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01898v1","updated":"2023-12-04T13:54:05Z","published":"2023-12-04T13:54:05Z","title":"Unlocking optimal batch size schedules using continuous-time control and\n perturbation theory","summary":" Stochastic Gradient Descent (SGD) and its variants are almost universally\nused to train neural networks and to fit a variety of other parametric models.\nAn important hyperparameter in this context is the batch size, which determines\nhow many samples are processed before an update of the parameters occurs.\nPrevious studies have demonstrated the benefits of using variable batch sizes.\nIn this work, we will theoretically derive optimal batch size schedules for SGD\nand similar algorithms, up to an error that is quadratic in the learning rate.\nTo achieve this, we approximate the discrete process of parameter updates using\na family of stochastic differential equations indexed by the learning rate. 
To\nbetter handle the state-dependent diffusion coefficient, we further expand the\nsolution of this family into a series with respect to the learning rate. Using\nthis setup, we derive a continuous-time optimal batch size schedule for a large\nfamily of diffusion coefficients and then apply the results in the setting of\nlinear regression.\n","authors":["Stefan Perko"],"pdf_url":"https://arxiv.org/pdf/2312.01898v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16860v2","updated":"2023-12-04T13:46:17Z","published":"2023-11-28T15:07:25Z","title":"Data-efficient operator learning for solving high Mach number fluid flow\n problems","summary":" We consider the problem of using SciML to predict solutions of high Mach\nfluid flows over irregular geometries. In this setting, data is limited, and so\nit is desirable for models to perform well in the low-data setting. We show\nthat Neural Basis Functions (NBF), which learns a basis of behavior modes from\nthe data and then uses this basis to make predictions, is more effective than a\nbasis-unaware baseline model. In addition, we identify continuing challenges in\nthe space of predicting solutions for this type of problem.\n","authors":["Noah Ford","Victor J. Leon","Honest Mrema","Jeffrey Gilbert","Alexander New"],"pdf_url":"https://arxiv.org/pdf/2311.16860v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01887v1","updated":"2023-12-04T13:40:22Z","published":"2023-12-04T13:40:22Z","title":"Non-Intrusive Load Monitoring for Feeder-Level EV Charging Detection:\n Sliding Window-based Approaches to Offline and Online Detection","summary":" Understanding electric vehicle (EV) charging on the distribution network is\nkey to effective EV charging management and aiding decarbonization across the\nenergy and transport sectors. Advanced metering infrastructure has allowed\ndistribution system operators and utility companies to collect high-resolution\nload data from their networks. These advancements enable the non-intrusive load\nmonitoring (NILM) technique to detect EV charging using load measurement data.\nWhile existing studies primarily focused on NILM for EV charging detection in\nindividual households, there is a research gap on EV charging detection at the\nfeeder level, presenting unique challenges due to the combined load measurement\nfrom multiple households. In this paper, we develop a novel and effective\napproach for EV detection at the feeder level, involving sliding-window feature\nextraction and classical machine learning techniques, specifically models like\nXGBoost and Random Forest. Our developed method offers a lightweight and\nefficient solution, capable of quick training. Moreover, our developed method\nis versatile, supporting both offline and online EV charging detection. Our\nexperimental results demonstrate high-accuracy EV charging detection at the\nfeeder level, achieving an F-Score of 98.88% in offline detection and 93.01% in\nonline detection.\n","authors":["Cameron Martin","Fucai Ke","Hao Wang"],"pdf_url":"https://arxiv.org/pdf/2312.01887v1.pdf","comment":"The 7th IEEE Conference on Energy Internet and Energy System\n Integration (EI2 2023)"},{"id":"http://arxiv.org/abs/2312.01884v1","updated":"2023-12-04T13:33:51Z","published":"2023-12-04T13:33:51Z","title":"Correlation and Unintended Biases on Univariate and Multivariate\n Decision Trees","summary":" Decision Trees are accessible, interpretable, and well-performing\nclassification models. 
A plethora of variants with increasing expressiveness\nhas been proposed in the last forty years. We contrast the two families of\nunivariate DTs, whose split functions partition data through axis-parallel\nhyperplanes, and multivariate DTs, whose splits instead partition data through\noblique hyperplanes. The latter include the former, hence multivariate DTs are\nin principle more powerful. Surprisingly enough, however, univariate DTs\nconsistently show comparable performances in the literature. We analyze the\nreasons behind this, both with synthetic and real-world benchmark datasets. Our\nresearch questions test whether the pre-processing phase of removing\ncorrelation among features in datasets has an impact on the relative\nperformances of univariate vs multivariate DTs. We find that existing benchmark\ndatasets are likely biased towards favoring univariate DTs.\n","authors":["Mattia Setzu","Salvatore Ruggieri"],"pdf_url":"https://arxiv.org/pdf/2312.01884v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2107.03863v4","updated":"2023-12-04T13:24:31Z","published":"2021-07-08T14:19:28Z","title":"Benchpress: A Scalable and Versatile Workflow for Benchmarking Structure\n Learning Algorithms","summary":" Describing the relationship between the variables in a study domain and\nmodelling the data generating mechanism is a fundamental problem in many\nempirical sciences. Probabilistic graphical models are one common approach to\ntackle the problem. Learning the graphical structure for such models is\ncomputationally challenging and a fervent area of current research with a\nplethora of algorithms being developed. To facilitate the benchmarking of\ndifferent methods, we present a novel Snakemake workflow, called Benchpress for\nproducing scalable, reproducible, and platform-independent benchmarks of\nstructure learning algorithms for probabilistic graphical models. Benchpress is\ninterfaced via a simple JSON-file, which makes it accessible for all users,\nwhile the code is designed in a fully modular fashion to enable researchers to\ncontribute additional methodologies. Benchpress currently provides an interface\nto a large number of state-of-the-art algorithms from libraries such as\nBDgraph, BiDAG, bnlearn, causal-learn, gCastle, GOBNILP, pcalg, r.blip,\nscikit-learn, TETRAD, and trilearn as well as a variety of methods for data\ngenerating models and performance evaluation. Alongside user-defined models and\nrandomly generated datasets, the workflow also includes a number of standard\ndatasets and graphical models from the literature, which may be included in a\nbenchmarking study. We demonstrate the applicability of this workflow for\nlearning Bayesian networks in five typical data scenarios. The source code and\ndocumentation is publicly available from http://benchpressdocs.readthedocs.io.\n","authors":["Felix L. Rios","Giusi Moffa","Jack Kuipers"],"pdf_url":"https://arxiv.org/pdf/2107.03863v4.pdf","comment":"41 pages, 8 figure"},{"id":"http://arxiv.org/abs/2310.14714v3","updated":"2023-12-04T13:23:46Z","published":"2023-10-23T08:51:05Z","title":"BatteryML:An Open-source platform for Machine Learning on Battery\n Degradation","summary":" Battery degradation remains a pivotal concern in the energy storage domain,\nwith machine learning emerging as a potent tool to drive forward insights and\nsolutions. However, this intersection of electrochemical science and machine\nlearning poses complex challenges. 
Machine learning experts often grapple with\nthe intricacies of battery science, while battery researchers face hurdles in\nadapting intricate models tailored to specific datasets. Beyond this, a\ncohesive standard for battery degradation modeling, inclusive of data formats\nand evaluative benchmarks, is conspicuously absent. Recognizing these\nimpediments, we present BatteryML - a one-step, all-encompassing, and open-source\nplatform designed to unify data preprocessing, feature extraction, and the\nimplementation of both traditional and state-of-the-art models. This\nstreamlined approach promises to enhance the practicality and efficiency of\nresearch applications. BatteryML seeks to fill this void, fostering an\nenvironment where experts from diverse specializations can collaboratively\ncontribute, thus elevating the collective understanding and advancement of\nbattery research. The code for our project is publicly available on GitHub at\nhttps://github.com/microsoft/BatteryML.\n","authors":["Han Zhang","Xiaofan Gui","Shun Zheng","Ziheng Lu","Yuqi Li","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2310.14714v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01878v1","updated":"2023-12-04T13:20:15Z","published":"2023-12-04T13:20:15Z","title":"HGPROMPT: Bridging Homogeneous and Heterogeneous Graphs for Few-shot\n Prompt Learning","summary":" Graph neural networks (GNNs) and heterogeneous graph neural networks (HGNNs)\nare prominent techniques for homogeneous and heterogeneous graph representation\nlearning, yet their performance in an end-to-end supervised framework greatly\ndepends on the availability of task-specific supervision. To reduce the\nlabeling cost, pre-training on self-supervised pretext tasks has become a\npopular paradigm, but there is often a gap between the pre-trained model and\ndownstream tasks, stemming from the divergence in their objectives. To bridge\nthe gap, prompt learning has risen as a promising direction especially in\nfew-shot settings, without the need to fully fine-tune the pre-trained model.\nWhile there have been some early explorations of prompt-based learning on graphs,\nthey primarily deal with homogeneous graphs, ignoring the heterogeneous graphs\nthat are prevalent in downstream applications. In this paper, we propose\nHGPROMPT, a novel pre-training and prompting framework to unify not only\npre-training and downstream tasks but also homogeneous and heterogeneous graphs\nvia a dual-template design. Moreover, we propose dual-prompt in HGPROMPT to\nassist a downstream task in locating the most relevant prior to bridge the gaps\ncaused by not only feature variations but also heterogeneity differences across\ntasks. Finally, we thoroughly evaluate and analyze HGPROMPT through extensive\nexperiments on three public datasets.\n","authors":["Xingtong Yu","Zemin Liu","Yuan Fang","Xinming Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.01878v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2312.01853v1","updated":"2023-12-04T12:35:43Z","published":"2023-12-04T12:35:43Z","title":"Robot Synesthesia: In-Hand Manipulation with Visuotactile Sensing","summary":" Executing contact-rich manipulation tasks necessitates the fusion of tactile\nand visual feedback. However, the distinct nature of these modalities poses\nsignificant challenges. 
In this paper, we introduce a system that leverages\nvisual and tactile sensory inputs to enable dexterous in-hand manipulation.\nSpecifically, we propose Robot Synesthesia, a novel point cloud-based tactile\nrepresentation inspired by human tactile-visual synesthesia. This approach\nallows for the simultaneous and seamless integration of both sensory inputs,\noffering richer spatial information and facilitating better reasoning about\nrobot actions. The method, trained in a simulated environment and then deployed\nto a real robot, is applicable to various in-hand object rotation tasks.\nComprehensive ablations are performed on how the integration of vision and\ntouch can improve reinforcement learning and Sim2Real performance. Our project\npage is available at https://yingyuan0414.github.io/visuotactile/ .\n","authors":["Ying Yuan","Haichuan Che","Yuzhe Qin","Binghao Huang","Zhao-Heng Yin","Kang-Won Lee","Yi Wu","Soo-Chul Lim","Xiaolong Wang"],"pdf_url":"https://arxiv.org/pdf/2312.01853v1.pdf","comment":"Project page: https://yingyuan0414.github.io/visuotactile/"},{"id":"http://arxiv.org/abs/2312.01850v1","updated":"2023-12-04T12:31:45Z","published":"2023-12-04T12:31:45Z","title":"Generalization by Adaptation: Diffusion-Based Domain Extension for\n Domain-Generalized Semantic Segmentation","summary":" When models, e.g., for semantic segmentation, are applied to images that are\nvastly different from training data, the performance will drop significantly.\nDomain adaptation methods try to overcome this issue, but need samples from the\ntarget domain. However, this might not always be feasible for various reasons\nand therefore domain generalization methods are useful as they do not require\nany target data. We present a new diffusion-based domain extension (DIDEX)\nmethod and employ a diffusion model to generate a pseudo-target domain with\ndiverse text prompts. In contrast to existing methods, this makes it possible to control\nthe style and content of the generated images and to introduce high\ndiversity. In a second step, we train a generalizing model by adapting towards\nthis pseudo-target domain. We outperform previous approaches by a large margin\nacross various datasets and architectures without using any real data. For the\ngeneralization from GTA5, we improve state-of-the-art mIoU performance by 3.8%\nabsolute on average and for SYNTHIA by 11.8% absolute, marking a big step for\nthe generalization performance on these benchmarks. Code is available at\nhttps://github.com/JNiemeijer/DIDEX\n","authors":["Joshua Niemeijer","Manuel Schwonberg","Jan-Aike Termöhlen","Nico M. Schmidt","Tim Fingscheidt"],"pdf_url":"https://arxiv.org/pdf/2312.01850v1.pdf","comment":"Accepted to WACV 2024"},{"id":"http://arxiv.org/abs/2010.02180v3","updated":"2023-12-04T12:23:52Z","published":"2020-10-05T17:27:31Z","title":"Pareto Probing: Trading Off Accuracy for Complexity","summary":" The question of how to probe contextual word representations for linguistic\nstructure in a way that is both principled and useful has seen significant\nattention recently in the NLP literature. In our contribution to this\ndiscussion, we argue for a probe metric that reflects the fundamental trade-off\nbetween probe complexity and performance: the Pareto hypervolume. To measure\ncomplexity, we present a number of parametric and non-parametric metrics. 
Our\nexperiments using Pareto hypervolume as an evaluation metric show that probes\noften do not conform to our expectations -- e.g., why should the non-contextual\nfastText representations encode more morpho-syntactic information than the\ncontextual BERT representations? These results suggest that common, simplistic\nprobing tasks, such as part-of-speech labeling and dependency arc labeling, are\ninadequate to evaluate the linguistic structure encoded in contextual word\nrepresentations. This leads us to propose full dependency parsing as a probing\ntask. In support of our suggestion that harder probing tasks are necessary, our\nexperiments with dependency parsing reveal a wide gap in syntactic knowledge\nbetween contextual and non-contextual representations.\n","authors":["Tiago Pimentel","Naomi Saphra","Adina Williams","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2010.02180v3.pdf","comment":"Tiago Pimentel and Naomi Saphra contributed equally to this work.\n Camera ready version of EMNLP 2020 publication. In this new version, we fixed\n some notation issues in the appendix, and added a new appendix section\n describing our MLP. Code available in\n https://github.com/rycolab/pareto-probing"},{"id":"http://arxiv.org/abs/2312.01818v1","updated":"2023-12-04T11:46:34Z","published":"2023-12-04T11:46:34Z","title":"Learning Machine Morality through Experience and Interaction","summary":" Increasing interest in ensuring safety of next-generation Artificial\nIntelligence (AI) systems calls for novel approaches to embedding morality into\nautonomous agents. Traditionally, this has been done by imposing explicit\ntop-down rules or hard constraints on systems, for example by filtering system\noutputs through pre-defined ethical rules. Recently, instead, entirely\nbottom-up methods for learning implicit preferences from human behavior have\nbecome increasingly popular, such as those for training and fine-tuning Large\nLanguage Models. In this paper, we provide a systematization of existing\napproaches to the problem of introducing morality in machines - modeled as a\ncontinuum, and argue that the majority of popular techniques lie at the\nextremes - either being fully hard-coded, or entirely learned, where no\nexplicit statement of any moral principle is required. Given the relative\nstrengths and weaknesses of each type of methodology, we argue that more hybrid\nsolutions are needed to create adaptable and robust, yet more controllable and\ninterpretable agents.\n In particular, we present three case studies of recent works which use\nlearning from experience (i.e., Reinforcement Learning) to explicitly provide\nmoral principles to learning agents - either as intrinsic rewards, moral\nlogical constraints or textual principles for language models. For example,\nusing intrinsic rewards in Social Dilemma games, we demonstrate how it is\npossible to represent classical moral frameworks for agents. We also present an\noverview of the existing work in this area in order to provide empirical\nevidence for the potential of this hybrid approach. We then discuss strategies\nfor evaluating the effectiveness of moral learning agents. 
Finally, we present\nopen research questions and implications for the future of AI safety and ethics\nwhich are emerging from this framework.\n","authors":["Elizaveta Tennant","Stephen Hailes","Mirco Musolesi"],"pdf_url":"https://arxiv.org/pdf/2312.01818v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01816v1","updated":"2023-12-04T11:45:44Z","published":"2023-12-04T11:45:44Z","title":"Class Symbolic Regression: Gotta Fit 'Em All","summary":" We introduce \"Class Symbolic Regression\" a first framework for automatically\nfinding a single analytical functional form that accurately fits multiple\ndatasets - each governed by its own (possibly) unique set of fitting\nparameters. This hierarchical framework leverages the common constraint that\nall the members of a single class of physical phenomena follow a common\ngoverning law. Our approach extends the capabilities of our earlier Physical\nSymbolic Optimization ($\\Phi$-SO) framework for Symbolic Regression, which\nintegrates dimensional analysis constraints and deep reinforcement learning for\nsymbolic analytical function discovery from data. We demonstrate the efficacy\nof this novel approach by applying it to a panel of synthetic toy case datasets\nand showcase its practical utility for astrophysics by successfully extracting\nan analytic galaxy potential from a set of simulated orbits approximating\nstellar streams.\n","authors":["Wassim Tenachi","Rodrigo Ibata","Thibaut L. François","Foivos I. Diakogiannis"],"pdf_url":"https://arxiv.org/pdf/2312.01816v1.pdf","comment":"7 pages, 1 figure, 2 tables. Submitted to ApJL"},{"id":"http://arxiv.org/abs/2312.01811v1","updated":"2023-12-04T11:30:26Z","published":"2023-12-04T11:30:26Z","title":"Energy-based Potential Games for Joint Motion Forecasting and Control","summary":" This work uses game theory as a mathematical framework to address interaction\nmodeling in multi-agent motion forecasting and control. Despite its\ninterpretability, applying game theory to real-world robotics, like automated\ndriving, faces challenges such as unknown game parameters. To tackle these, we\nestablish a connection between differential games, optimal control, and\nenergy-based models, demonstrating how existing approaches can be unified under\nour proposed Energy-based Potential Game formulation. Building upon this, we\nintroduce a new end-to-end learning application that combines neural networks\nfor game-parameter inference with a differentiable game-theoretic optimization\nlayer, acting as an inductive bias. The analysis provides empirical evidence\nthat the game-theoretic layer adds interpretability and improves the predictive\nperformance of various neural network backbones using two simulations and two\nreal-world driving datasets.\n","authors":["Christopher Diehl","Tobias Klosek","Martin Krüger","Nils Murzyn","Timo Osterburg","Torsten Bertram"],"pdf_url":"https://arxiv.org/pdf/2312.01811v1.pdf","comment":"Conference on Robot Learning, CoRL 2023"},{"id":"http://arxiv.org/abs/2308.16458v4","updated":"2023-12-04T11:05:29Z","published":"2023-08-31T04:52:58Z","title":"BioCoder: A Benchmark for Bioinformatics Code Generation with Contextual\n Pragmatic Knowledge","summary":" Pre-trained large language models have significantly improved code\ngeneration. As these models scale up, there is an increasing need for the\noutput to handle more intricate tasks and to be appropriately specialized to\nparticular domains. 
Here, we target bioinformatics due to the amount of\nspecialized domain knowledge, algorithms, and data operations this discipline\nrequires. We present BioCoder, a benchmark developed to evaluate large language\nmodels (LLMs) in generating bioinformatics-specific code. BioCoder spans a\nbroad spectrum of the field and covers cross-file dependencies, class\ndeclarations, and global variables. It incorporates 1026 Python functions and\n1243 Java methods extracted from GitHub, along with 253 examples from the\nRosalind Project, all pertaining to bioinformatics. Using topic modeling we\nshow that overall coverage of the included code is representative of the full\nspectrum of bioinformatics calculations. BioCoder incorporates a fuzz-testing\nframework for evaluation. We have applied it to evaluate many models including\nInCoder, CodeGen, CodeGen2, SantaCoder, StarCoder, StarCoder+, InstructCodeT5+,\nGPT-3.5, and GPT-4. Furthermore, we finetuned StarCoder, demonstrating how our\ndataset can effectively enhance the performance of LLMs on our benchmark (by\n>15% in terms of Pass@K in certain prompt configurations and always >3%). The\nresults highlight two key aspects of successful models: (1) Successful models\naccommodate a long prompt (> ~2600 tokens) with full context, for functional\ndependencies. (2) They contain specific domain knowledge of bioinformatics,\nbeyond just general coding knowledge. This is evident from the performance gain\nof GPT-3.5/4 compared to the smaller models on the benchmark (50% vs up to\n~25%). Our dataset, benchmark, Docker images, and scripts required for testing\nare all available at https://github.com/gersteinlab/biocoder.\n","authors":["Xiangru Tang","Bill Qian","Rick Gao","Jiakang Chen","Xinyun Chen","Mark Gerstein"],"pdf_url":"https://arxiv.org/pdf/2308.16458v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02842v3","updated":"2023-12-04T10:37:00Z","published":"2023-07-06T08:14:54Z","title":"Provably Efficient Iterated CVaR Reinforcement Learning with Function\n Approximation and Human Feedback","summary":" Risk-sensitive reinforcement learning (RL) aims to optimize policies that\nbalance the expected reward and risk. In this paper, we present a novel\nrisk-sensitive RL framework that employs an Iterated Conditional Value-at-Risk\n(CVaR) objective under both linear and general function approximations,\nenriched by human feedback. These new formulations provide a principled way to\nguarantee safety in each decision making step throughout the control process.\nMoreover, integrating human feedback into risk-sensitive RL framework bridges\nthe gap between algorithmic decision-making and human participation, allowing\nus to also guarantee safety for human-in-the-loop systems. We propose provably\nsample-efficient algorithms for this Iterated CVaR RL and provide rigorous\ntheoretical analysis. Furthermore, we establish a matching lower bound to\ncorroborate the optimality of our algorithms in a linear context.\n","authors":["Yu Chen","Yihan Du","Pihe Hu","Siwei Wang","Desheng Wu","Longbo Huang"],"pdf_url":"https://arxiv.org/pdf/2307.02842v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01795v1","updated":"2023-12-04T10:35:46Z","published":"2023-12-04T10:35:46Z","title":"Distributed Continual Learning with CoCoA in High-dimensional Linear\n Regression","summary":" We consider estimation under scenarios where the signals of interest exhibit\nchange of characteristics over time. 
In particular, we consider the continual\nlearning problem where different tasks, e.g., data with different\ndistributions, arrive sequentially and the aim is to perform well on the newly\narrived task without performance degradation on the previously seen tasks. In\ncontrast to the continual learning literature focusing on the centralized\nsetting, we investigate the problem from a distributed estimation perspective.\nWe consider the well-established distributed learning algorithm COCOA, which\ndistributes the model parameters and the corresponding features over the\nnetwork. We provide exact analytical characterization for the generalization\nerror of COCOA under continual learning for linear regression in a range of\nscenarios, where overparameterization is of particular interest. These\nanalytical results characterize how the generalization error depends on the\nnetwork structure, the task similarity and the number of tasks, and show how\nthese dependencies are intertwined. In particular, our results show that the\ngeneralization error can be significantly reduced by adjusting the network\nsize, where the most favorable network size depends on task similarity and the\nnumber of tasks. We present numerical results verifying the theoretical\nanalysis and illustrate the continual learning performance of COCOA with a\ndigit classification task.\n","authors":["Martin Hellkvist","Ayça Özçelikkale","Anders Ahlén"],"pdf_url":"https://arxiv.org/pdf/2312.01795v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01792v1","updated":"2023-12-04T10:27:38Z","published":"2023-12-04T10:27:38Z","title":"Wild-Tab: A Benchmark For Out-Of-Distribution Generalization In Tabular\n Regression","summary":" Out-of-Distribution (OOD) generalization, a cornerstone for building robust\nmachine learning models capable of handling data diverging from the training\nset's distribution, is an ongoing challenge in deep learning. While significant\nprogress has been observed in computer vision and natural language processing,\nits exploration in tabular data, ubiquitous in many industrial applications,\nremains nascent. To bridge this gap, we present Wild-Tab, a large-scale\nbenchmark tailored for OOD generalization in tabular regression tasks. The\nbenchmark incorporates 3 industrial datasets sourced from fields like weather\nprediction and power consumption estimation, providing a challenging testbed\nfor evaluating OOD performance under real-world conditions. Our extensive\nexperiments, evaluating 10 distinct OOD generalization methods on Wild-Tab,\nreveal nuanced insights. We observe that many of these methods often struggle\nto maintain high-performance levels on unseen data, with OOD performance\nshowing a marked drop compared to in-distribution performance. At the same\ntime, Empirical Risk Minimization (ERM), despite its simplicity, delivers\nrobust performance across all evaluations, rivaling the results of\nstate-of-the-art methods. 
Looking forward, we hope that the release of Wild-Tab\nwill facilitate further research on OOD generalization and aid in the\ndeployment of machine learning models in various real-world contexts where\nhandling distribution shifts is a crucial requirement.\n","authors":["Sergey Kolesnikov"],"pdf_url":"https://arxiv.org/pdf/2312.01792v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.18534v3","updated":"2023-12-04T10:20:40Z","published":"2023-10-27T23:18:44Z","title":"Multi Time Scale World Models","summary":" Intelligent agents use internal world models to reason and make predictions\nabout different courses of their actions at many scales. Devising learning\nparadigms and architectures that allow machines to learn world models that\noperate at multiple levels of temporal abstractions while dealing with complex\nuncertainty predictions is a major technical hurdle. In this work, we propose a\nprobabilistic formalism to learn multi-time scale world models which we call\nthe Multi Time Scale State Space (MTS3) model. Our model uses a computationally\nefficient inference scheme on multiple time scales for highly accurate\nlong-horizon predictions and uncertainty estimates over several seconds into\nthe future. Our experiments, which focus on action conditional long horizon\nfuture predictions, show that MTS3 outperforms recent methods on several system\nidentification benchmarks including complex simulated and real-world dynamical\nsystems. Code is available at this repository: https://github.com/ALRhub/MTS3.\n","authors":["Vaisakh Shaj","Saleh Gholam Zadeh","Ozan Demir","Luiz Ricardo Douat","Gerhard Neumann"],"pdf_url":"https://arxiv.org/pdf/2310.18534v3.pdf","comment":"Accepted as spotlight at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2307.10936v2","updated":"2023-12-04T10:15:26Z","published":"2023-07-20T15:09:06Z","title":"PASTA: Pretrained Action-State Transformer Agents","summary":" Self-supervised learning has brought about a revolutionary paradigm shift in\nvarious computing domains, including NLP, vision, and biology. Recent\napproaches involve pre-training transformer models on vast amounts of unlabeled\ndata, serving as a starting point for efficiently solving downstream tasks. In\nreinforcement learning, researchers have recently adapted these approaches,\ndeveloping models pre-trained on expert trajectories. This advancement enables\nthe models to tackle a broad spectrum of tasks, ranging from robotics to\nrecommendation systems. However, existing methods mostly rely on intricate\npre-training objectives tailored to specific downstream applications. This\npaper conducts a comprehensive investigation of models, referred to as\npre-trained action-state transformer agents (PASTA). Our study covers a unified\nmethodology and covers an extensive set of general downstream tasks including\nbehavioral cloning, offline RL, sensor failure robustness, and dynamics change\nadaptation. Our objective is to systematically compare various design choices\nand offer valuable insights that will aid practitioners in developing robust\nmodels. Key highlights of our study include tokenization at the component level\nfor actions and states, the use of fundamental pre-training objectives such as\nnext token prediction or masked language modeling, simultaneous training of\nmodels across multiple domains, and the application of various fine-tuning\nstrategies. 
In this study, the developed models contain fewer than 7 million\nparameters allowing a broad community to use these models and reproduce our\nexperiments. We hope that this study will encourage further research into the\nuse of transformers with first principle design choices to represent RL\ntrajectories and contribute to robust policy learning.\n","authors":["Raphael Boige","Yannis Flet-Berliac","Arthur Flajolet","Guillaume Richard","Thomas Pierrot"],"pdf_url":"https://arxiv.org/pdf/2307.10936v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.02658v3","updated":"2023-12-04T10:12:10Z","published":"2022-11-04T07:45:48Z","title":"Dealing with Drift of Adaptation Spaces in Learning-based Self-Adaptive\n Systems using Lifelong Self-Adaptation","summary":" Recently, machine learning (ML) has become a popular approach to support\nself-adaptation. ML has been used to deal with several problems in\nself-adaptation, such as maintaining an up-to-date runtime model under\nuncertainty and scalable decision-making. Yet, exploiting ML comes with\ninherent challenges. In this paper, we focus on a particularly important\nchallenge for learning-based self-adaptive systems: drift in adaptation spaces.\nWith adaptation space we refer to the set of adaptation options a self-adaptive\nsystem can select from at a given time to adapt based on the estimated quality\nproperties of the adaptation options. Drift of adaptation spaces originates\nfrom uncertainties, affecting the quality properties of the adaptation options.\nSuch drift may imply that eventually no adaptation option can satisfy the\ninitial set of the adaptation goals, deteriorating the quality of the system,\nor adaptation options may emerge that allow enhancing the adaptation goals. In\nML, such shift corresponds to novel class appearance, a type of concept drift\nin target data that common ML techniques have problems dealing with. To tackle\nthis problem, we present a novel approach to self-adaptation that enhances\nlearning-based self-adaptive systems with a lifelong ML layer. We refer to this\napproach as lifelong self-adaptation. The lifelong ML layer tracks the system\nand its environment, associates this knowledge with the current tasks,\nidentifies new tasks based on differences, and updates the learning models of\nthe self-adaptive system accordingly. A human stakeholder may be involved to\nsupport the learning process and adjust the learning and goal models. We\npresent a general architecture for lifelong self-adaptation and apply it to the\ncase of drift of adaptation spaces that affects the decision-making in\nself-adaptation. We validate the approach for a series of scenarios using the\nDeltaIoT exemplar.\n","authors":["Omid Gheibi","Danny Weyns"],"pdf_url":"https://arxiv.org/pdf/2211.02658v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.07816v2","updated":"2023-12-04T10:08:57Z","published":"2023-07-15T14:46:43Z","title":"Minimal Random Code Learning with Mean-KL Parameterization","summary":" This paper studies the qualitative behavior and robustness of two variants of\nMinimal Random Code Learning (MIRACLE) used to compress variational Bayesian\nneural networks. MIRACLE implements a powerful, conditionally Gaussian\nvariational approximation for the weight posterior $Q_{\\mathbf{w}}$ and uses\nrelative entropy coding to compress a weight sample from the posterior using a\nGaussian coding distribution $P_{\\mathbf{w}}$. 
To achieve the desired\ncompression rate, $D_{\\mathrm{KL}}[Q_{\\mathbf{w}} \\Vert P_{\\mathbf{w}}]$ must\nbe constrained, which requires a computationally expensive annealing procedure\nunder the conventional mean-variance (Mean-Var) parameterization for\n$Q_{\\mathbf{w}}$. Instead, we parameterize $Q_{\\mathbf{w}}$ by its mean and KL\ndivergence from $P_{\\mathbf{w}}$ to constrain the compression cost to the\ndesired value by construction. We demonstrate that variational training with\nMean-KL parameterization converges twice as fast and maintains predictive\nperformance after compression. Furthermore, we show that Mean-KL leads to more\nmeaningful variational distributions with heavier tails and compressed weight\nsamples which are more robust to pruning.\n","authors":["Jihao Andreas Lin","Gergely Flamich","José Miguel Hernández-Lobato"],"pdf_url":"https://arxiv.org/pdf/2307.07816v2.pdf","comment":"ICML Neural Compression Workshop 2023"},{"id":"http://arxiv.org/abs/2312.01753v1","updated":"2023-12-04T09:27:03Z","published":"2023-12-04T09:27:03Z","title":"Long-Tail Learning with Rebalanced Contrastive Loss","summary":" Integrating supervised contrastive loss to cross entropy-based communication\nhas recently been proposed as a solution to address the long-tail learning\nproblem. However, when the class imbalance ratio is high, it requires adjusting\nthe supervised contrastive loss to support the tail classes, as the\nconventional contrastive learning is biased towards head classes by default. To\nthis end, we present Rebalanced Contrastive Learning (RCL), an efficient means\nto increase the long tail classification accuracy by addressing three main\naspects: 1. Feature space balancedness - Equal division of the feature space\namong all the classes, 2. Intra-Class compactness - Reducing the distance\nbetween same-class embeddings, 3. Regularization - Enforcing larger margins for\ntail classes to reduce overfitting. RCL adopts class frequency-based SoftMax\nloss balancing to supervised contrastive learning loss and exploits scalar\nmultiplied features fed to the contrastive learning loss to enforce\ncompactness. We implement RCL on the Balanced Contrastive Learning (BCL)\nFramework, which has the SOTA performance. Our experiments on three benchmark\ndatasets demonstrate the richness of the learnt embeddings and increased top-1\nbalanced accuracy RCL provides to the BCL framework. We further demonstrate\nthat the performance of RCL as a standalone loss also achieves state-of-the-art\nlevel accuracy.\n","authors":["Charika De Alvis","Dishanika Denipitiyage","Suranga Seneviratne"],"pdf_url":"https://arxiv.org/pdf/2312.01753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01739v1","updated":"2023-12-04T09:03:06Z","published":"2023-12-04T09:03:06Z","title":"Divide-and-Conquer Strategy for Large-Scale Dynamic Bayesian Network\n Structure Learning","summary":" Dynamic Bayesian Networks (DBNs), renowned for their interpretability, have\nbecome increasingly vital in representing complex stochastic processes in\nvarious domains such as gene expression analysis, healthcare, and traffic\nprediction. Structure learning of DBNs from data is challenging, particularly\nfor datasets with thousands of variables. Most current algorithms for DBN\nstructure learning are adaptations from those used in static Bayesian Networks\n(BNs), and are typically focused on small-scale problems. 
In order to solve\nlarge-scale problems while taking full advantage of existing algorithms, this\npaper introduces a novel divide-and-conquer strategy, originally developed for\nstatic BNs, and adapts it for large-scale DBN structure learning. In this work,\nwe specifically concentrate on 2 Time-sliced Bayesian Networks (2-TBNs), a\nspecial class of DBNs. Furthermore, we leverage the prior knowledge of 2-TBNs\nto enhance the performance of the strategy we introduce. Our approach\nsignificantly improves the scalability and accuracy of 2-TBN structure\nlearning. Experimental results demonstrate the effectiveness of our method,\nshowing substantial improvements over existing algorithms in both computational\nefficiency and structure learning accuracy. On problem instances with more than\n1,000 variables, our approach improves two accuracy metrics by 74.45% and\n110.94% on average , respectively, while reducing runtime by 93.65% on average.\n","authors":["Hui Ouyang","Cheng Chen","Ke Tang"],"pdf_url":"https://arxiv.org/pdf/2312.01739v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.17100v3","updated":"2023-12-04T09:01:53Z","published":"2023-06-29T16:57:22Z","title":"RL4CO: a Unified Reinforcement Learning for Combinatorial Optimization\n Library","summary":" Deep reinforcement learning offers notable benefits in addressing\ncombinatorial problems over traditional solvers, reducing the reliance on\ndomain-specific knowledge and expert solutions, and improving computational\nefficiency. Despite the recent surge in interest in neural combinatorial\noptimization, practitioners often do not have access to a standardized code\nbase. Moreover, different algorithms are frequently based on fragmentized\nimplementations that hinder reproducibility and fair comparison. To address\nthese challenges, we introduce RL4CO, a unified Reinforcement Learning (RL) for\nCombinatorial Optimization (CO) library. We employ state-of-the-art software\nand best practices in implementation, such as modularity and configuration\nmanagement, to be flexible, easily modifiable, and extensible by researchers.\nThanks to our unified codebase, we benchmark baseline RL solvers with different\nevaluation schemes on zero-shot performance, generalization, and adaptability\non diverse tasks. Notably, we find that some recent methods may fall behind\ntheir predecessors depending on the evaluation settings. We hope RL4CO will\nencourage the exploration of novel solutions to complex real-world tasks,\nallowing the community to compare with existing methods through a unified\nframework that decouples the science from software engineering. We open-source\nour library at https://github.com/ai4co/rl4co.\n","authors":["Federico Berto","Chuanbo Hua","Junyoung Park","Minsu Kim","Hyeonah Kim","Jiwoo Son","Haeyeon Kim","Joungho Kim","Jinkyoo Park"],"pdf_url":"https://arxiv.org/pdf/2306.17100v3.pdf","comment":"Accepted as a workshop paper at the NeurIPS 2023 GLFrontiers Workshop\n (Oral)"},{"id":"http://arxiv.org/abs/2307.08596v2","updated":"2023-12-04T09:01:14Z","published":"2023-07-14T07:09:57Z","title":"Omnipotent Adversarial Training in the Wild","summary":" Adversarial training is an important topic in robust deep learning, but the\ncommunity lacks attention to its practical usage. In this paper, we aim to\nresolve a real-world challenge, i.e., training a model on an imbalanced and\nnoisy dataset to achieve high clean accuracy and adversarial robustness, with\nour proposed Omnipotent Adversarial Training (OAT) strategy. 
OAT consists of\ntwo innovative methodologies to address the imperfection in the training set.\nWe first introduce an oracle into the adversarial training process to help the\nmodel learn a correct data-label conditional distribution. This\ncarefully-designed oracle can provide correct label annotations for adversarial\ntraining. We further propose logits adjustment adversarial training to overcome\nthe data imbalance issue, which can help the model learn a Bayes-optimal\ndistribution. Our comprehensive evaluation results show that OAT outperforms\nother baselines by more than 20% clean accuracy improvement and 10% robust\naccuracy improvement under complex combinations of data imbalance and label\nnoise scenarios. The code can be found in https://github.com/GuanlinLee/OAT.\n","authors":["Guanlin Li","Kangjie Chen","Yuan Xu","Han Qiu","Tianwei Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.08596v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10205v2","updated":"2023-12-04T08:53:58Z","published":"2023-07-14T07:01:48Z","title":"Alleviating the Effect of Data Imbalance on Adversarial Training","summary":" In this paper, we study adversarial training on datasets that obey the\nlong-tailed distribution, which is practical but rarely explored in previous\nworks. Compared with conventional adversarial training on balanced datasets,\nthis process falls into the dilemma of generating uneven adversarial examples\n(AEs) and an unbalanced feature embedding space, causing the resulting model to\nexhibit low robustness and accuracy on tail data. To combat that, we\ntheoretically analyze the lower bound of the robust risk to train a model on a\nlong-tailed dataset to obtain the key challenges in addressing the\naforementioned dilemmas. Based on it, we propose a new adversarial training\nframework -- Re-balancing Adversarial Training (REAT). This framework consists\nof two components: (1) a new training strategy inspired by the effective number\nto guide the model to generate more balanced and informative AEs; (2) a\ncarefully constructed penalty function to force a satisfactory feature space.\nEvaluation results on different datasets and model structures prove that REAT\ncan effectively enhance the model's robustness and preserve the model's clean\naccuracy. The code can be found in https://github.com/GuanlinLee/REAT.\n","authors":["Guanlin Li","Guowen Xu","Tianwei Zhang"],"pdf_url":"https://arxiv.org/pdf/2307.10205v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15984v2","updated":"2023-12-04T08:50:31Z","published":"2023-08-30T12:13:13Z","title":"Learning Structure-from-Motion with Graph Attention Networks","summary":" In this paper we tackle the problem of learning Structure-from-Motion (SfM)\nthrough the use of graph attention networks. SfM is a classic computer vision\nproblem that is solved though iterative minimization of reprojection errors,\nreferred to as Bundle Adjustment (BA), starting from a good initialization. In\norder to obtain a good enough initialization to BA, conventional methods rely\non a sequence of sub-problems (such as pairwise pose estimation, pose averaging\nor triangulation) which provides an initial solution that can then be refined\nusing BA. In this work we replace these sub-problems by learning a model that\ntakes as input the 2D keypoints detected across multiple views, and outputs the\ncorresponding camera poses and 3D keypoint coordinates. 
Our model takes\nadvantage of graph neural networks to learn SfM-specific primitives, and we\nshow that it can be used for fast inference of the reconstruction for new and\nunseen sequences. The experimental results show that the proposed model\noutperforms competing learning-based methods, and challenges COLMAP while\nhaving lower runtime.\n","authors":["Lucas Brynte","José Pedro Iglesias","Carl Olsson","Fredrik Kahl"],"pdf_url":"https://arxiv.org/pdf/2308.15984v2.pdf","comment":"Added additional metrics"},{"id":"http://arxiv.org/abs/2312.01729v1","updated":"2023-12-04T08:38:54Z","published":"2023-12-04T08:38:54Z","title":"EdgeConvFormer: Dynamic Graph CNN and Transformer based Anomaly\n Detection in Multivariate Time Series","summary":" Transformer-based models for anomaly detection in multivariate time series\ncan benefit from the self-attention mechanism due to its advantage in modeling\nlong-term dependencies. However, Transformer-based anomaly detection models\nhave several problems: a large amount of data is required for training,\nstandard positional encoding is not suitable for multivariate time series data,\nand the interdependence between time series is not considered. To address these\nlimitations, we propose a novel anomaly detection method, named EdgeConvFormer,\nwhich integrates Time2vec embedding, stacked dynamic graph CNN, and Transformer\nto extract global and local spatial-time information. This design of\nEdgeConvFormer empowers it with decomposition capacities for complex time\nseries, progressive spatiotemporal correlation discovery between time series,\nand representation aggregation of multi-scale features. Experiments demonstrate\nthat EdgeConvFormer can learn the spatial-temporal correlations from\nmultivariate time series data and achieve better anomaly detection performance\nthan the state-of-the-art approaches on many real-world datasets of different\nscales.\n","authors":["Jie Liu","Qilin Li","Senjian An","Bradley Ezard","Ling Li"],"pdf_url":"https://arxiv.org/pdf/2312.01729v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01728v1","updated":"2023-12-04T08:35:31Z","published":"2023-12-04T08:35:31Z","title":"ImputeFormer: Graph Transformers for Generalizable Spatiotemporal\n Imputation","summary":" This paper focuses on the multivariate time series imputation problem using\ndeep neural architectures. The ubiquitous issue of missing data in both\nscientific and engineering tasks necessitates the development of an effective\nand general imputation model. Leveraging the wisdom and expertise garnered from\nlow-rank imputation methods, we power the canonical Transformers with three key\nknowledge-driven enhancements, including projected temporal attention, global\nadaptive graph convolution, and Fourier imputation loss. These task-agnostic\ninductive biases exploit the inherent structures of incomplete time series, and\nthus make our model versatile for a variety of imputation problems. We\ndemonstrate its superiority in terms of accuracy, efficiency, and flexibility\non heterogeneous datasets, including traffic speed, traffic volume, solar\nenergy, smart metering, and air quality. Comprehensive case studies are\nperformed to further strengthen the interpretability. 
Promising empirical\nresults provide strong conviction that incorporating time series primitives,\nsuch as low-rank properties, can substantially facilitate the development of a\ngeneralizable model to approach a wide range of spatiotemporal imputation\nproblems.\n","authors":["Tong Nie","Guoyang Qin","Yuewen Mei","Jian Sun"],"pdf_url":"https://arxiv.org/pdf/2312.01728v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02066v3","updated":"2023-12-04T08:34:48Z","published":"2023-10-03T14:09:15Z","title":"De Novo Drug Design with Joint Transformers","summary":" De novo drug design requires simultaneously generating novel molecules\noutside of training data and predicting their target properties, making it a\nhard task for generative models. To address this, we propose Joint Transformer\nthat combines a Transformer decoder, Transformer encoder, and a predictor in a\njoint generative model with shared weights. We formulate a probabilistic\nblack-box optimization algorithm that employs Joint Transformer to generate\nnovel molecules with improved target properties and outperforms other\nSMILES-based optimization methods in de novo drug design.\n","authors":["Adam Izdebski","Ewelina Weglarz-Tomczak","Ewa Szczurek","Jakub M. Tomczak"],"pdf_url":"https://arxiv.org/pdf/2310.02066v3.pdf","comment":"Accepted to NeurIPS 2023 Generative AI and Biology Workshop"},{"id":"http://arxiv.org/abs/2312.01721v1","updated":"2023-12-04T08:23:00Z","published":"2023-12-04T08:23:00Z","title":"The Self-Loop Paradox: Investigating the Impact of Self-Loops on Graph\n Neural Networks","summary":" Many Graph Neural Networks (GNNs) add self-loops to a graph to include\nfeature information about a node itself at each layer. However, if the GNN\nconsists of more than one layer, this information can return to its origin via\ncycles in the graph topology. Intuition suggests that this \"backflow\" of\ninformation should be larger in graphs with self-loops compared to graphs\nwithout. In this work, we counter this intuition and show that for certain GNN\narchitectures, the information a node gains from itself can be smaller in\ngraphs with self-loops compared to the same graphs without. We adopt an\nanalytical approach for the study of statistical graph ensembles with a given\ndegree sequence and show that this phenomenon, which we call the self-loop\nparadox, can depend both on the number of GNN layers $k$ and whether $k$ is\neven or odd. We experimentally validate our theoretical findings in a synthetic\nnode classification task and investigate its practical relevance in 23\nreal-world graphs.\n","authors":["Moritz Lampert","Ingo Scholtes"],"pdf_url":"https://arxiv.org/pdf/2312.01721v1.pdf","comment":"Presented at the Second Learning on Graphs Conference (LoG 2023) as\n extended abstract"},{"id":"http://arxiv.org/abs/2312.01699v1","updated":"2023-12-04T07:39:05Z","published":"2023-12-04T07:39:05Z","title":"Rethinking Urban Mobility Prediction: A Super-Multivariate Time Series\n Forecasting Approach","summary":" Long-term urban mobility predictions play a crucial role in the effective\nmanagement of urban facilities and services. Conventionally, urban mobility\ndata has been structured as spatiotemporal videos, treating longitude and\nlatitude grids as fundamental pixels. Consequently, video prediction methods,\nrelying on Convolutional Neural Networks (CNNs) and Vision Transformers (ViTs),\nhave been instrumental in this domain. In our research, we introduce a fresh\nperspective on urban mobility prediction. 
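As the next sentences explain, this perspective treats each grid cell of each channel as its own time series; that reframing amounts to a simple reshape, sketched below with invented shapes (NumPy assumed).

```python
# A spatiotemporal "video" tensor (time, channels, height, width) flattened so
# every grid cell of every channel becomes an individual univariate series.
import numpy as np

T, C, H, W = 168, 2, 32, 32          # e.g., one week of hourly inflow/outflow grids
video = np.random.rand(T, C, H, W)

# Super-multivariate view: (time, C * H * W) -- each column is one time series.
series = video.reshape(T, C * H * W)
print(series.shape)                   # (168, 2048)

# Recover the grid layout of a forecast with the inverse reshape.
forecast = series[-24:].reshape(24, C, H, W)
print(forecast.shape)                 # (24, 2, 32, 32)
```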
Instead of oversimplifying urban\nmobility data as traditional video data, we regard it as a complex multivariate\ntime series. This perspective involves treating the time-varying values of each\ngrid in each channel as individual time series, necessitating a thorough\nexamination of temporal dynamics, cross-variable correlations, and\nfrequency-domain insights for precise and reliable predictions. To address this\nchallenge, we present the Super-Multivariate Urban Mobility Transformer\n(SUMformer), which utilizes a specially designed attention mechanism to\ncalculate temporal and cross-variable correlations and reduce computational\ncosts stemming from a large number of time series. SUMformer also employs\nlow-frequency filters to extract essential information for long-term\npredictions. Furthermore, SUMformer is structured with a temporal patch merge\nmechanism, forming a hierarchical framework that enables the capture of\nmulti-scale correlations. Consequently, it excels in urban mobility pattern\nmodeling and long-term prediction, outperforming current state-of-the-art\nmethods across three real-world datasets.\n","authors":["Jinguo Cheng","Ke Li","Yuxuan Liang","Lijun Sun","Junchi Yan","Yuankai Wu"],"pdf_url":"https://arxiv.org/pdf/2312.01699v1.pdf","comment":"14 pages,9 figures"},{"id":"http://arxiv.org/abs/2310.18940v2","updated":"2023-12-04T07:34:35Z","published":"2023-10-29T09:02:57Z","title":"Language Agents with Reinforcement Learning for Strategic Play in the\n Werewolf Game","summary":" Agents built with large language models (LLMs) have recently achieved great\nadvancements. However, most of the efforts focus on single-agent or cooperative\nsettings, leaving more general multi-agent environments underexplored. We\npropose a new framework powered by reinforcement learning (RL) to develop\nstrategic language agents, i.e., LLM-based agents with strategic thinking\nability, for a popular language game, Werewolf. Werewolf is a social deduction\ngame with hidden roles that involves both cooperation and competition and\nemphasizes deceptive communication and diverse gameplay. Our agent tackles this\ngame by first using LLMs to reason about potential deceptions and generate a\nset of strategically diverse actions. Then an RL policy, which selects an\naction from the candidates, is learned by population-based training to enhance\nthe agents' decision-making ability. By combining LLMs with the RL policy, our\nagent produces a variety of emergent strategies, achieves the highest win rate\nagainst other LLM-based agents, and stays robust against adversarial human\nplayers in the Werewolf game.\n","authors":["Zelai Xu","Chao Yu","Fei Fang","Yu Wang","Yi Wu"],"pdf_url":"https://arxiv.org/pdf/2310.18940v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01692v1","updated":"2023-12-04T07:29:44Z","published":"2023-12-04T07:29:44Z","title":"Risk-Controlling Model Selection via Guided Bayesian Optimization","summary":" Adjustable hyperparameters of machine learning models typically impact\nvarious key trade-offs such as accuracy, fairness, robustness, or inference\ncost. Our goal in this paper is to find a configuration that adheres to\nuser-specified limits on certain risks while being useful with respect to other\nconflicting metrics. We solve this by combining Bayesian Optimization (BO) with\nrigorous risk-controlling procedures, where our core idea is to steer BO\ntowards an efficient testing strategy. 
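The next sentences describe selecting among Pareto-optimal configurations and statistically verifying them; as a toy illustration of that selection pattern only (not the paper's procedure, with invented risk and utility numbers), consider:

```python
# Toy sketch: keep Pareto-optimal (risk, utility) configurations inside a
# region of interest, then pick the best-utility one that passes a risk cap.
import numpy as np

rng = np.random.default_rng(0)
risk = rng.uniform(0.0, 0.3, size=50)                 # e.g., validation error rate
utility = 1.0 - risk + rng.normal(0, 0.05, size=50)   # a competing metric

def pareto_mask(risk, utility):
    keep = np.ones(len(risk), dtype=bool)
    for i in range(len(risk)):
        dominated = (risk <= risk[i]) & (utility >= utility[i])
        dominated[i] = False
        keep[i] = not dominated.any()
    return keep

mask = pareto_mask(risk, utility) & (risk <= 0.15)    # region of interest: risk <= 0.15
candidates = np.flatnonzero(mask)
best = candidates[np.argmax(utility[candidates])]
print("selected configuration:", best, "risk:", round(float(risk[best]), 3))
```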
Our BO method identifies a set of Pareto\noptimal configurations residing in a designated region of interest. The\nresulting candidates are statistically verified and the best-performing\nconfiguration is selected with guaranteed risk levels. We demonstrate the\neffectiveness of our approach on a range of tasks with multiple desiderata,\nincluding low error rates, equitable predictions, handling spurious\ncorrelations, managing rate and distortion in generative models, and reducing\ncomputational costs.\n","authors":["Bracha Laufer-Goldshtein","Adam Fisch","Regina Barzilay","Tommi Jaakkola"],"pdf_url":"https://arxiv.org/pdf/2312.01692v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01691v1","updated":"2023-12-04T07:25:55Z","published":"2023-12-04T07:25:55Z","title":"Estimating Coronal Mass Ejection Mass and Kinetic Energy by Fusion of\n Multiple Deep-learning Models","summary":" Coronal mass ejections (CMEs) are massive solar eruptions, which have a\nsignificant impact on Earth. In this paper, we propose a new method, called\nDeepCME, to estimate two properties of CMEs, namely, CME mass and kinetic\nenergy. Being able to estimate these properties helps better understand CME\ndynamics. Our study is based on the CME catalog maintained at the Coordinated\nData Analysis Workshops (CDAW) Data Center, which contains all CMEs manually\nidentified since 1996 using the Large Angle and Spectrometric Coronagraph\n(LASCO) on board the Solar and Heliospheric Observatory (SOHO). We use LASCO C2\ndata in the period between January 1996 and December 2020 to train, validate\nand test DeepCME through 10-fold cross validation. The DeepCME method is a\nfusion of three deep learning models, including ResNet, InceptionNet, and\nInceptionResNet. Our fusion model extracts features from LASCO C2 images,\neffectively combining the learning capabilities of the three component models\nto jointly estimate the mass and kinetic energy of CMEs. Experimental results\nshow that the fusion model yields a mean relative error (MRE) of 0.013 (0.009,\nrespectively) compared to the MRE of 0.019 (0.017, respectively) of the best\ncomponent model InceptionResNet (InceptionNet, respectively) in estimating the\nCME mass (kinetic energy, respectively). To our knowledge, this is the first\ntime that deep learning has been used for CME mass and kinetic energy\nestimations.\n","authors":["Khalid A. Alobaid","Yasser Abduallah","Jason T. L. Wang","Haimin Wang","Shen Fan","Jialiang Li","Huseyin Cavus","Vasyl Yurchyshyn"],"pdf_url":"https://arxiv.org/pdf/2312.01691v1.pdf","comment":"10 pages, 7 figures"},{"id":"http://arxiv.org/abs/2312.01687v1","updated":"2023-12-04T07:21:27Z","published":"2023-12-04T07:21:27Z","title":"Optimizing Bus Travel: A Novel Approach to Feature Mining with P-KMEANS\n and P-LDA Algorithms","summary":" Customizing services for bus travel can bolster its attractiveness, optimize\nusage, alleviate traffic congestion, and diminish carbon emissions. This\npotential is realized by harnessing recent advancements in positioning\ncommunication facilities, the Internet of Things, and artificial intelligence\nfor feature mining in public transportation. However, the inherent complexities\nof disorganized and unstructured public transportation data introduce\nsubstantial challenges to travel feature extraction. 
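The method described in the remainder of this entry builds on KMeans clustering and Latent Dirichlet Allocation over POI-augmented travel records. As a generic illustration of those two building blocks only (scikit-learn, with an invented toy vocabulary; not the paper's P-KMEANS / P-LDA variants):

```python
# Cluster POI-augmented trips with KMeans and extract latent "travel topics"
# from POI labels with LDA; data and parameters are illustrative placeholders.
import numpy as np
from sklearn.cluster import KMeans
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

trips = [
    "school residential bus_stop park",
    "office mall bus_stop restaurant",
    "hospital pharmacy residential",
    "office gym restaurant bus_stop",
]
counts = CountVectorizer().fit_transform(trips)        # trip-by-POI count matrix

clusters = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(counts)
topics = LatentDirichletAllocation(n_components=2, random_state=0).fit_transform(counts)
print(clusters)
print(topics.round(2))
```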
This study presents a bus\ntravel feature extraction method rooted in Point of Interest (POI) data,\nemploying enhanced P-KMEANS and P-LDA algorithms to overcome these limitations.\nWhile the KMEANS algorithm adeptly segments passenger travel paths into\ndistinct clusters, its outcomes can be influenced by the initial K value. On\nthe other hand, Latent Dirichlet Allocation (LDA) excels at feature\nidentification and probabilistic interpretations yet encounters difficulties\nwith feature intermingling and nuanced sub-feature interactions. Incorporating\nthe POI dimension enhances our understanding of travel behavior, aligning it\nmore closely with passenger attributes and facilitating easier data analysis.\nBy incorporating POI data, our refined P-KMEANS and P-LDA algorithms grant a\nholistic insight into travel behaviors and attributes, effectively mitigating\nthe limitations above. Consequently, this POI-centric algorithm effectively\namalgamates diverse POI attributes, delineates varied travel contexts, and\nimparts probabilistic metrics to feature properties. Our method successfully\nmines the diverse aspects of bus travel, such as age, occupation, gender,\nsports, cost, safety, and personality traits. It effectively calculates\nrelationships between individual travel behaviors and assigns explanatory and\nevaluative probabilities to POI labels, thereby enhancing bus travel\noptimization.\n","authors":["Hongjie Liu","Haotian Shi","Sicheng Fu","Tengfei Yuan","Xinhuan Zhang","Hongzhe Xu","Bin Ran"],"pdf_url":"https://arxiv.org/pdf/2312.01687v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01679v1","updated":"2023-12-04T07:04:20Z","published":"2023-12-04T07:04:20Z","title":"Adversarial Medical Image with Hierarchical Feature Hiding","summary":" Deep learning based methods for medical images can be easily compromised by\nadversarial examples (AEs), posing a great security flaw in clinical\ndecision-making. It has been discovered that conventional adversarial attacks\nlike PGD, which optimize the classification logits, are easy to distinguish in\nthe feature space, resulting in accurate reactive defenses. To better\nunderstand this phenomenon and reassess the reliability of the reactive\ndefenses for medical AEs, we thoroughly investigate the characteristics of\nconventional medical AEs. Specifically, we first theoretically prove that\nconventional adversarial attacks change the outputs by continuously optimizing\nvulnerable features in a fixed direction, thereby leading to outlier\nrepresentations in the feature space. Then, a stress test is conducted to\nreveal the vulnerability of medical images, by comparing with natural images.\nInterestingly, this vulnerability is a double-edged sword, which can be\nexploited to hide AEs. We then propose a simple-yet-effective hierarchical\nfeature constraint (HFC), a novel add-on to conventional white-box attacks,\nwhich helps hide the adversarial feature in the target feature\ndistribution. The proposed method is evaluated on three medical datasets, both\n2D and 3D, with different modalities. The experimental results demonstrate the\nsuperiority of HFC, \\emph{i.e.,} it bypasses an array of state-of-the-art\nadversarial medical AE detectors more efficiently than competing adaptive\nattacks, which reveals the deficiencies of medical reactive defense and allows\nmore robust defenses to be developed in the future.\n","authors":["Qingsong Yao","Zecheng He","Yuexiang Li","Yi Lin","Kai Ma","Yefeng Zheng","S. 
Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.01679v1.pdf","comment":"Our code is available at\n \\url{https://github.com/qsyao/Hierarchical_Feature_Constraint}"},{"id":"http://arxiv.org/abs/2312.01678v1","updated":"2023-12-04T07:01:54Z","published":"2023-12-04T07:01:54Z","title":"Jellyfish: A Large Language Model for Data Preprocessing","summary":" In this paper, we present Jellyfish, an open-source LLM as a universal task\nsolver for data preprocessing (DP). Built on the Llama 2 13B model, Jellyfish is\ninstruction-tuned with the datasets of several typical DP tasks including error\ndetection, data imputation, schema matching, and entity matching, and delivers\ngeneralizability to other tasks. Remarkably, Jellyfish can operate on a local,\nsingle, and low-priced GPU with its 13 billion parameters, ensuring data\nsecurity and enabling further tuning. Its proficiency in understanding natural\nlanguage allows users to manually craft instructions for DP tasks. Unlike many\nexisting methods that heavily rely on prior knowledge, Jellyfish acquires domain\nknowledge during its tuning process and integrates optional knowledge injection\nduring inference. A distinctive feature of Jellyfish is its interpreter, which\nelucidates its output decisions. To construct Jellyfish, we develop a series of\npre-tuning and DP-tuning techniques. Jellyfish is equipped with an instance\nserializer, which automatically translates raw data into model prompts, and a\nknowledge injector, which optionally introduces task- and dataset-specific\nknowledge to enhance DP performance. Our evaluation of Jellyfish, using a range\nof real datasets, shows its competitiveness compared to state-of-the-art\nmethods and its strong generalizability to unseen tasks. Jellyfish's\nperformance rivals that of GPT series models, and its interpreter offers\nenhanced reasoning capabilities compared to GPT-3.5. Furthermore, our\nevaluation highlights the effectiveness of the techniques employed in\nconstructing Jellyfish. Our model is available at Hugging Face:\nhttps://huggingface.co/NECOUDBFM/Jellyfish .\n","authors":["Haochen Zhang","Yuyang Dong","Chuan Xiao","Masafumi Oyamada"],"pdf_url":"https://arxiv.org/pdf/2312.01678v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.15452v5","updated":"2023-12-04T07:01:15Z","published":"2023-08-29T17:22:39Z","title":"When Do Program-of-Thoughts Work for Reasoning?","summary":" In the realm of embodied artificial intelligence, the reasoning capabilities\nof Large Language Models (LLMs) play a pivotal role. Although there are\neffective methods like program-of-thought prompting for LLMs which use\nprogramming languages to tackle complex reasoning tasks, the specific impact of\ncode data on the improvement of reasoning capabilities remains under-explored.\nTo address this gap, we propose complexity-impacted reasoning score (CIRS),\nwhich combines structural and logical attributes, to measure the correlation\nbetween code and reasoning abilities. Specifically, we use the abstract syntax\ntree to encode the structural information and calculate logical complexity by\nconsidering the difficulty and the cyclomatic complexity. Through an empirical\nanalysis, we find that not all code data, regardless of its complexity, can be\nlearned or understood by LLMs. An optimal level of complexity is critical to the\nimprovement of reasoning abilities by program-aided prompting. 
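Since CIRS is described as combining AST-derived structural information with a cyclomatic-style logical complexity, a hypothetical sketch of such structural statistics (not the actual CIRS formula) could look like:

```python
# Crude structural statistics over a Python snippet: AST node count, nesting
# depth, and a cyclomatic-complexity proxy from branching constructs.
import ast

BRANCH_NODES = (ast.If, ast.For, ast.While, ast.Try, ast.With, ast.BoolOp)

def structural_stats(source: str):
    tree = ast.parse(source)

    def depth(node, d=1):
        children = list(ast.iter_child_nodes(node))
        return d if not children else max(depth(c, d + 1) for c in children)

    num_nodes = sum(1 for _ in ast.walk(tree))
    num_branches = sum(isinstance(n, BRANCH_NODES) for n in ast.walk(tree))
    return {"nodes": num_nodes, "depth": depth(tree), "cyclomatic_proxy": 1 + num_branches}

snippet = """
def solve(xs):
    total = 0
    for x in xs:
        if x > 0:
            total += x
    return total
"""
print(structural_stats(snippet))
```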
Then we design an\nauto-synthesizing and stratifying algorithm, and apply it to instruction\ngeneration for mathematical reasoning and code data filtering for code\ngeneration tasks. Extensive results demonstrate the effectiveness of our\nproposed approach. Code will be integrated into the EasyInstruct framework at\nhttps://github.com/zjunlp/EasyInstruct.\n","authors":["Zhen Bi","Ningyu Zhang","Yinuo Jiang","Shumin Deng","Guozhou Zheng","Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2308.15452v5.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2312.01674v1","updated":"2023-12-04T06:51:46Z","published":"2023-12-04T06:51:46Z","title":"EDALearn: A Comprehensive RTL-to-Signoff EDA Benchmark for Democratized\n and Reproducible ML for EDA Research","summary":" The application of Machine Learning (ML) in Electronic Design Automation\n(EDA) for Very Large-Scale Integration (VLSI) design has garnered significant\nresearch attention. Despite the requirement for extensive datasets to build\neffective ML models, most studies are limited to smaller, internally generated\ndatasets due to the lack of comprehensive public resources. In response, we\nintroduce EDALearn, the first holistic, open-source benchmark suite\nspecifically for ML tasks in EDA. This benchmark suite presents an end-to-end\nflow from synthesis to physical implementation, enriching data collection\nacross various stages. It fosters reproducibility and promotes research into ML\ntransferability across different technology nodes. Accommodating a wide range\nof VLSI design instances and sizes, our benchmark aptly represents the\ncomplexity of contemporary VLSI designs. Additionally, we provide an in-depth\ndata analysis, enabling users to fully comprehend the attributes and\ndistribution of our data, which is essential for creating efficient ML models.\nOur contributions aim to encourage further advances in the ML-EDA domain.\n","authors":["Jingyu Pan","Chen-Chia Chang","Zhiyao Xie","Yiran Chen"],"pdf_url":"https://arxiv.org/pdf/2312.01674v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2311.18426v2","updated":"2023-12-04T06:27:04Z","published":"2023-11-30T10:24:07Z","title":"Convergence Analysis of Fractional Gradient Descent","summary":" Fractional derivatives are a well-studied generalization of integer order\nderivatives. Naturally, for optimization, it is of interest to understand the\nconvergence properties of gradient descent using fractional derivatives.\nConvergence analysis of fractional gradient descent is currently limited both\nin the methods analyzed and the settings analyzed. This paper aims to fill in\nthese gaps by analyzing variations of fractional gradient descent in smooth and\nconvex, smooth and strongly convex, and smooth and non-convex settings. First,\nnovel bounds will be established bridging fractional and integer derivatives.\nThen, these bounds will be applied to the aforementioned settings to prove\n$O(1/T)$ convergence for smooth and convex functions and linear convergence for\nsmooth and strongly convex functions. Additionally, we prove $O(1/T)$\nconvergence for smooth and non-convex functions using an extended notion of\nsmoothness that is more natural for fractional derivatives. 
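For readers unfamiliar with the fractional derivatives referred to above, a common choice is the Caputo derivative; a generic fractional gradient step then replaces the ordinary gradient, as in the illustrative update below (shown for orientation only, not necessarily the exact variant analyzed in the paper).

```latex
% Caputo fractional derivative of order \alpha \in (0,1) with base point a,
% and a generic fractional gradient descent update (illustrative form only).
\[
{}^{C}\!D^{\alpha}_{a}f(x) \;=\; \frac{1}{\Gamma(1-\alpha)} \int_{a}^{x} \frac{f'(t)}{(x-t)^{\alpha}}\,dt,
\qquad
x_{k+1} \;=\; x_{k} \;-\; \eta\, {}^{C}\!D^{\alpha}_{a} f(x_{k}).
\]
```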
Finally, empirical\nresults will be presented on the potential speed up of fractional gradient\ndescent over standard gradient descent as well as the challenges of predicting\nwhich will be faster in general.\n","authors":["Ashwani Aggarwal"],"pdf_url":"https://arxiv.org/pdf/2311.18426v2.pdf","comment":"21 pages, 4 figures. Submitted to TMLR. Updated to TMLR format. Minor\n corrections in Figure 1 description, statements of Theorem 14, 18 and\n Corollary 17. Minor clarification in statement of Theorem 10, 21. Moved most\n proofs to appendix and added sketches, moved remarks within proofs into main\n body"},{"id":"http://arxiv.org/abs/2312.01662v1","updated":"2023-12-04T06:24:49Z","published":"2023-12-04T06:24:49Z","title":"Universal Deoxidation of Semiconductor Substrates Assisted by\n Machine-Learning and Real-Time-Feedback-Control","summary":" Thin film deposition is an essential step in the semiconductor process.\nDuring preparation or loading, the substrate is exposed to the air unavoidably,\nwhich has motivated studies of the process control to remove the surface oxide\nbefore thin film deposition. Optimizing the deoxidation process in molecular\nbeam epitaxy (MBE) for a random substrate is a multidimensional challenge and\nsometimes controversial. Due to variations in semiconductor materials and\ngrowth processes, the determination of substrate deoxidation temperature is\nhighly dependent on the grower's expertise; the same substrate may yield\ninconsistent results when evaluated by different growers. Here, we employ a\nmachine learning (ML) hybrid convolution and vision transformer (CNN-ViT)\nmodel. This model utilizes reflection high-energy electron diffraction (RHEED)\nvideo as input to determine the deoxidation status of the substrate as output,\nenabling automated substrate deoxidation under a controlled architecture. This\nalso extends to the successful application of deoxidation processes on other\nsubstrates. Furthermore, we showcase the potential of models trained on data\nfrom a single MBE equipment to achieve high-accuracy deployment on other\nequipment. In contrast to traditional methods, our approach holds exceptional\npractical value. It standardizes deoxidation temperatures across various\nequipment and substrate materials, advancing the standardization research\nprocess in semiconductor preparation, a significant milestone in thin film\ngrowth technology. The concepts and methods demonstrated in this work are\nanticipated to revolutionize semiconductor manufacturing in optoelectronics and\nmicroelectronics industries by applying them to diverse material growth\nprocesses.\n","authors":["Chao Shen","Wenkang Zhan","Jian Tang","Zhaofeng Wu","Bo Xu","Chao Zhao","Zhanguo Wang"],"pdf_url":"https://arxiv.org/pdf/2312.01662v1.pdf","comment":"5 figures"},{"id":"http://arxiv.org/abs/2311.15500v2","updated":"2023-12-04T06:24:02Z","published":"2023-11-27T02:55:34Z","title":"Function-constrained Program Synthesis","summary":" This work introduces (1) a technique that allows large language models (LLMs)\nto leverage user-provided code when solving programming tasks and (2) a method\nto iteratively generate modular sub-functions that can aid future code\ngeneration attempts when the initial code generated by the LLM is inadequate.\nGenerating computer programs in general-purpose programming languages like\nPython poses a challenge for LLMs when instructed to use code provided in the\nprompt. 
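One simple way to make "use only the code provided in the prompt" checkable is a static allow-list over the calls in a candidate completion; the sketch below illustrates that idea under assumed function names and is not the authors' implementation.

```python
# Reject a generated completion if it calls anything outside a user-provided
# function set (only plain-name calls are inspected in this toy check).
import ast

ALLOWED = {"load_rows", "filter_rows", "sum_column", "print"}

def uses_only_allowed(source: str, allowed=ALLOWED) -> bool:
    for node in ast.walk(ast.parse(source)):
        if isinstance(node, ast.Call) and isinstance(node.func, ast.Name):
            if node.func.id not in allowed:
                return False
    return True

candidate = "rows = filter_rows(load_rows('data.csv'), 'price')\nprint(sum_column(rows, 'price'))"
print(uses_only_allowed(candidate))                      # True: only allowed calls
print(uses_only_allowed("rows = fetch_url('http://x')")) # False: fetch_url not allowed
```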
Code-specific LLMs (e.g., GitHub Copilot, CodeLlama2) can generate code\ncompletions in real-time by drawing on all code available in a development\nenvironment. However, restricting code-specific LLMs to use only in-context\ncode is not straightforward, as the model is not explicitly instructed to use\nthe user-provided code and users cannot highlight precisely which snippets of\ncode the model should incorporate into its context. Moreover, current systems\nlack effective recovery methods, forcing users to iteratively re-prompt the\nmodel with modified prompts until a sufficient solution is reached. Our method\ndiffers from traditional LLM-powered code-generation by constraining\ncode-generation to an explicit function set and enabling recovery from failed\nattempts through automatically generated sub-functions. When the LLM cannot\nproduce working code, we generate modular sub-functions to aid subsequent\nattempts at generating functional code. A by-product of our method is a library\nof reusable sub-functions that can solve related tasks, imitating a software\nteam where efficiency scales with experience. We also introduce a new\n\"half-shot\" evaluation paradigm that provides tighter estimates of LLMs' coding\nabilities compared to traditional zero-shot evaluation. Our proposed evaluation\nmethod encourages models to output solutions in a structured format, decreasing\nsyntax errors that can be mistaken for poor coding ability.\n","authors":["Patrick Hajali","Ignas Budvytis"],"pdf_url":"https://arxiv.org/pdf/2311.15500v2.pdf","comment":"17 pages, 6 figures, 2023 NeurIPS R0-Fomo Workshop; corrected typo on\n fig 1 caption"},{"id":"http://arxiv.org/abs/2312.01659v1","updated":"2023-12-04T06:21:22Z","published":"2023-12-04T06:21:22Z","title":"RiskBench: A Scenario-based Benchmark for Risk Identification","summary":" Intelligent driving systems aim to achieve a zero-collision mobility\nexperience, requiring interdisciplinary efforts to enhance safety performance.\nThis work focuses on risk identification, the process of identifying and\nanalyzing risks stemming from dynamic traffic participants and unexpected\nevents. While significant advances have been made in the community, the current\nevaluation of different risk identification algorithms uses independent\ndatasets, leading to difficulty in direct comparison and hindering collective\nprogress toward safety performance enhancement. To address this limitation, we\nintroduce \\textbf{RiskBench}, a large-scale scenario-based benchmark for risk\nidentification. We design a scenario taxonomy and augmentation pipeline to\nenable a systematic collection of ground truth risks under diverse scenarios.\nWe assess the ability of ten algorithms to (1) detect and locate risks, (2)\nanticipate risks, and (3) facilitate decision-making. We conduct extensive\nexperiments and summarize future research on risk identification. 
Our aim is to\nencourage collaborative endeavors in achieving a society with zero collisions.\nWe have made our dataset and benchmark toolkit publicly available on the project page:\nhttps://hcis-lab.github.io/RiskBench/\n","authors":["Chi-Hsi Kung","Chieh-Chi Yang","Pang-Yuan Pao","Shu-Wei Lu","Pin-Lun Chen","Hsin-Cheng Lu","Yi-Ting Chen"],"pdf_url":"https://arxiv.org/pdf/2312.01659v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01658v1","updated":"2023-12-04T06:20:14Z","published":"2023-12-04T06:20:14Z","title":"AGD: an Auto-switchable Optimizer using Stepwise Gradient Difference for\n Preconditioning Matrix","summary":" Adaptive optimizers, such as Adam, have achieved remarkable success in deep\nlearning. A key component of these optimizers is the so-called preconditioning\nmatrix, providing enhanced gradient information and regulating the step size of\neach gradient direction. In this paper, we propose a novel approach to\ndesigning the preconditioning matrix by utilizing the gradient difference\nbetween two successive steps as the diagonal elements. These diagonal elements\nare closely related to the Hessian and can be perceived as an approximation of\nthe inner product between the Hessian row vectors and the difference of the\nadjacent parameter vectors. Additionally, we introduce an auto-switching\nfunction that enables the preconditioning matrix to switch dynamically between\nStochastic Gradient Descent (SGD) and the adaptive optimizer. Based on these\ntwo techniques, we develop a new optimizer named AGD that enhances the\ngeneralization performance. We evaluate AGD on public datasets of Natural\nLanguage Processing (NLP), Computer Vision (CV), and Recommendation Systems\n(RecSys). Our experimental results demonstrate that AGD outperforms the\nstate-of-the-art (SOTA) optimizers, achieving highly competitive or\nsignificantly better predictive performance. Furthermore, we analyze how AGD is\nable to switch automatically between SGD and the adaptive optimizer and its\nactual effects on various scenarios. The code is available at\nhttps://github.com/intelligent-machine-learning/dlrover/tree/master/atorch/atorch/optimizers.\n","authors":["Yun Yue","Zhiling Ye","Jiadi Jiang","Yongchao Liu","Ke Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.01658v1.pdf","comment":"21 pages. Accepted as a conference paper at NeurIPS '23"},{"id":"http://arxiv.org/abs/2312.01657v1","updated":"2023-12-04T06:18:10Z","published":"2023-12-04T06:18:10Z","title":"On Tuning Neural ODE for Stability, Consistency and Faster Convergence","summary":" Neural-ODEs parameterize a differential equation using a continuous-depth\nneural network and solve it using a numerical ODE-integrator. These models offer\na constant memory cost compared to models with a discrete sequence of hidden\nlayers, in which memory cost increases linearly with the number of layers. In\naddition to memory efficiency, other benefits of neural-ODEs include adaptability\nof the evaluation approach to the input, and flexibility to choose numerical\nprecision or fast training. However, despite having all these benefits, they\nstill have some limitations. We identify the ODE-integrator (also called the\nODE-solver) as the weakest link in the chain, as it may have stability,\nconsistency and convergence (CCS) issues and may suffer from slower convergence\nor may not converge at all. We propose a first-order Nesterov's accelerated\ngradient (NAG) based ODE-solver which is proven to be tuned vis-a-vis CCS\nconditions. 
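For background on the solver just mentioned, the classical first-order Nesterov accelerated gradient recursion (standard textbook form, shown only for orientation; the paper's specific NAG-based ODE-solver is not reproduced here) is:

```latex
% Classical Nesterov accelerated gradient recursion with the usual momentum
% schedule; illustrative background, not the paper's solver formulation.
\[
y_k = x_k + \frac{k-1}{k+2}\,(x_k - x_{k-1}), \qquad
x_{k+1} = y_k - \eta \,\nabla f(y_k).
\]
```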
We empirically\ndemonstrate the efficacy of our approach by training faster, while achieving\nbetter or comparable performance against neural-ODEs employing other fixed-step\nexplicit ODE-solvers as well as discrete-depth models such as ResNet on three\ndifferent tasks, including supervised classification, density estimation, and\ntime-series modelling.\n","authors":["Sheikh Waqas Akhtar"],"pdf_url":"https://arxiv.org/pdf/2312.01657v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01653v1","updated":"2023-12-04T06:11:39Z","published":"2023-12-04T06:11:39Z","title":"An End-to-End Network Pruning Pipeline with Sparsity Enforcement","summary":" Neural networks have emerged as a powerful tool for solving complex tasks\nacross various domains, but their increasing size and computational\nrequirements have posed significant challenges in deploying them on\nresource-constrained devices. Neural network sparsification, and in particular\npruning, has emerged as an effective technique to alleviate these challenges by\nreducing model size, computational complexity, and memory footprint while\nmaintaining competitive performance. However, many pruning pipelines modify the\nstandard training pipeline at only a single stage, if at all. In this work, we\nlook to develop an end-to-end training pipeline that befits neural network\npruning and sparsification at all stages of training. To do so, we make use of\nnonstandard model parameter initialization, pre-pruning training methodologies,\nand post-pruning training optimizations. We conduct experiments utilizing\ncombinations of these methods, in addition to different techniques used in the\npruning step, and find that our combined pipeline can achieve significant gains\nover current state-of-the-art approaches to neural network sparsification.\n","authors":["Evan Dogariu"],"pdf_url":"https://arxiv.org/pdf/2312.01653v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01648v1","updated":"2023-12-04T06:01:32Z","published":"2023-12-04T06:01:32Z","title":"Characterizing Large Language Model Geometry Solves Toxicity Detection\n and Generation","summary":" Large Language Models~(LLMs) drive current AI breakthroughs despite very\nlittle being known about their internal representations, e.g., how to extract a\nfew informative features to solve various downstream tasks. To provide a\npractical and principled answer, we propose to characterize LLMs from a\ngeometric perspective. We obtain in closed form (i) the intrinsic dimension in\nwhich the Multi-Head Attention embeddings are constrained to exist and (ii) the\npartition and per-region affine mappings of the per-layer feedforward networks.\nOur results are informative, do not rely on approximations, and are actionable.\nFirst, we show that, motivated by our geometric interpretation, we can bypass\nLlama$2$'s RLHF by controlling its embedding's intrinsic dimension through\ninformed prompt manipulation. Second, we derive $7$ interpretable spline\nfeatures that can be extracted from any (pre-trained) LLM layer, providing a\nrich abstract representation of their inputs. Those features alone ($224$ for\nMistral-7B and Llama$2$-7B) are sufficient to help solve toxicity detection,\ninfer the domain of the prompt, and even tackle the Jigsaw challenge, which\naims at characterizing the type of toxicity of various prompts. Our results\ndemonstrate how, even in large-scale regimes, exact theoretical results can\nanswer practical questions in language models. 
Code:\n\\url{https://github.com/RandallBalestriero/SplineLLM}.\n","authors":["Randall Balestriero","Romain Cosentino","Sarath Shekkizhar"],"pdf_url":"https://arxiv.org/pdf/2312.01648v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01634v1","updated":"2023-12-04T05:29:28Z","published":"2023-12-04T05:29:28Z","title":"Robust Streaming, Sampling, and a Perspective on Online Learning","summary":" In this work we present an overview of statistical learning, followed by a\nsurvey of robust streaming techniques and challenges, culminating in several\nrigorous results proving the relationship that we motivate and hint at\nthroughout the journey. Furthermore, we unify often disjoint theorems in a\nshared framework and notation to clarify the deep connections that are\ndiscovered. We hope that by approaching these results from a shared\nperspective, already aware of the technical connections that exist, we can\nenlighten the study of both fields and perhaps motivate new and previously\nunconsidered directions of research.\n","authors":["Evan Dogariu","Jiatong Yu"],"pdf_url":"https://arxiv.org/pdf/2312.01634v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01624v1","updated":"2023-12-04T04:49:10Z","published":"2023-12-04T04:49:10Z","title":"GVFs in the Real World: Making Predictions Online for Water Treatment","summary":" In this paper we investigate the use of reinforcement-learning based\nprediction approaches for a real drinking-water treatment plant. Developing\nsuch a prediction system is a critical step on the path to optimizing and\nautomating water treatment. Before that, there are many questions to answer\nabout the predictability of the data, suitable neural network architectures,\nhow to overcome partial observability and more. We first describe this dataset,\nand highlight challenges with seasonality, nonstationarity, partial\nobservability, and heterogeneity across sensors and operation modes of the\nplant. We then describe General Value Function (GVF) predictions -- discounted\ncumulative sums of observations -- and highlight why they might be preferable\nto classical n-step predictions common in time series prediction. We discuss\nhow to use offline data to appropriately pre-train our temporal difference\nlearning (TD) agents that learn these GVF predictions, including how to select\nhyperparameters for online fine-tuning in deployment. We find that the\nTD-prediction agent obtains an overall lower normalized mean-squared error than\nthe n-step prediction agent. Finally, we show the importance of learning in\ndeployment, by comparing a TD agent trained purely offline with no online\nupdating to a TD agent that learns online. This final result is one of the\nfirst to motivate the importance of adapting predictions in real-time, for\nnon-stationary high-volume systems in the real world.\n","authors":["Muhammad Kamran Janjua","Haseeb Shah","Martha White","Erfan Miahi","Marlos C. Machado","Adam White"],"pdf_url":"https://arxiv.org/pdf/2312.01624v1.pdf","comment":"Published in Machine Learning (2023)"},{"id":"http://arxiv.org/abs/2312.01619v1","updated":"2023-12-04T04:20:38Z","published":"2023-12-04T04:20:38Z","title":"How Many Validation Labels Do You Need? Exploring the Design Space of\n Label-Efficient Model Ranking","summary":" The paper introduces LEMR, a framework that reduces annotation costs for\nmodel selection tasks. 
Our approach leverages ensemble methods to generate\npseudo-labels, employs uncertainty sampling for target acquisition, and\nutilizes a Z-score mechanism for iterative committee reelection to refine model\nranks. We present a systematic study across various selection metrics,\ndemonstrating that LEMR achieves comparable results to fully labeled datasets\nwith a fraction of the labeling budget. Our findings indicate that LEMR not\nonly economizes the labeling effort in weak supervision and semi-supervised\nlearning settings but also effectively guides prompt selection for large\nlanguage models. With extensive experiments across 23 tasks, we reveal that our\nframework can dramatically decrease the labeling cost without compromising the\naccuracy of model selection, thereby offering a cost-effective alternative to\ntraditional practices.\n","authors":["Zhengyu Hu","Jieyu Zhang","Yue Yu","Yuchen Zhuang","Hui Xiong"],"pdf_url":"https://arxiv.org/pdf/2312.01619v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01612v1","updated":"2023-12-04T04:03:30Z","published":"2023-12-04T04:03:30Z","title":"xNeuSM: Explainable Neural Subgraph Matching with Graph Learnable\n Multi-hop Attention Networks","summary":" Subgraph matching is a challenging problem with a wide range of applications\nin database systems, biochemistry, and cognitive science. It involves\ndetermining whether a given query graph is present within a larger target\ngraph. Traditional graph-matching algorithms provide precise results but face\nchallenges in large graph instances due to the NP-complete problem, limiting\ntheir practical applicability. In contrast, recent neural network-based\napproximations offer more scalable solutions, but often lack interpretable node\ncorrespondences. To address these limitations, this article presents xNeuSM:\nExplainable Neural Subgraph Matching which introduces Graph Learnable Multi-hop\nAttention Networks (GLeMA) that adaptively learns the parameters governing the\nattention factor decay for each node across hops rather than relying on fixed\nhyperparameters. We provide a theoretical analysis establishing error bounds\nfor GLeMA's approximation of multi-hop attention as a function of the number of\nhops. Additionally, we prove that learning distinct attention decay factors for\neach node leads to a correct approximation of multi-hop attention. Empirical\nevaluation on real-world datasets shows that xNeuSM achieves substantial\nimprovements in prediction accuracy of up to 34% compared to approximate\nbaselines and, notably, at least a seven-fold faster query time than exact\nalgorithms. The source code of our implementation is available at\nhttps://github.com/martinakaduc/xNeuSM.\n","authors":["Duc Q. Nguyen","Thanh Toan Nguyen","Tho quan"],"pdf_url":"https://arxiv.org/pdf/2312.01612v1.pdf","comment":"33 pages, 8 figures, 6 tables"},{"id":"http://arxiv.org/abs/2310.15020v2","updated":"2023-12-04T03:48:06Z","published":"2023-10-23T15:15:19Z","title":"Invariance is Key to Generalization: Examining the Role of\n Representation in Sim-to-Real Transfer for Visual Navigation","summary":" The data-driven approach to robot control has been gathering pace rapidly,\nyet generalization to unseen task domains remains a critical challenge. We\nargue that the key to generalization is representations that are (i) rich\nenough to capture all task-relevant information and (ii) invariant to\nsuperfluous variability between the training and the test domains. 
We\nexperimentally study such a representation -- containing both depth and\nsemantic information -- for visual navigation and show that it enables a\ncontrol policy trained entirely in simulated indoor scenes to generalize to\ndiverse real-world environments, both indoors and outdoors. Further, we show\nthat our representation reduces the A-distance between the training and test\ndomains, improving the generalization error bound as a result. Our proposed\napproach is scalable: the learned policy improves continuously, as the\nfoundation models that it exploits absorb more diverse data during\npre-training.\n","authors":["Bo Ai","Zhanxin Wu","David Hsu"],"pdf_url":"https://arxiv.org/pdf/2310.15020v2.pdf","comment":"11 pages, accepted by the 18th International Symposium on\n Experimental Robotics (ISER 2023) and published within the Springer\n Proceedings in Advanced Robotics (SPAR)"},{"id":"http://arxiv.org/abs/2305.19125v3","updated":"2023-12-04T03:47:17Z","published":"2023-05-30T15:36:37Z","title":"Graph Generation with $K^2$-trees","summary":" Generating graphs from a target distribution is a significant challenge\nacross many domains, including drug discovery and social network analysis. In\nthis work, we introduce a novel graph generation method leveraging the $K^2$-tree\nrepresentation, originally designed for lossless graph compression. The\n$K^2$-tree representation encompasses inherent hierarchy while enabling\ncompact graph generation. In addition, we make contributions by (1) presenting\na sequential $K^2$-tree representation that incorporates pruning, flattening,\nand tokenization processes and (2) introducing a Transformer-based architecture\ndesigned to generate the sequence by incorporating a specialized tree\npositional encoding scheme. Finally, we extensively evaluate our algorithm on\nfour general and two molecular graph datasets to confirm its superiority for\ngraph generation.\n","authors":["Yunhui Jang","Dongwoo Kim","Sungsoo Ahn"],"pdf_url":"https://arxiv.org/pdf/2305.19125v3.pdf","comment":"24 pages"},{"id":"http://arxiv.org/abs/2312.01606v1","updated":"2023-12-04T03:38:17Z","published":"2023-12-04T03:38:17Z","title":"Deep Learning-Driven Enhancement of Welding Quality Control: Predicting\n Welding Depth and Pore Volume in Hairpin Welding","summary":" To advance quality assurance in the welding process, this study presents a\nrobust deep learning model that enables the prediction of two critical weld\nKey Performance Characteristics (KPCs): welding depth and average pore volume.\nIn the proposed approach, a comprehensive range of laser welding Key Input\nCharacteristics (KICs) is utilized, including welding beam geometries, welding\nfeed rates, path repetitions for weld beam geometries, and bright light weld\nratios for all paths, all of which were obtained from hairpin welding\nexperiments. Two deep learning networks are employed with multiple hidden dense\nlayers and linear activation functions to showcase the capabilities of deep\nneural networks in capturing the intricate nonlinear connections inherent\nwithin welding KPCs and KICs. Applying deep learning networks to the small\nnumerical experimental hairpin welding dataset has shown promising results,\nachieving Mean Absolute Error (MAE) values as low as 0.1079 for predicting\nwelding depth and 0.0641 for average pore volume. Additionally, the validity\nverification demonstrates the reliability of the proposed method. 
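To make the KIC-to-KPC regression setup above concrete, a generic fully connected regressor with two outputs (welding depth and average pore volume) might look like the sketch below, assuming PyTorch; the feature count, layer sizes, and activations are placeholders rather than the authors' architecture.

```python
# Generic two-output MLP regressor trained with an MAE objective, matching the
# metric reported above; data here is random and purely illustrative.
import torch
import torch.nn as nn

n_features = 8                        # e.g., beam geometry, feed rate, repetitions, ...
model = nn.Sequential(
    nn.Linear(n_features, 64), nn.ReLU(),
    nn.Linear(64, 64), nn.ReLU(),
    nn.Linear(64, 2),                 # [welding_depth, avg_pore_volume]
)
loss_fn = nn.L1Loss()                 # mean absolute error
opt = torch.optim.Adam(model.parameters(), lr=1e-3)

x = torch.randn(32, n_features)       # a dummy batch of welding KICs
y = torch.randn(32, 2)                # the corresponding KPC targets
for _ in range(10):                   # tiny illustrative training loop
    opt.zero_grad()
    loss = loss_fn(model(x), y)
    loss.backward()
    opt.step()
print(float(loss))
```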
This, in\nturn, promises significant advantages in controlling welding outcomes, moving\nbeyond the current trend of relying merely on monitoring for defect\nclassification.\n","authors":["Amena Darwish","Stefan Ericson","Rohollah Ghasemi","Tobias Andersson","Dan Lönn","Andreas Andersson Lassila","Kent Salomonsson"],"pdf_url":"https://arxiv.org/pdf/2312.01606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01605v1","updated":"2023-12-04T03:38:04Z","published":"2023-12-04T03:38:04Z","title":"TextAug: Test time Text Augmentation for Multimodal Person\n Re-identification","summary":" Multimodal Person Reidentification is gaining popularity in the research\ncommunity due to its effectiveness compared to counter-part unimodal\nframeworks. However, the bottleneck for multimodal deep learning is the need\nfor a large volume of multimodal training examples. Data augmentation\ntechniques such as cropping, flipping, rotation, etc. are often employed in the\nimage domain to improve the generalization of deep learning models. Augmenting\nin other modalities than images, such as text, is challenging and requires\nsignificant computational resources and external data sources. In this study,\nwe investigate the effectiveness of two computer vision data augmentation\ntechniques: cutout and cutmix, for text augmentation in multi-modal person\nre-identification. Our approach merges these two augmentation strategies into\none strategy called CutMixOut which involves randomly removing words or\nsub-phrases from a sentence (Cutout) and blending parts of two or more\nsentences to create diverse examples (CutMix) with a certain probability\nassigned to each operation. This augmentation was implemented at inference time\nwithout any prior training. Our results demonstrate that the proposed technique\nis simple and effective in improving the performance on multiple multimodal\nperson re-identification benchmarks.\n","authors":["Mulham Fawakherji","Eduard Vazquez","Pasquale Giampa","Binod Bhattarai"],"pdf_url":"https://arxiv.org/pdf/2312.01605v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.00656v2","updated":"2023-12-04T03:26:35Z","published":"2023-12-01T15:30:54Z","title":"Simple Transferability Estimation for Regression Tasks","summary":" We consider transferability estimation, the problem of estimating how well\ndeep learning models transfer from a source to a target task. We focus on\nregression tasks, which received little previous attention, and propose two\nsimple and computationally efficient approaches that estimate transferability\nbased on the negative regularized mean squared error of a linear regression\nmodel. We prove novel theoretical results connecting our approaches to the\nactual transferability of the optimal target models obtained from the transfer\nlearning process. Despite their simplicity, our approaches significantly\noutperform existing state-of-the-art regression transferability estimators in\nboth accuracy and efficiency. On two large-scale keypoint regression\nbenchmarks, our approaches yield 12% to 36% better results on average while\nbeing at least 27% faster than previous state-of-the-art methods.\n","authors":["Cuong N. Nguyen","Phong Tran","Lam Si Tung Ho","Vu Dinh","Anh T. Tran","Tal Hassner","Cuong V. 
Nguyen"],"pdf_url":"https://arxiv.org/pdf/2312.00656v2.pdf","comment":"Paper published at The 39th Conference on Uncertainty in Artificial\n Intelligence (UAI) 2023"},{"id":"http://arxiv.org/abs/2309.12742v2","updated":"2023-12-04T03:15:24Z","published":"2023-09-22T09:43:32Z","title":"Make the U in UDA Matter: Invariant Consistency Learning for\n Unsupervised Domain Adaptation","summary":" Domain Adaptation (DA) is always challenged by the spurious correlation\nbetween domain-invariant features (e.g., class identity) and domain-specific\nfeatures (e.g., environment) that does not generalize to the target domain.\nUnfortunately, even enriched with additional unsupervised target domains,\nexisting Unsupervised DA (UDA) methods still suffer from it. This is because\nthe source domain supervision only considers the target domain samples as\nauxiliary data (e.g., by pseudo-labeling), yet the inherent distribution in the\ntarget domain -- where the valuable de-correlation clues hide -- is\ndisregarded. We propose to make the U in UDA matter by giving equal status to\nthe two domains. Specifically, we learn an invariant classifier whose\nprediction is simultaneously consistent with the labels in the source domain\nand clusters in the target domain, hence the spurious correlation inconsistent\nin the target domain is removed. We dub our approach \"Invariant CONsistency\nlearning\" (ICON). Extensive experiments show that ICON achieves the\nstate-of-the-art performance on the classic UDA benchmarks: Office-Home and\nVisDA-2017, and outperforms all the conventional methods on the challenging\nWILDS 2.0 benchmark. Codes are in https://github.com/yue-zhongqi/ICON.\n","authors":["Zhongqi Yue","Hanwang Zhang","Qianru Sun"],"pdf_url":"https://arxiv.org/pdf/2309.12742v2.pdf","comment":"Accepted by NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.01588v1","updated":"2023-12-04T03:09:31Z","published":"2023-12-04T03:09:31Z","title":"ActiveClean: Generating Line-Level Vulnerability Data via Active\n Learning","summary":" Deep learning vulnerability detection tools are increasing in popularity and\nhave been shown to be effective. These tools rely on a large volume of\nhigh-quality training data, which is very hard to get. Most of the currently\navailable datasets provide function-level labels, reporting whether a function\nis vulnerable or not vulnerable. However, for vulnerability detection to be\nuseful, we need to also know the lines that are relevant to the vulnerability.\nThis paper makes efforts towards developing systematic tools and proposes\nActiveClean to generate a large volume of line-level vulnerability data from\ncommits. That is, in addition to function-level labels, it also reports which\nlines in the function are likely responsible for vulnerability detection. In\nthe past, static analysis has been applied to clean commits to generate\nline-level data. Our approach, based on active learning, is easy to use and\nscalable, and provides a complement to static analysis. We design semantic and\nsyntactic properties from commit lines and use them to train the model. We\nevaluated our approach on both Java and C datasets, processing more than 4.3K\ncommits and 119K commit lines. ActiveClean achieved an F1 score between 70 and\n74. Further, we also show that active learning is effective by using just 400\ntraining examples to reach an F1 score of 70.23. 
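The active-learning idea behind ActiveClean can be pictured with a generic pool-based uncertainty-sampling loop; the features, model, and labeling budget below are placeholders (scikit-learn assumed), not the paper's setup.

```python
# Pool-based active learning: repeatedly fit a classifier on the labeled set,
# then query labels for the pool items the model is least certain about.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
X = rng.normal(size=(5000, 16))                 # features of commit lines
y = (X[:, 0] + 0.5 * X[:, 1] > 0).astype(int)   # stand-in "vulnerable line" labels

labeled = list(rng.choice(len(X), size=50, replace=False))
pool = [i for i in range(len(X)) if i not in set(labeled)]

for _ in range(10):                             # ten query rounds of 40 labels each
    clf = LogisticRegression(max_iter=1000).fit(X[labeled], y[labeled])
    proba = clf.predict_proba(X[pool])[:, 1]
    uncertainty = -np.abs(proba - 0.5)          # closest to 0.5 = most uncertain
    query = [pool[i] for i in np.argsort(uncertainty)[-40:]]
    labeled += query                            # "annotate" the queried lines
    pool = [i for i in pool if i not in set(query)]

print("labeled examples:", len(labeled))
```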
Using ActiveClean, we\ngenerate the line-level labels for the entire FFMpeg project in the Devign\ndataset, including 5K functions, and also detected incorrect function-level\nlabels. We demonstrated that using our cleaned data, LineVul, a SOTA line-level\nvulnerability detection tool, detected 70 more vulnerable lines and 18 more\nvulnerable functions, and improved Top 10 accuracy from 66% to 73%.\n","authors":["Ashwin Kallingal Joshy","Mirza Sanjida Alam","Shaila Sharmin","Qi Li","Wei Le"],"pdf_url":"https://arxiv.org/pdf/2312.01588v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01587v1","updated":"2023-12-04T03:04:09Z","published":"2023-12-04T03:04:09Z","title":"Scalable and Independent Learning of Nash Equilibrium Policies in\n $n$-Player Stochastic Games with Unknown Independent Chains","summary":" We study a subclass of $n$-player stochastic games, namely, stochastic games\nwith independent chains and unknown transition matrices. In this class of\ngames, players control their own internal Markov chains whose transitions do\nnot depend on the states/actions of other players. However, players' decisions\nare coupled through their payoff functions. We assume players can receive only\nrealizations of their payoffs, and that the players can not observe the states\nand actions of other players, nor do they know the transition probability\nmatrices of their own Markov chain. Relying on a compact dual formulation of\nthe game based on occupancy measures and the technique of confidence set to\nmaintain high-probability estimates of the unknown transition matrices, we\npropose a fully decentralized mirror descent algorithm to learn an\n$\\epsilon$-NE for this class of games. The proposed algorithm has the desired\nproperties of independence, scalability, and convergence. Specifically, under\nno assumptions on the reward functions, we show the proposed algorithm\nconverges in polynomial time in a weaker distance (namely, the averaged\nNikaido-Isoda gap) to the set of $\\epsilon$-NE policies with arbitrarily high\nprobability. Moreover, assuming the existence of a variationally stable Nash\nequilibrium policy, we show that the proposed algorithm converges\nasymptotically to the stable $\\epsilon$-NE policy with arbitrarily high\nprobability. In addition to Markov potential games and linear-quadratic\nstochastic games, this work provides another subclass of $n$-player stochastic\ngames that, under some mild assumptions, admit polynomial-time learning\nalgorithms for finding their stationary $\\epsilon$-NE policies.\n","authors":["Tiancheng Qin","S. Rasoul Etesami"],"pdf_url":"https://arxiv.org/pdf/2312.01587v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.00690v3","updated":"2023-12-04T02:58:02Z","published":"2023-11-01T17:45:52Z","title":"What User Behaviors Make the Differences During the Process of Visual\n Analytics?","summary":" The understanding of visual analytics process can benefit visualization\nresearchers from multiple aspects, including improving visual designs and\ndeveloping advanced interaction functions. However, the log files of user\nbehaviors are still hard to analyze due to the complexity of sensemaking and\nour lack of knowledge on the related user behaviors. This work presents a study\non a comprehensive data collection of user behaviors, and our analysis approach\nwith time-series classification methods. 
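As a rough picture of classifying visual-analytics sessions from behavior logs, one could summarize each session's time series into simple features and fit an off-the-shelf classifier; everything below (features, labels, sizes) is invented for illustration and is not the study's pipeline.

```python
# Summarize each interaction-log session into a few time-series statistics and
# classify which analysis task it came from (toy data, chance-level labels).
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(1)
n_sessions, length = 200, 300
sessions = rng.normal(size=(n_sessions, length))     # e.g., head movement per frame
tasks = rng.integers(0, 3, size=n_sessions)          # geo-spatial / time-series / multi-attribute

def summarize(ts):
    return [ts.mean(), ts.std(), np.abs(np.diff(ts)).mean(), ts.max() - ts.min()]

features = np.array([summarize(s) for s in sessions])
clf = RandomForestClassifier(n_estimators=200, random_state=0)
print(cross_val_score(clf, features, tasks, cv=5).mean())
```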
We have chosen a classical\nvisualization application, Covid-19 data analysis, with common analysis tasks\ncovering geo-spatial, time-series, and multi-attribute data. Our user study collects\nuser behaviors on a diverse set of visualization tasks with two comparable\nsystems, desktop and immersive visualizations. We summarize the classification\nresults with three time-series machine learning algorithms at two scales, and\nexplore the influences of behavior features. Our results reveal that user\nbehaviors can be distinguished during the process of visual analytics and there\nis a potentially strong association between the physical behaviors of users and\nthe visualization tasks they perform. We also demonstrate the usage of our\nmodels by interpreting open sessions of visual analytics, which provides an\nautomatic way to study sensemaking without tedious manual annotations.\n","authors":["Zekun Wu","Shahin Doroudian","Aidong Lu"],"pdf_url":"https://arxiv.org/pdf/2311.00690v3.pdf","comment":"This version corrects the issues of previous versions"},{"id":"http://arxiv.org/abs/2312.01585v1","updated":"2023-12-04T02:48:40Z","published":"2023-12-04T02:48:40Z","title":"OCGEC: One-class Graph Embedding Classification for DNN Backdoor\n Detection","summary":" Deep neural networks (DNNs) have been found vulnerable to backdoor attacks,\nraising security concerns about their deployment in mission-critical\napplications. There are various approaches to detect backdoor attacks; however,\nthey all make certain assumptions about the target attack to be detected and\nrequire equal and huge numbers of clean and backdoor samples for training,\nwhich renders these detection methods quite limited in real-world\ncircumstances.\n This study proposes a novel one-class classification framework called\nOne-class Graph Embedding Classification (OCGEC) that uses GNNs for model-level\nbackdoor detection with only a small amount of clean data. First, we train\nthousands of tiny models as raw datasets from a small number of clean datasets.\nFollowing that, we design an ingenious model-to-graph method for converting the\nmodel's structural details and weight features into graph data. We then\npre-train a generative self-supervised graph autoencoder (GAE) to better learn\nthe features of benign models in order to detect backdoor models without\nknowing the attack strategy. After that, we dynamically combine the GAE and\none-class classifier optimization goals to form classification boundaries that\ndistinguish backdoor models from benign models.\n Our OCGEC combines the powerful representation capabilities of graph neural\nnetworks with the utility of one-class classification techniques in the field\nof anomaly detection. In comparison to other baselines, it achieves AUC scores\nof more than 98% on a number of tasks, which far exceeds existing methods for\ndetection even when they rely on a huge number of positive and negative\nsamples. Our pioneering application of graphic scenarios for generic backdoor\ndetection can provide new insights that can be used to improve other backdoor\ndefense tasks. 
Code is available at https://github.com/jhy549/OCGEC.\n","authors":["Haoyu Jiang","Haiyang Yu","Nan Li","Ping Yi"],"pdf_url":"https://arxiv.org/pdf/2312.01585v1.pdf","comment":"13 pages, 9 figures"},{"id":"http://arxiv.org/abs/2312.01581v1","updated":"2023-12-04T02:33:53Z","published":"2023-12-04T02:33:53Z","title":"Signed Binarization: Unlocking Efficiency Through Repetition-Sparsity\n Trade-Off","summary":" Efficient inference of Deep Neural Networks (DNNs) on resource-constrained\nedge devices is essential. Quantization and sparsity are key algorithmic\ntechniques that translate to repetition and sparsity within tensors at the\nhardware-software interface. This paper introduces the concept of\nrepetition-sparsity trade-off that helps explain computational efficiency\nduring inference. We propose Signed Binarization, a unified co-design framework\nthat synergistically integrates hardware-software systems, quantization\nfunctions, and representation learning techniques to address this trade-off.\nOur results demonstrate that Signed Binarization is more accurate than\nbinarization with the same number of non-zero weights. Detailed analysis\nindicates that signed binarization generates a smaller distribution of\neffectual (non-zero) parameters nested within a larger distribution of total\nparameters, both of the same type, for a DNN block. Finally, our approach\nachieves a 26% speedup on real hardware, doubles energy efficiency, and reduces\ndensity by 2.8x compared to binary methods for ResNet 18, presenting an\nalternative solution for deploying efficient models in resource-limited\nenvironments.\n","authors":["Sachit Kuhar","Yash Jain","Alexey Tumanov"],"pdf_url":"https://arxiv.org/pdf/2312.01581v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.11032v2","updated":"2023-12-04T02:24:19Z","published":"2023-05-18T15:19:26Z","title":"Optimistic Natural Policy Gradient: a Simple Efficient Policy\n Optimization Framework for Online RL","summary":" While policy optimization algorithms have played an important role in recent\nempirical success of Reinforcement Learning (RL), the existing theoretical\nunderstanding of policy optimization remains rather limited -- they are either\nrestricted to tabular MDPs or suffer from highly suboptimal sample complexity,\nespecially in online RL where exploration is necessary. This paper proposes a\nsimple efficient policy optimization framework -- Optimistic NPG for online RL.\nOptimistic NPG can be viewed as a simple combination of the classic natural\npolicy gradient (NPG) algorithm [Kakade, 2001] with optimistic policy\nevaluation subroutines to encourage exploration. For $d$-dimensional linear\nMDPs, Optimistic NPG is computationally efficient, and learns an\n$\\varepsilon$-optimal policy within $\\tilde{O}(d^2/\\varepsilon^3)$ samples,\nwhich is the first computationally efficient algorithm whose sample complexity\nhas the optimal dimension dependence $\\tilde{\\Theta}(d^2)$. It also improves\nover state-of-the-art results of policy optimization algorithms [Zanette et\nal., 2021] by a factor of $d$. 
In the realm of general function approximation,\nwhich subsumes linear MDPs, Optimistic NPG, to the best of our knowledge, stands as\nthe first policy optimization algorithm that achieves polynomial sample\ncomplexity for learning near-optimal policies.\n","authors":["Qinghua Liu","Gellért Weisz","András György","Chi Jin","Csaba Szepesvári"],"pdf_url":"https://arxiv.org/pdf/2305.11032v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01577v1","updated":"2023-12-04T02:23:32Z","published":"2023-12-04T02:23:32Z","title":"RJHMC-Tree for Exploration of the Bayesian Decision Tree Posterior","summary":" Decision trees have found widespread application within the machine learning\ncommunity due to their flexibility and interpretability. This paper is directed\ntowards learning decision trees from data using a Bayesian approach, which is\nchallenging due to the potentially enormous parameter space required to span\nall tree models. Several approaches have been proposed to combat this\nchallenge, with one of the more successful being Markov chain Monte Carlo\n(MCMC) methods. The efficacy and efficiency of MCMC methods fundamentally rely\non the quality of the so-called proposals, which is the focus of this paper. In\nparticular, this paper investigates using a Hamiltonian Monte Carlo (HMC)\napproach to explore the posterior of Bayesian decision trees more efficiently\nby exploiting the geometry of the likelihood within a global update scheme. Two\nimplementations of the novel algorithm are developed and compared to existing\nmethods by testing against standard datasets in the machine learning and\nBayesian decision tree literature. HMC-based methods are shown to perform\nfavourably with respect to predictive test accuracy, acceptance rate, and tree\ncomplexity.\n","authors":["Jodie A. Cochrane","Adrian G. Wills","Sarah J. Johnson"],"pdf_url":"https://arxiv.org/pdf/2312.01577v1.pdf","comment":"43 pages, 7 figures"},{"id":"http://arxiv.org/abs/2206.05669v3","updated":"2023-12-04T02:05:25Z","published":"2022-06-12T05:43:16Z","title":"Universality and approximation bounds for echo state networks with\n random weights","summary":" We study the uniform approximation of echo state networks with randomly\ngenerated internal weights. These models, in which only the readout weights are\noptimized during training, have achieved empirical success in learning dynamical\nsystems. Recent results showed that echo state networks with ReLU activation\nare universal. In this paper, we give an alternative construction and prove\nthat the universality holds for general activation functions. Specifically, our\nmain result shows that, under certain conditions on the activation function,\nthere exists a sampling procedure for the internal weights so that the echo\nstate network can approximate any continuous causal time-invariant operator\nwith high probability. In particular, for ReLU activation, we give an explicit\nconstruction for these sampling procedures. 
We also quantify the approximation\nerror of the constructed ReLU echo state networks for sufficiently regular\noperators.\n","authors":["Zhen Li","Yunfei Yang"],"pdf_url":"https://arxiv.org/pdf/2206.05669v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00761v2","updated":"2023-12-04T01:57:38Z","published":"2023-12-01T18:29:08Z","title":"Deep Unlearning: Fast and Efficient Training-free Approach to Controlled\n Forgetting","summary":" Machine unlearning has emerged as a prominent and challenging area of\ninterest, driven in large part by the rising regulatory demands for industries\nto delete user data upon request and the heightened awareness of privacy.\nExisting approaches either retrain models from scratch or use several\nfinetuning steps for every deletion request, often constrained by computational\nresource limitations and restricted access to the original training data. In\nthis work, we introduce a novel class unlearning algorithm designed to\nstrategically eliminate an entire class or a group of classes from the learned\nmodel. To that end, our algorithm first estimates the Retain Space and the\nForget Space, representing the feature or activation spaces for samples from\nclasses to be retained and unlearned, respectively. To obtain these spaces, we\npropose a novel singular value decomposition-based technique that requires\nlayer wise collection of network activations from a few forward passes through\nthe network. We then compute the shared information between these spaces and\nremove it from the forget space to isolate class-discriminatory feature space\nfor unlearning. Finally, we project the model weights in the orthogonal\ndirection of the class-discriminatory space to obtain the unlearned model. We\ndemonstrate our algorithm's efficacy on ImageNet using a Vision Transformer\nwith only $\\sim$1.5% drop in retain accuracy compared to the original model\nwhile maintaining under 1% accuracy on the unlearned class samples. Further,\nour algorithm consistently performs well when subject to Membership Inference\nAttacks showing 7.8% improvement on average across a variety of image\nclassification datasets and network architectures, as compared to other\nbaselines while being $\\sim$6x more computationally efficient.\n","authors":["Sangamesh Kodge","Gobinda Saha","Kaushik Roy"],"pdf_url":"https://arxiv.org/pdf/2312.00761v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01567v1","updated":"2023-12-04T01:47:05Z","published":"2023-12-04T01:47:05Z","title":"Toward Automated Quantum Variational Machine Learning","summary":" In this work, we address the problem of automating quantum variational\nmachine learning. We develop a multi-locality parallelizable search algorithm,\ncalled MUSE, to find the initial points and the sets of parameters that achieve\nthe best performance for quantum variational circuit learning. Simulations with\nfive real-world classification datasets indicate that on average, MUSE improves\nthe detection accuracy of quantum variational classifiers 2.3 times with\nrespect to the observed lowest scores. Moreover, when applied to two real-world\nregression datasets, MUSE improves the quality of the predictions from negative\ncoefficients of determination to positive ones. 
Furthermore, the classification\nand regression scores of the quantum variational models trained with MUSE are\non par with the classical counterparts.\n","authors":["Omer Subasi"],"pdf_url":"https://arxiv.org/pdf/2312.01567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01564v1","updated":"2023-12-04T01:42:09Z","published":"2023-12-04T01:42:09Z","title":"APoLLo: Unified Adapter and Prompt Learning for Vision Language Models","summary":" The choice of input text prompt plays a critical role in the performance of\nVision-Language Pretrained (VLP) models such as CLIP. We present APoLLo, a\nunified multi-modal approach that combines Adapter and Prompt learning for\nVision-Language models. Our method is designed to substantially improve the\ngeneralization capabilities of VLP models when they are fine-tuned in a\nfew-shot setting. We introduce trainable cross-attention-based adapter layers\nin conjunction with vision and language encoders to strengthen the alignment\nbetween the two modalities. We enforce consistency between the respective\nencoder branches (receiving augmented inputs) to prevent overfitting in\ndownstream tasks. Our method is evaluated on three representative tasks:\ngeneralization to novel classes, cross-dataset evaluation, and unseen domain\nshifts. In practice, APoLLo achieves a relative gain up to 6.03% over MaPLe\n(SOTA) on novel classes for 10 diverse image recognition datasets.\n","authors":["Sanjoy Chowdhury","Sayan Nag","Dinesh Manocha"],"pdf_url":"https://arxiv.org/pdf/2312.01564v1.pdf","comment":"Accepted at EMNLP 2023 (Main track)"},{"id":"http://arxiv.org/abs/2306.00392v2","updated":"2023-12-04T01:40:15Z","published":"2023-06-01T06:53:14Z","title":"Coneheads: Hierarchy Aware Attention","summary":" Attention networks such as transformers have achieved state-of-the-art\nperformance in many domains. These networks rely heavily on the dot product\nattention operator, which computes the similarity between two points by taking\ntheir inner product. However, the inner product does not explicitly model the\ncomplex structural properties of real world datasets, such as hierarchies\nbetween data points. To remedy this, we introduce cone attention, a drop-in\nreplacement for dot product attention based on hyperbolic entailment cones.\nCone attention associates two points by the depth of their lowest common\nancestor in a hierarchy defined by hyperbolic cones, which intuitively measures\nthe divergence of two points and gives a hierarchy aware similarity score. We\ntest cone attention on a wide variety of models and tasks and show that it\nimproves task-level performance over dot product attention and other baselines,\nand is able to match dot-product attention with significantly fewer parameters.\nOur results suggest that cone attention is an effective way to capture\nhierarchical relationships when calculating attention.\n","authors":["Albert Tseng","Tao Yu","Toni J. B. Liu","Christopher De Sa"],"pdf_url":"https://arxiv.org/pdf/2306.00392v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2311.09018v2","updated":"2023-12-04T01:39:22Z","published":"2023-11-15T15:02:23Z","title":"On the Foundation of Distributionally Robust Reinforcement Learning","summary":" Motivated by the need for a robust policy in the face of environment shifts\nbetween training and the deployment, we contribute to the theoretical\nfoundation of distributionally robust reinforcement learning (DRRL). 
This is\naccomplished through a comprehensive modeling framework centered around\ndistributionally robust Markov decision processes (DRMDPs). This framework\nobliges the decision maker to choose an optimal policy under the worst-case\ndistributional shift orchestrated by an adversary. By unifying and extending\nexisting formulations, we rigorously construct DRMDPs that embrace various\nmodeling attributes for both the decision maker and the adversary. These\nattributes include the granularity of adaptability, covering history-dependent,\nMarkov, and Markov time-homogeneous decision-maker and adversary dynamics.\nAdditionally, we delve into the flexibility of shifts induced by the adversary,\nexamining SA- and S-rectangularity. Within this DRMDP framework, we investigate\nconditions for the existence or absence of the dynamic programming principle\n(DPP). From an algorithmic standpoint, the existence of the DPP has significant\nimplications, as the vast majority of existing data- and computationally\nefficient RL algorithms rely on the DPP. To study its existence, we\ncomprehensively examine combinations of controller and adversary attributes,\nproviding streamlined proofs grounded in a unified methodology. We also offer\ncounterexamples for settings in which a DPP with full generality is absent.\n","authors":["Shengbo Wang","Nian Si","Jose Blanchet","Zhengyuan Zhou"],"pdf_url":"https://arxiv.org/pdf/2311.09018v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.09718v2","updated":"2023-12-04T01:15:56Z","published":"2023-10-15T03:08:25Z","title":"Efficient and Effective Deep Multi-view Subspace Clustering","summary":" Recent multi-view subspace clustering achieves impressive results utilizing\ndeep networks, where the self-expressive correlation is typically modeled by a\nfully connected (FC) layer. However, these methods still suffer from two limitations. i)\nThe parameter scale of the FC layer is quadratic in the number of samples, resulting\nin high time and memory costs that significantly degrade their feasibility in\nlarge-scale datasets. ii) Extracting a unified representation that simultaneously\nsatisfies minimal sufficiency and discriminability remains under-explored. To this\nend, we propose a novel deep framework, termed\nEfficient and Effective deep Multi-View Subspace Clustering (E$^2$MVSC).\nInstead of a parameterized FC layer, we design a Relation-Metric Net that\ndecouples the network parameter scale from the number of samples for greater\ncomputational efficiency. Most importantly, the proposed method devises a multi-type\nauto-encoder to explicitly decouple consistent, complementary, and superfluous\ninformation from every view, which is supervised by a soft clustering\nassignment similarity constraint. Following information bottleneck theory and\nthe maximal coding rate reduction principle, a sufficient yet minimal unified\nrepresentation can be obtained while pursuing intra-cluster aggregation\nand inter-cluster separability within it. 
Extensive experiments show that\nE$^2$MVSC yields comparable results to existing methods and achieves\nstate-of-the-art performance in various types of multi-view datasets.\n","authors":["Yuxiu Lin","Hui Liu","Ren Wang","Qiang Guo","Caiming Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.09718v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01555v1","updated":"2023-12-04T00:54:04Z","published":"2023-12-04T00:54:04Z","title":"Explainable AI is Responsible AI: How Explainability Creates Trustworthy\n and Socially Responsible Artificial Intelligence","summary":" Artificial intelligence (AI) has been clearly established as a technology\nwith the potential to revolutionize fields from healthcare to finance - if\ndeveloped and deployed responsibly. This is the topic of responsible AI, which\nemphasizes the need to develop trustworthy AI systems that minimize bias,\nprotect privacy, support security, and enhance transparency and accountability.\nExplainable AI (XAI) has been broadly considered as a building block for\nresponsible AI (RAI), with most of the literature considering it as a solution\nfor improved transparency. This work proposes that XAI and responsible AI are\nsignificantly more deeply entwined. In this work, we explore state-of-the-art\nliterature on RAI and XAI technologies. Based on our findings, we demonstrate\nthat XAI can be utilized to ensure fairness, robustness, privacy, security, and\ntransparency in a wide range of contexts. Our findings lead us to conclude that\nXAI is an essential foundation for every pillar of RAI.\n","authors":["Stephanie Baker","Wei Xiang"],"pdf_url":"https://arxiv.org/pdf/2312.01555v1.pdf","comment":"35 pages, 7 figures (figures 3-6 include subfigures)"},{"id":"http://arxiv.org/abs/2312.01547v1","updated":"2023-12-04T00:31:16Z","published":"2023-12-04T00:31:16Z","title":"Near-Optimal Algorithms for Gaussians with Huber Contamination: Mean\n Estimation and Linear Regression","summary":" We study the fundamental problems of Gaussian mean estimation and linear\nregression with Gaussian covariates in the presence of Huber contamination. Our\nmain contribution is the design of the first sample near-optimal and almost\nlinear-time algorithms with optimal error guarantees for both of these\nproblems. Specifically, for Gaussian robust mean estimation on $\\mathbb{R}^d$\nwith contamination parameter $\\epsilon \\in (0, \\epsilon_0)$ for a small\nabsolute constant $\\epsilon_0$, we give an algorithm with sample complexity $n\n= \\tilde{O}(d/\\epsilon^2)$ and almost linear runtime that approximates the\ntarget mean within $\\ell_2$-error $O(\\epsilon)$. This improves on prior work\nthat achieved this error guarantee with polynomially suboptimal sample and time\ncomplexity. For robust linear regression, we give the first algorithm with\nsample complexity $n = \\tilde{O}(d/\\epsilon^2)$ and almost linear runtime that\napproximates the target regressor within $\\ell_2$-error $O(\\epsilon)$. This is\nthe first polynomial sample and time algorithm achieving the optimal error\nguarantee, answering an open question in the literature. At the technical\nlevel, we develop a methodology that yields almost-linear time algorithms for\nmulti-directional filtering that may be of broader interest.\n","authors":["Ilias Diakonikolas","Daniel M. 
Kane","Ankit Pensia","Thanasis Pittas"],"pdf_url":"https://arxiv.org/pdf/2312.01547v1.pdf","comment":"To appear in NeurIPS 2023"},{"id":"http://arxiv.org/abs/2309.05853v2","updated":"2023-12-04T00:26:41Z","published":"2023-09-11T22:28:36Z","title":"ChemSpaceAL: An Efficient Active Learning Methodology Applied to\n Protein-Specific Molecular Generation","summary":" The incredible capabilities of generative artificial intelligence models have\ninevitably led to their application in the domain of drug discovery. Within\nthis domain, the vastness of chemical space motivates the development of more\nefficient methods for identifying regions with molecules that exhibit desired\ncharacteristics. In this work, we present a computationally efficient active\nlearning methodology that requires evaluation of only a subset of the generated\ndata in the constructed sample space to successfully align a generative model\nwith respect to a specified objective. We demonstrate the applicability of this\nmethodology to targeted molecular generation by fine-tuning a GPT-based\nmolecular generator toward a protein with FDA-approved small-molecule\ninhibitors, c-Abl kinase. Remarkably, the model learns to generate molecules\nsimilar to the inhibitors without prior knowledge of their existence, and even\nreproduces two of them exactly. We also show that the methodology is effective\nfor a protein without any commercially available small-molecule inhibitors, the\nHNH domain of the CRISPR-associated protein 9 (Cas9) enzyme. We believe that\nthe inherent generality of this method ensures that it will remain applicable\nas the exciting field of in silico molecular generation evolves. To facilitate\nimplementation and reproducibility, we have made all of our software available\nthrough the open-source ChemSpaceAL Python package.\n","authors":["Gregory W. Kyro","Anton Morgunov","Rafael I. Brent","Victor S. Batista"],"pdf_url":"https://arxiv.org/pdf/2309.05853v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01544v1","updated":"2023-12-04T00:11:27Z","published":"2023-12-04T00:11:27Z","title":"KEEC: Embed to Control on An Equivariant Geometry","summary":" This paper investigates how representation learning can enable optimal\ncontrol in unknown and complex dynamics, such as chaotic and non-linear\nsystems, without relying on prior domain knowledge of the dynamics. The core\nidea is to establish an equivariant geometry that is diffeomorphic to the\nmanifold defined by a dynamical system and to perform optimal control within\nthis corresponding geometry, which is a non-trivial task. To address this\nchallenge, Koopman Embed to Equivariant Control (KEEC) is introduced for model\nlearning and control. Inspired by Lie theory, KEEC begins by learning a\nnon-linear dynamical system defined on a manifold and embedding trajectories\ninto a Lie group. Subsequently, KEEC formulates an equivariant value function\nequation in reinforcement learning on the equivariant geometry, ensuring an\ninvariant effect as the value function on the original manifold. By deriving\nanalytical-form optimal actions on the equivariant value function, KEEC\ntheoretically achieves quadratic convergence for the optimal equivariant value\nfunction by leveraging the differential information on the equivariant\ngeometry. The effectiveness of KEEC is demonstrated in challenging dynamical\nsystems, including chaotic ones like Lorenz-63. 
Notably, our findings indicate\nthat isometric and isomorphic loss functions, ensuring the compactness and\nsmoothness of geometry, outperform loss functions without these properties.\n","authors":["Xiaoyuan Cheng","Yiming Yang","Wei Jiang","Yukun Hu"],"pdf_url":"https://arxiv.org/pdf/2312.01544v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02396v1","updated":"2023-12-04T23:26:12Z","published":"2023-12-04T23:26:12Z","title":"Unsupervised Change Detection for Space Habitats Using 3D Point Clouds","summary":" This work presents an algorithm for scene change detection from point clouds\nto enable autonomous robotic caretaking in future space habitats. Autonomous\nrobotic systems will help maintain future deep-space habitats, such as the\nGateway space station, which will be uncrewed for extended periods. Existing\nscene analysis software used on the International Space Station (ISS) relies on\nmanually-labeled images for detecting changes. In contrast, the algorithm\npresented in this work uses raw, unlabeled point clouds as inputs. The\nalgorithm first applies modified Expectation-Maximization Gaussian Mixture\nModel (GMM) clustering to two input point clouds. It then performs change\ndetection by comparing the GMMs using the Earth Mover's Distance. The algorithm\nis validated quantitatively and qualitatively using a test dataset collected by\nan Astrobee robot in the NASA Ames Granite Lab comprising single frame depth\nimages taken directly by Astrobee and full-scene reconstructed maps built with\nRGB-D and pose data from Astrobee. The runtimes of the approach are also\nanalyzed in depth. The source code is publicly released to promote further\ndevelopment.\n","authors":["Jamie Santos","Holly Dinkel","Julia Di","Paulo V. K. Borges","Marina Moreira","Oleg Alexandrov","Brian Coltin","Trey Smith"],"pdf_url":"https://arxiv.org/pdf/2312.02396v1.pdf","comment":"15 pages, 7 figures, Manuscript will be presented at the AIAA SciTech\n Forum in Orlando, FL, USA, 8 - 12 January 2024"},{"id":"http://arxiv.org/abs/2306.04037v2","updated":"2023-12-04T23:18:54Z","published":"2023-06-06T22:04:45Z","title":"Quantitative Analysis of Primary Attribution Explainable Artificial\n Intelligence Methods for Remote Sensing Image Classification","summary":" We present a comprehensive analysis of quantitatively evaluating explainable\nartificial intelligence (XAI) techniques for remote sensing image\nclassification. Our approach leverages state-of-the-art machine learning\napproaches to perform remote sensing image classification across multiple\nmodalities. We investigate the results of the models qualitatively through XAI\nmethods. Additionally, we compare the XAI methods quantitatively through\nvarious categories of desired properties. Through our analysis, we offer\ninsights and recommendations for selecting the most appropriate XAI method(s)\nto gain a deeper understanding of the models' decision-making processes. 
The\ncode for this work is publicly available.\n","authors":["Akshatha Mohan","Joshua Peeples"],"pdf_url":"https://arxiv.org/pdf/2306.04037v2.pdf","comment":"4 pages, 3 figures, Accepted to 2023 IGARSS Community-Contributed\n Sessions - Opening the Black Box: Explainable AI/ML in Remote Sensing\n Analysis"},{"id":"http://arxiv.org/abs/2312.02387v1","updated":"2023-12-04T23:03:09Z","published":"2023-12-04T23:03:09Z","title":"Dissecting Medical Referral Mechanisms in Health Services: Role of\n Physician Professional Networks","summary":" Medical referrals between primary care physicians (PC) and specialist care\n(SC) physicians profoundly impact patient care regarding quality, satisfaction,\nand cost. This paper investigates the influence of professional networks among\nmedical doctors on referring patients from PC to SC. Using five-year\nconsultation data from a Portuguese private health provider, we conducted\nexploratory data analysis and constructed both professional and referral\nnetworks among physicians. We then apply Graph Neural Network (GNN) models to\nlearn latent representations of the referral network. Our analysis supports the\nhypothesis that doctors' professional social connections can predict medical\nreferrals, potentially enhancing collaboration within organizations and\nimproving healthcare services. This research contributes to dissecting the\nunderlying mechanisms in primary-specialty referrals, thereby providing\nvaluable insights for enhancing patient care and effective healthcare\nmanagement.\n","authors":["Regina de Brito Duarte","Qiwei Han","Claudia Soares"],"pdf_url":"https://arxiv.org/pdf/2312.02387v1.pdf","comment":"27 pages, 9 figures, 2 tables"}],"Multimedia":[{"id":"http://arxiv.org/abs/2312.01640v1","updated":"2023-12-04T05:42:56Z","published":"2023-12-04T05:42:56Z","title":"SequencePAR: Understanding Pedestrian Attributes via A Sequence\n Generation Paradigm","summary":" Current pedestrian attribute recognition (PAR) algorithms are developed based\non multi-label or multi-task learning frameworks, which aim to discriminate the\nattributes using specific classification heads. However, these discriminative\nmodels are easily influenced by imbalanced data or noisy samples. Inspired by\nthe success of generative models, we rethink the pedestrian attribute\nrecognition scheme and believe the generative models may perform better on\nmodeling dependencies and complexity between human attributes. In this paper,\nwe propose a novel sequence generation paradigm for pedestrian attribute\nrecognition, termed SequencePAR. It extracts the pedestrian features using a\npre-trained CLIP model and embeds the attribute set into query tokens under the\nguidance of text prompts. Then, a Transformer decoder is proposed to generate\nthe human attributes by incorporating the visual features and attribute query\ntokens. The masked multi-head attention layer is introduced into the decoder\nmodule to prevent the model from remembering the next attribute while making\nattribute predictions during training. Extensive experiments on multiple widely\nused pedestrian attribute recognition datasets fully validated the\neffectiveness of our proposed SequencePAR. 
The source code and pre-trained\nmodels will be released at https://github.com/Event-AHU/OpenPAR.\n","authors":["Jiandong Jin","Xiao Wang","Chenglong Li","Lili Huang","Jin Tang"],"pdf_url":"https://arxiv.org/pdf/2312.01640v1.pdf","comment":"In Peer Review"},{"id":"http://arxiv.org/abs/2306.08089v2","updated":"2023-12-04T02:31:14Z","published":"2023-06-13T19:14:54Z","title":"360TripleView: 360-Degree Video View Management System Driven by\n Convergence Value of Viewing Preferences","summary":" 360-degree video has become increasingly popular in content consumption.\nHowever, finding the viewing direction for important content within each frame\nposes a significant challenge. Existing approaches rely on either viewer input\nor algorithmic determination to select the viewing direction, but neither mode\nconsistently outperforms the other in terms of content-importance. In this\npaper, we propose 360TripleView, the first view management system for\n360-degree video that automatically infers and utilizes the better view mode\nfor each frame, ultimately providing viewers with higher content-importance\nviews. Through extensive experiments and a user study, we demonstrate that\n360TripleView achieves over 90\\% accuracy in inferring the better mode and\nsignificantly enhances content-importance compared to existing methods.\n","authors":["Qian Zhou","Michael Zink","Ramesh Sitaraman","Klara Nahrstedt"],"pdf_url":"https://arxiv.org/pdf/2306.08089v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01576v1","updated":"2023-12-04T02:20:35Z","published":"2023-12-04T02:20:35Z","title":"Learning Efficient Unsupervised Satellite Image-based Building Damage\n Detection","summary":" Existing Building Damage Detection (BDD) methods always require\nlabour-intensive pixel-level annotations of buildings and their conditions,\nhence largely limiting their applications. In this paper, we investigate a\nchallenging yet practical scenario of BDD, Unsupervised Building Damage\nDetection (U-BDD), where only unlabelled pre- and post-disaster satellite image\npairs are provided. As a pilot study, we have first proposed an advanced U-BDD\nbaseline that leverages pre-trained vision-language foundation models (i.e.,\nGrounding DINO, SAM and CLIP) to address the U-BDD task. However, the apparent\ndomain gap between satellite and generic images causes low confidence in the\nfoundation models used to identify buildings and their damages. In response, we\nfurther present a novel self-supervised framework, U-BDD++, which improves upon\nthe U-BDD baseline by addressing domain-specific issues associated with\nsatellite imagery. Furthermore, the new Building Proposal Generation (BPG)\nmodule and the CLIP-enabled noisy Building Proposal Selection (CLIP-BPS) module\nin U-BDD++ ensure high-quality self-training. Extensive experiments on the\nwidely used building damage assessment benchmark demonstrate the effectiveness\nof the proposed method for unsupervised building damage detection. The\npresented annotation-free and foundation model-based paradigm ensures an\nefficient learning phase. 
This study opens a new direction for real-world BDD\nand sets a strong baseline for future research.\n","authors":["Yiyun Zhang","Zijian Wang","Yadan Luo","Xin Yu","Zi Huang"],"pdf_url":"https://arxiv.org/pdf/2312.01576v1.pdf","comment":"ICDM 2023"},{"id":"http://arxiv.org/abs/2306.07848v10","updated":"2023-12-04T06:27:50Z","published":"2023-06-13T15:28:10Z","title":"GEmo-CLAP: Gender-Attribute-Enhanced Contrastive Language-Audio\n Pretraining for Accurate Speech Emotion Recognition","summary":" Contrastive cross-modality pretraining has recently exhibited impressive\nsuccess in diverse fields, whereas there is limited research on their merits in\nspeech emotion recognition (SER). In this paper, we propose GEmo-CLAP, a kind\nof gender-attribute-enhanced contrastive language-audio pretraining (CLAP)\nmethod for SER. Specifically, we first construct an effective emotion CLAP\n(Emo-CLAP) for SER, using pre-trained text and audio encoders. Second, given\nthe significance of gender information in SER, two novel multi-task learning\nbased GEmo-CLAP (ML-GEmo-CLAP) and soft label based GEmo-CLAP (SL-GEmo-CLAP)\nmodels are further proposed to incorporate gender information of speech\nsignals, forming more reasonable objectives. Experiments on IEMOCAP indicate\nthat our proposed two GEmo-CLAPs consistently outperform Emo-CLAP with\ndifferent pre-trained models. Remarkably, the proposed WavLM-based SL-GEmo-CLAP\nobtains the best WAR of 83.16\\%, which performs better than state-of-the-art\nSER methods.\n","authors":["Yu Pan","Yanni Hu","Yuguang Yang","Wen Fei","Jixun Yao","Heng Lu","Lei Ma","Jianjun Zhao"],"pdf_url":"https://arxiv.org/pdf/2306.07848v10.pdf","comment":"5 pages"},{"id":"http://arxiv.org/abs/2312.02338v1","updated":"2023-12-04T20:47:48Z","published":"2023-12-04T20:47:48Z","title":"A Contrastive Compositional Benchmark for Text-to-Image Synthesis: A\n Study with Unified Text-to-Image Fidelity Metrics","summary":" Text-to-image (T2I) synthesis has recently achieved significant advancements.\nHowever, challenges remain in the model's compositionality, which is the\nability to create new combinations from known components. We introduce\nWinoground-T2I, a benchmark designed to evaluate the compositionality of T2I\nmodels. This benchmark includes 11K complex, high-quality contrastive sentence\npairs spanning 20 categories. These contrastive sentence pairs with subtle\ndifferences enable fine-grained evaluations of T2I synthesis models.\nAdditionally, to address the inconsistency across different metrics, we propose\na strategy that evaluates the reliability of various metrics by using\ncomparative sentence pairs. We use Winoground-T2I with a dual objective: to\nevaluate the performance of T2I models and the metrics used for their\nevaluation. Finally, we provide insights into the strengths and weaknesses of\nthese metrics and the capabilities of current T2I models in tackling challenges\nacross a range of complex compositional categories. 
Our benchmark is publicly\navailable at https://github.com/zhuxiangru/Winoground-T2I .\n","authors":["Xiangru Zhu","Penglei Sun","Chengyu Wang","Jingping Liu","Zhixu Li","Yanghua Xiao","Jun Huang"],"pdf_url":"https://arxiv.org/pdf/2312.02338v1.pdf","comment":"17 pages, 14 figures, 11 tables"},{"id":"http://arxiv.org/abs/2312.02238v1","updated":"2023-12-04T09:19:38Z","published":"2023-12-04T09:19:38Z","title":"X-Adapter: Adding Universal Compatibility of Plugins for Upgraded\n Diffusion Model","summary":" We introduce X-Adapter, a universal upgrader to enable the pretrained\nplug-and-play modules (e.g., ControlNet, LoRA) to work directly with the\nupgraded text-to-image diffusion model (e.g., SDXL) without further retraining.\nWe achieve this goal by training an additional network to control the frozen\nupgraded model with the new text-image data pairs. In detail, X-Adapter keeps a\nfrozen copy of the old model to preserve the connectors of different plugins.\nAdditionally, X-Adapter adds trainable mapping layers that bridge the decoders\nfrom models of different versions for feature remapping. The remapped features\nwill be used as guidance for the upgraded model. To enhance the guidance\nability of X-Adapter, we employ a null-text training strategy for the upgraded\nmodel. After training, we also introduce a two-stage denoising strategy to\nalign the initial latents of X-Adapter and the upgraded model. Thanks to our\nstrategies, X-Adapter demonstrates universal compatibility with various plugins\nand also enables plugins of different versions to work together, thereby\nexpanding the functionalities of diffusion community. To verify the\neffectiveness of the proposed method, we conduct extensive experiments and the\nresults show that X-Adapter may facilitate wider application in the upgraded\nfoundational diffusion model.\n","authors":["Lingmin Ran","Xiaodong Cun","JiaWei Liu","Rui Zhao","Song Zijie","Xintao Wang","Jussi Keppo","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2312.02238v1.pdf","comment":"Project page: https://showlab.github.io/X-Adapter/"}]},"2023-12-03T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2312.01532v1","updated":"2023-12-03T23:12:49Z","published":"2023-12-03T23:12:49Z","title":"Using Large Language Models to Accelerate Communication for Users with\n Severe Motor Impairments","summary":" Finding ways to accelerate text input for individuals with profound motor\nimpairments has been a long-standing area of research. Closing the speed gap\nfor augmentative and alternative communication (AAC) devices such as\neye-tracking keyboards is important for improving the quality of life for such\nindividuals. Recent advances in neural networks of natural language pose new\nopportunities for re-thinking strategies and user interfaces for enhanced\ntext-entry for AAC users. In this paper, we present SpeakFaster, consisting of\nlarge language models (LLMs) and a co-designed user interface for text entry in\na highly-abbreviated form, allowing saving 57% more motor actions than\ntraditional predictive keyboards in offline simulation. A pilot study with 19\nnon-AAC participants typing on a mobile device by hand demonstrated gains in\nmotor savings in line with the offline simulation, while introducing relatively\nsmall effects on overall typing speed. 
Lab and field testing on two eye-gaze\ntyping users with amyotrophic lateral sclerosis (ALS) demonstrated text-entry\nrates 29-60% faster than traditional baselines, due to significant saving of\nexpensive keystrokes achieved through phrase and word predictions from\ncontext-aware LLMs. These findings provide a strong foundation for further\nexploration of substantially-accelerated text communication for motor-impaired\nusers and demonstrate a direction for applying LLMs to text-based user\ninterfaces.\n","authors":["Shanqing Cai","Subhashini Venugopalan","Katie Seaver","Xiang Xiao","Katrin Tomanek","Sri Jalasutram","Meredith Ringel Morris","Shaun Kane","Ajit Narayanan","Robert L. MacDonald","Emily Kornman","Daniel Vance","Blair Casey","Steve M. Gleason","Philip Q. Nelson","Michael P. Brenner"],"pdf_url":"https://arxiv.org/pdf/2312.01532v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01529v1","updated":"2023-12-03T23:03:22Z","published":"2023-12-03T23:03:22Z","title":"T3D: Towards 3D Medical Image Understanding through Vision-Language\n Pre-training","summary":" Expert annotation of 3D medical image for downstream analysis is\nresource-intensive, posing challenges in clinical applications. Visual\nself-supervised learning (vSSL), though effective for learning visual\ninvariance, neglects the incorporation of domain knowledge from medicine. To\nincorporate medical knowledge into visual representation learning,\nvision-language pre-training (VLP) has shown promising results in 2D image.\nHowever, existing VLP approaches become generally impractical when applied to\nhigh-resolution 3D medical images due to GPU hardware constraints and the\npotential loss of critical details caused by downsampling, which is the\nintuitive solution to hardware constraints. To address the above limitations,\nwe introduce T3D, the first VLP framework designed for high-resolution 3D\nmedical images. T3D incorporates two text-informed pretext tasks:\n(\\lowerromannumeral{1}) text-informed contrastive learning;\n(\\lowerromannumeral{2}) text-informed image restoration. These tasks focus on\nlearning 3D visual representations from high-resolution 3D medical images and\nintegrating clinical knowledge from radiology reports, without distorting\ninformation through forced alignment of downsampled volumes with detailed\nanatomical text. Trained on a newly curated large-scale dataset of 3D medical\nimages and radiology reports, T3D significantly outperforms current vSSL\nmethods in tasks like organ and tumor segmentation, as well as disease\nclassification. This underlines T3D's potential in representation learning for\n3D medical image analysis. All data and code will be available upon acceptance.\n","authors":["Che Liu","Cheng Ouyang","Yinda Chen","Cesar César Quilodrán-Casas","Lei Ma","Jie Fu","Yike Guo","Anand Shah","Wenjia Bai","Rossella Arcucci"],"pdf_url":"https://arxiv.org/pdf/2312.01529v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.10513v3","updated":"2023-12-03T23:01:19Z","published":"2023-04-20T17:48:43Z","title":"Why Does ChatGPT Fall Short in Providing Truthful Answers?","summary":" Recent advancements in large language models, such as ChatGPT, have\ndemonstrated significant potential to impact various aspects of human life.\nHowever, ChatGPT still faces challenges in providing reliable and accurate\nanswers to user questions. To better understand the model's particular\nweaknesses in providing truthful answers, we embark an in-depth exploration of\nopen-domain question answering. 
Specifically, we undertake a detailed\nexamination of ChatGPT's failures, categorized into: comprehension, factuality,\nspecificity, and inference. We further pinpoint factuality as the most\ncontributing failure and identify two critical abilities associated with\nfactuality: knowledge memorization and knowledge recall. Through experiments\nfocusing on factuality, we propose several potential enhancement strategies.\nOur findings suggest that augmenting the model with granular external knowledge\nand cues for knowledge recall can enhance the model's factuality in answering\nquestions.\n","authors":["Shen Zheng","Jie Huang","Kevin Chen-Chuan Chang"],"pdf_url":"https://arxiv.org/pdf/2304.10513v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10601v2","updated":"2023-12-03T22:50:35Z","published":"2023-05-17T23:16:17Z","title":"Tree of Thoughts: Deliberate Problem Solving with Large Language Models","summary":" Language models are increasingly being deployed for general problem solving\nacross a wide range of tasks, but are still confined to token-level,\nleft-to-right decision-making processes during inference. This means they can\nfall short in tasks that require exploration, strategic lookahead, or where\ninitial decisions play a pivotal role. To surmount these challenges, we\nintroduce a new framework for language model inference, Tree of Thoughts (ToT),\nwhich generalizes over the popular Chain of Thought approach to prompting\nlanguage models, and enables exploration over coherent units of text (thoughts)\nthat serve as intermediate steps toward problem solving. ToT allows LMs to\nperform deliberate decision making by considering multiple different reasoning\npaths and self-evaluating choices to decide the next course of action, as well\nas looking ahead or backtracking when necessary to make global choices. Our\nexperiments show that ToT significantly enhances language models'\nproblem-solving abilities on three novel tasks requiring non-trivial planning\nor search: Game of 24, Creative Writing, and Mini Crosswords. For instance, in\nGame of 24, while GPT-4 with chain-of-thought prompting only solved 4% of\ntasks, our method achieved a success rate of 74%. Code repo with all prompts:\nhttps://github.com/princeton-nlp/tree-of-thought-llm.\n","authors":["Shunyu Yao","Dian Yu","Jeffrey Zhao","Izhak Shafran","Thomas L. Griffiths","Yuan Cao","Karthik Narasimhan"],"pdf_url":"https://arxiv.org/pdf/2305.10601v2.pdf","comment":"NeurIPS 2023 camera ready version. Code repo with all prompts:\n https://github.com/princeton-nlp/tree-of-thought-llm"},{"id":"http://arxiv.org/abs/2312.01523v1","updated":"2023-12-03T22:44:58Z","published":"2023-12-03T22:44:58Z","title":"SymNoise: Advancing Language Model Fine-tuning with Symmetric Noise","summary":" In this paper, we introduce a novel fine-tuning technique for language\nmodels, which involves incorporating symmetric noise into the embedding\nprocess. This method aims to enhance the model's function by more stringently\nregulating its local curvature, demonstrating superior performance over the\ncurrent method, NEFTune. When fine-tuning the LLaMA-2-7B model using Alpaca,\nstandard techniques yield a 29.79% score on AlpacaEval. However, our approach,\nSymNoise, increases this score significantly to 69.04%, using symmetric noisy\nembeddings. This is a 6.7% improvement over the state-of-the-art method,\nNEFTune~(64.69%). 
Furthermore, when tested on various models and stronger\nbaseline instruction datasets, such as Evol-Instruct, ShareGPT, OpenPlatypus,\nSymNoise consistently outperforms NEFTune. The current literature, including\nNEFTune, has underscored the importance of more in-depth research into the\napplication of noise-based strategies in the fine-tuning of language models.\nOur approach, SymNoise, is another significant step towards this direction,\nshowing notable improvement over the existing state-of-the-art method.\n","authors":["Arjun Singh","Abhay Kumar Yadav"],"pdf_url":"https://arxiv.org/pdf/2312.01523v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01515v1","updated":"2023-12-03T22:08:54Z","published":"2023-12-03T22:08:54Z","title":"Bigger is not Always Better: The Effect of Context Size on Speech\n Pre-Training","summary":" It has been generally assumed in the automatic speech recognition (ASR)\nliterature that it is better for models to have access to wider context\nwindows. Yet, many of the potential reasons this might be true in the\nsupervised setting do not necessarily transfer over to the case of unsupervised\nlearning. We investigate how much context is necessary to achieve high-quality\npre-trained acoustic models using self-supervised learning. We principally\ninvestigate contrastive predictive coding (CPC), which we adapt to be able to\nprecisely control the amount of context visible to the model during training\nand inference. We find that phone discriminability in the resulting model\nrepresentations peaks at around 40~ms of preceding context, and that having too\nmuch context (beyond around 320 ms) substantially degrades the quality of the\nrepresentations. Surprisingly, we find that this pattern also transfers to\nsupervised ASR when the pre-trained representations are used as frozen input\nfeatures. Our results point to potential changes in the design of current\nupstream architectures to better facilitate a variety of downstream tasks.\n","authors":["Sean Robertson","Ewan Dunbar"],"pdf_url":"https://arxiv.org/pdf/2312.01515v1.pdf","comment":"Repository at https://github.com/sdrobert/scpc. This work has been\n submitted to the IEEE for possible publication. Copyright may be transferred\n without notice, after which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2312.01509v1","updated":"2023-12-03T21:25:10Z","published":"2023-12-03T21:25:10Z","title":"Tackling Bias in Pre-trained Language Models: Current Trends and\n Under-represented Societies","summary":" The benefits and capabilities of pre-trained language models (LLMs) in\ncurrent and future innovations are vital to any society. However, introducing\nand using LLMs comes with biases and discrimination, resulting in concerns\nabout equality, diversity and fairness, and must be addressed. While\nunderstanding and acknowledging bias in LLMs and developing mitigation\nstrategies are crucial, the generalised assumptions towards societal needs can\nresult in disadvantages towards under-represented societies and indigenous\npopulations. Furthermore, the ongoing changes to actual and proposed amendments\nto regulations and laws worldwide also impact research capabilities in tackling\nthe bias problem. This research presents a comprehensive survey synthesising\nthe current trends and limitations in techniques used for identifying and\nmitigating bias in LLMs, where the overview of methods for tackling bias are\ngrouped into metrics, benchmark datasets, and mitigation strategies. 
The\nimportance and novelty of this survey are that it explores the perspective of\nunder-represented societies. We argue that current practices tackling the bias\nproblem cannot simply be 'plugged in' to address the needs of under-represented\nsocieties. We use examples from New Zealand to present requirements for\nadopting existing techniques to under-represented societies.\n","authors":["Vithya Yogarajan","Gillian Dobbie","Te Taka Keegan","Rostam J. Neuwirth"],"pdf_url":"https://arxiv.org/pdf/2312.01509v1.pdf","comment":"38 pages, 5 figures, 11 tables. arXiv admin note: text overlap with\n arXiv:2309.00770 by other authors"},{"id":"http://arxiv.org/abs/2312.01504v1","updated":"2023-12-03T20:42:38Z","published":"2023-12-03T20:42:38Z","title":"Effectively Fine-tune to Improve Large Multimodal Models for Radiology\n Report Generation","summary":" Writing radiology reports from medical images requires a high level of domain\nexpertise. It is time-consuming even for trained radiologists and can be\nerror-prone for inexperienced radiologists. It would be appealing to automate\nthis task by leveraging generative AI, which has shown drastic progress in\nvision and language understanding. In particular, Large Language Models (LLM)\nhave demonstrated impressive capabilities recently and continued to set new\nstate-of-the-art performance on almost all natural language tasks. While many\nhave proposed architectures to combine vision models with LLMs for multimodal\ntasks, few have explored practical fine-tuning strategies. In this work, we\nproposed a simple yet effective two-stage fine-tuning protocol to align visual\nfeatures to LLM's text embedding space as soft visual prompts. Our framework\nwith OpenLLaMA-7B achieved state-of-the-art level performance without\ndomain-specific pretraining. Moreover, we provide detailed analyses of soft\nvisual prompts and attention mechanisms, shedding light on future research\ndirections.\n","authors":["Yuzhe Lu","Sungmin Hong","Yash Shah","Panpan Xu"],"pdf_url":"https://arxiv.org/pdf/2312.01504v1.pdf","comment":"Accepted to Deep Generative Models for Health Workshop at NeurIPS\n 2023"},{"id":"http://arxiv.org/abs/2312.01500v1","updated":"2023-12-03T20:09:23Z","published":"2023-12-03T20:09:23Z","title":"Unsupervised Approach to Evaluate Sentence-Level Fluency: Do We Really\n Need Reference?","summary":" Fluency is a crucial goal of all Natural Language Generation (NLG) systems.\nWidely used automatic evaluation metrics fall short in capturing the fluency of\nmachine-generated text. Assessing the fluency of NLG systems poses a challenge\nsince these models are not limited to simply reusing words from the input but\nmay also generate abstractions. Existing reference-based fluency evaluations,\nsuch as word overlap measures, often exhibit weak correlations with human\njudgments. This paper adapts an existing unsupervised technique for measuring\ntext fluency without the need for any reference. Our approach leverages various\nword embeddings and trains language models using Recurrent Neural Network (RNN)\narchitectures. We also experiment with other available multilingual Language\nModels (LMs). To assess the performance of the models, we conduct a comparative\nanalysis across 10 Indic languages, correlating the obtained fluency scores\nwith human judgments. 
Our code and human-annotated benchmark test-set for\nfluency is available at\nhttps://github.com/AnanyaCoder/TextFluencyForIndicLanaguges.\n","authors":["Gopichand Kanumolu","Lokesh Madasu","Pavan Baswani","Ananya Mukherjee","Manish Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2312.01500v1.pdf","comment":"Accepted at IJCNLP-AACL SEALP Workshop"},{"id":"http://arxiv.org/abs/2305.04782v2","updated":"2023-12-03T19:31:39Z","published":"2023-05-08T15:34:56Z","title":"HistAlign: Improving Context Dependency in Language Generation by\n Aligning with History","summary":" Language models (LMs) can generate hallucinations and incoherent outputs,\nwhich highlights their weak context dependency. Cache-LMs, which augment LMs\nwith a memory of recent history, can increase context dependency and have shown\nremarkable performance in diverse language generation tasks. However, we find\nthat even with training, the performance gain stemming from the cache component\nof current cache-LMs is suboptimal due to the misalignment between the current\nhidden states and those stored in the memory. In this work, we present\nHistAlign, a new training approach to ensure good cache alignment such that the\nmodel receives useful signals from the history. We first prove our concept on a\nsimple and synthetic task where the memory is essential for correct\npredictions, and we show that the cache component of HistAlign is better\naligned and improves overall performance. Next, we evaluate HistAlign on\ndiverse downstream language generation tasks, including prompt continuation,\nabstractive summarization, and data-to-text. We demonstrate that HistAlign\nimproves text coherence and faithfulness in open-ended and conditional\ngeneration settings respectively. HistAlign is also generalizable across\ndifferent model families, showcasing its strength in improving context\ndependency of LMs in diverse scenarios. Our code is publicly available at\nhttps://github.com/meetdavidwan/histalign\n","authors":["David Wan","Shiyue Zhang","Mohit Bansal"],"pdf_url":"https://arxiv.org/pdf/2305.04782v2.pdf","comment":"EMNLP 2023 (20 pages)"},{"id":"http://arxiv.org/abs/2303.17580v4","updated":"2023-12-03T18:17:21Z","published":"2023-03-30T17:48:28Z","title":"HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in Hugging\n Face","summary":" Solving complicated AI tasks with different domains and modalities is a key\nstep toward artificial general intelligence. While there are numerous AI models\navailable for various domains and modalities, they cannot handle complicated AI\ntasks autonomously. Considering large language models (LLMs) have exhibited\nexceptional abilities in language understanding, generation, interaction, and\nreasoning, we advocate that LLMs could act as a controller to manage existing\nAI models to solve complicated AI tasks, with language serving as a generic\ninterface to empower this. Based on this philosophy, we present HuggingGPT, an\nLLM-powered agent that leverages LLMs (e.g., ChatGPT) to connect various AI\nmodels in machine learning communities (e.g., Hugging Face) to solve AI tasks.\nSpecifically, we use ChatGPT to conduct task planning when receiving a user\nrequest, select models according to their function descriptions available in\nHugging Face, execute each subtask with the selected AI model, and summarize\nthe response according to the execution results. 
By leveraging the strong\nlanguage capability of ChatGPT and abundant AI models in Hugging Face,\nHuggingGPT can tackle a wide range of sophisticated AI tasks spanning different\nmodalities and domains and achieve impressive results in language, vision,\nspeech, and other challenging tasks, which paves a new way towards the\nrealization of artificial general intelligence.\n","authors":["Yongliang Shen","Kaitao Song","Xu Tan","Dongsheng Li","Weiming Lu","Yueting Zhuang"],"pdf_url":"https://arxiv.org/pdf/2303.17580v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.17769v2","updated":"2023-12-03T17:42:33Z","published":"2023-10-26T20:27:03Z","title":"Social Contract AI: Aligning AI Assistants with Implicit Group Norms","summary":" We explore the idea of aligning an AI assistant by inverting a model of\nusers' (unknown) preferences from observed interactions. To validate our\nproposal, we run proof-of-concept simulations in the economic ultimatum game,\nformalizing user preferences as policies that guide the actions of simulated\nplayers. We find that the AI assistant accurately aligns its behavior to match\nstandard policies from the economic literature (e.g., selfish, altruistic).\nHowever, the assistant's learned policies lack robustness and exhibit limited\ngeneralization in an out-of-distribution setting when confronted with a\ncurrency (e.g., grams of medicine) that was not included in the assistant's\ntraining distribution. Additionally, we find that when there is inconsistency\nin the relationship between language use and an unknown policy (e.g., an\naltruistic policy combined with rude language), the assistant's learning of the\npolicy is slowed. Overall, our preliminary results suggest that developing\nsimulation frameworks in which AI assistants need to infer preferences from\ndiverse users can provide a valuable approach for studying practical alignment\nquestions.\n","authors":["Jan-Philipp Fränken","Sam Kwok","Peixuan Ye","Kanishk Gandhi","Dilip Arumugam","Jared Moore","Alex Tamkin","Tobias Gerstenberg","Noah D. Goodman"],"pdf_url":"https://arxiv.org/pdf/2310.17769v2.pdf","comment":"SoLaR NeurIPS 2023 Workshop (https://solar-neurips.github.io/)"},{"id":"http://arxiv.org/abs/2210.14986v2","updated":"2023-12-03T17:14:05Z","published":"2022-10-26T19:04:23Z","title":"The Goldilocks of Pragmatic Understanding: Fine-Tuning Strategy Matters\n for Implicature Resolution by LLMs","summary":" Despite widespread use of LLMs as conversational agents, evaluations of\nperformance fail to capture a crucial aspect of communication: interpreting\nlanguage in context -- incorporating its pragmatics. Humans interpret language\nusing beliefs and prior knowledge about the world. For example, we intuitively\nunderstand the response \"I wore gloves\" to the question \"Did you leave\nfingerprints?\" as meaning \"No\". To investigate whether LLMs have the ability to\nmake this type of inference, known as an implicature, we design a simple task\nand evaluate four categories of widely used state-of-the-art models. We find\nthat, despite only evaluating on utterances that require a binary inference\n(yes or no), models in three of these categories perform close to random.\nHowever, LLMs instruction-tuned at the example-level perform significantly\nbetter. These results suggest that certain fine-tuning strategies are far\nbetter at inducing pragmatic understanding in models. 
We present our findings\nas the starting point for further research into evaluating how LLMs interpret\nlanguage in context and to drive the development of more pragmatic and useful\nmodels of human discourse.\n","authors":["Laura Ruis","Akbir Khan","Stella Biderman","Sara Hooker","Tim Rocktäschel","Edward Grefenstette"],"pdf_url":"https://arxiv.org/pdf/2210.14986v2.pdf","comment":"Accepted as Spotlight at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.01454v1","updated":"2023-12-03T16:58:10Z","published":"2023-12-03T16:58:10Z","title":"D-Bot: Database Diagnosis System using Large Language Models","summary":" Database administrators (DBAs) play an important role in managing,\nmaintaining and optimizing database systems. However, it is hard and tedious\nfor DBAs to manage a large number of databases and give timely response\n(waiting for hours is intolerable in many online cases). In addition, existing\nempirical methods only support limited diagnosis scenarios, which are also\nlabor-intensive to update the diagnosis rules for database version updates.\nRecently large language models (LLMs) have shown great potential in various\nfields. Thus, we propose D-Bot, an LLM-based database diagnosis system that can\nautomatically acquire knowledge from diagnosis documents, and generate\nreasonable and well-founded diagnosis report (i.e., identifying the root causes\nand solutions) within acceptable time (e.g., under 10 minutes compared to hours\nby a DBA). The techniques in D-Bot include (i) offline knowledge extraction\nfrom documents, (ii) automatic prompt generation (e.g., knowledge matching,\ntool retrieval), (iii) root cause analysis using tree search algorithm, and\n(iv) collaborative mechanism for complex anomalies with multiple root causes.\nWe verify D-Bot on real benchmarks (including 539 anomalies of six typical\napplications), and the results show that D-Bot can effectively analyze the root\ncauses of unseen anomalies and significantly outperforms traditional methods\nand vanilla models like GPT-4.\n","authors":["Xuanhe Zhou","Guoliang Li","Zhaoyan Sun","Zhiyuan Liu","Weize Chen","Jianming Wu","Jiesi Liu","Ruohang Feng","Guoyang Zeng"],"pdf_url":"https://arxiv.org/pdf/2312.01454v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.10789v2","updated":"2023-12-03T16:54:26Z","published":"2022-12-21T06:18:31Z","title":"Multi-modal Molecule Structure-text Model for Text-based Retrieval and\n Editing","summary":" There is increasing adoption of artificial intelligence in drug discovery.\nHowever, existing studies use machine learning to mainly utilize the chemical\nstructures of molecules but ignore the vast textual knowledge available in\nchemistry. Incorporating textual knowledge enables us to realize new drug\ndesign objectives, adapt to text-based instructions and predict complex\nbiological activities. Here we present a multi-modal molecule structure-text\nmodel, MoleculeSTM, by jointly learning molecules' chemical structures and\ntextual descriptions via a contrastive learning strategy. To train MoleculeSTM,\nwe construct a large multi-modal dataset, namely, PubChemSTM, with over 280,000\nchemical structure-text pairs. To demonstrate the effectiveness and utility of\nMoleculeSTM, we design two challenging zero-shot tasks based on text\ninstructions, including structure-text retrieval and molecule editing.\nMoleculeSTM has two main properties: open vocabulary and compositionality via\nnatural language. 
In experiments, MoleculeSTM obtains the state-of-the-art\ngeneralization ability to novel biochemical concepts across various benchmarks.\n","authors":["Shengchao Liu","Weili Nie","Chengpeng Wang","Jiarui Lu","Zhuoran Qiao","Ling Liu","Jian Tang","Chaowei Xiao","Anima Anandkumar"],"pdf_url":"https://arxiv.org/pdf/2212.10789v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.11300v3","updated":"2023-12-03T16:33:58Z","published":"2023-06-20T05:30:59Z","title":"RS5M: A Large Scale Vision-Language Dataset for Remote Sensing\n Vision-Language Foundation Model","summary":" Pre-trained Vision-Language Models (VLMs) utilizing extensive image-text\npaired data have demonstrated unprecedented image-text association\ncapabilities, achieving remarkable results across various downstream tasks. A\ncritical challenge is how to make use of existing large-scale pre-trained VLMs,\nwhich are trained on common objects, to perform the domain-specific transfer\nfor accomplishing domain-related downstream tasks. In this paper, we propose a new framework that\nincludes the Domain pre-trained Vision-Language Model (DVLM), bridging the gap\nbetween the General Vision-Language Model (GVLM) and domain-specific downstream\ntasks. Moreover, we present an image-text paired dataset in the field of remote\nsensing (RS), RS5M, which has 5 million RS images with English descriptions.\nThe dataset is obtained from filtering publicly available image-text paired\ndatasets and captioning label-only RS datasets with pre-trained VLM. These\nconstitute the first large-scale RS image-text paired dataset. Additionally, we\nfine-tuned the CLIP model and tried several Parameter-Efficient Fine-Tuning\nmethods on RS5M to implement the DVLM. Experimental results show that our\nproposed dataset is highly effective for various tasks, and our model GeoRSCLIP\nimproves upon the baseline or previous state-of-the-art model by $3\\%\\sim20\\%$\nin Zero-shot Classification (ZSC), $3\\%\\sim6\\%$ in Remote Sensing Cross-Modal\nText-Image Retrieval (RSCTIR) and $4\\%\\sim5\\%$ in Semantic Localization (SeLo)\ntasks. Dataset and models have been released in:\n\\url{https://github.com/om-ai-lab/RS5M}.\n","authors":["Zilun Zhang","Tiancheng Zhao","Yulong Guo","Jianwei Yin"],"pdf_url":"https://arxiv.org/pdf/2306.11300v3.pdf","comment":"RS5M dataset v5"},{"id":"http://arxiv.org/abs/2312.01429v1","updated":"2023-12-03T15:34:46Z","published":"2023-12-03T15:34:46Z","title":"Transformers are uninterpretable with myopic methods: a case study with\n bounded Dyck grammars","summary":" Interpretability methods aim to understand the algorithm implemented by a\ntrained model (e.g., a Transformer) by examining various aspects of the model,\nsuch as the weight matrices or the attention patterns. In this work, through a\ncombination of theoretical results and carefully controlled experiments on\nsynthetic data, we take a critical view of methods that exclusively focus on\nindividual parts of the model, rather than consider the network as a whole. We\nconsider a simple synthetic setup of learning a (bounded) Dyck language.\nTheoretically, we show that the set of models that (exactly or approximately)\nsolve this task satisfy a structural characterization derived from ideas in\nformal languages (the pumping lemma). 
We use this characterization to show that\nthe set of optima is qualitatively rich; in particular, the attention pattern\nof a single layer can be ``nearly randomized'', while preserving the\nfunctionality of the network. We also show via extensive experiments that these\nconstructions are not merely a theoretical artifact: even after severely\nconstraining the architecture of the model, vastly different solutions can be\nreached via standard training. Thus, interpretability claims based on\ninspecting individual heads or weight matrices in the Transformer can be\nmisleading.\n","authors":["Kaiyue Wen","Yuchen Li","Bingbin Liu","Andrej Risteski"],"pdf_url":"https://arxiv.org/pdf/2312.01429v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14648v2","updated":"2023-12-03T15:28:02Z","published":"2023-11-24T18:29:50Z","title":"Calibrated Language Models Must Hallucinate","summary":" Recent language models generate false but plausible-sounding text with\nsurprising frequency. Such \"hallucinations\" are an obstacle to the usability of\nlanguage-based AI systems and can harm people who rely upon their outputs. This\nwork shows that there is an inherent statistical lower-bound on the rate\nthat pretrained language models hallucinate certain types of facts, having\nnothing to do with the transformer LM architecture or data quality. For\n\"arbitrary\" facts whose veracity cannot be determined from the training data,\nwe show that hallucinations must occur at a certain rate for language models\nthat satisfy a statistical calibration condition appropriate for generative\nlanguage models. Specifically, if the maximum probability of any fact is\nbounded, we show that the probability of generating a hallucination is close to\nthe fraction of facts that occur exactly once in the training data (a\n\"Good-Turing\" estimate), even assuming ideal training data without errors.\n One conclusion is that models pretrained to be sufficiently good predictors\n(i.e., calibrated) may require post-training to mitigate hallucinations on the\ntype of arbitrary facts that tend to appear once in the training set. However,\nour analysis also suggests that there is no statistical reason that pretraining\nwill lead to hallucination on facts that tend to appear more than once in the\ntraining data (like references to publications such as articles and books,\nwhose hallucinations have been particularly notable and problematic) or on\nsystematic facts (like arithmetic calculations). Therefore, different\narchitectures and learning algorithms may mitigate these latter types of\nhallucinations.\n","authors":["Adam Tauman Kalai","Santosh S. Vempala"],"pdf_url":"https://arxiv.org/pdf/2311.14648v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18743v2","updated":"2023-12-03T13:52:57Z","published":"2023-11-30T17:41:30Z","title":"AlignBench: Benchmarking Chinese Alignment of Large Language Models","summary":" Alignment has become a critical step for instruction-tuned Large Language\nModels (LLMs) to become helpful assistants. However, effective evaluation of\nalignment for emerging Chinese LLMs is still significantly lacking, calling for\nreal-scenario grounded, open-ended, challenging and automatic evaluations\ntailored for alignment. To fill in this gap, we introduce AlignBench, a\ncomprehensive multi-dimensional benchmark for evaluating LLMs' alignment in\nChinese. 
Equipped with a human-in-the-loop data curation pipeline, our\nbenchmark employs a rule-calibrated multi-dimensional LLM-as-Judge with\nChain-of-Thought to generate explanations and final ratings as evaluations,\nensuring high reliability and interpretability. Furthermore, we report\nAlignBench evaluated by CritiqueLLM, a dedicated Chinese evaluator LLM that\nrecovers 95% of GPT-4's evaluation ability. We will provide public APIs for\nevaluating AlignBench with CritiqueLLM to facilitate the evaluation of LLMs'\nChinese alignment. All evaluation codes, data, and LLM generations are\navailable at \\url{https://github.com/THUDM/AlignBench}.\n","authors":["Xiao Liu","Xuanyu Lei","Shengyuan Wang","Yue Huang","Zhuoer Feng","Bosi Wen","Jiale Cheng","Pei Ke","Yifan Xu","Weng Lam Tam","Xiaohan Zhang","Lichao Sun","Hongning Wang","Jing Zhang","Minlie Huang","Yuxiao Dong","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2311.18743v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01398v1","updated":"2023-12-03T13:52:32Z","published":"2023-12-03T13:52:32Z","title":"Towards Mitigating Perceived Unfairness in Contracts from a Non-Legal\n Stakeholder's Perspective","summary":" Commercial contracts are known to be a valuable source for deriving\nproject-specific requirements. However, contract negotiations mainly occur\namong the legal counsel of the parties involved. The participation of non-legal\nstakeholders, including requirement analysts, engineers, and solution\narchitects, whose primary responsibility lies in ensuring the seamless\nimplementation of contractual terms, is often indirect and inadequate.\nConsequently, a significant number of sentences in contractual clauses, though\nlegally accurate, can appear unfair from an implementation perspective to\nnon-legal stakeholders. This perception poses a problem since requirements\nindicated in the clauses are obligatory and can involve punitive measures and\npenalties if not implemented as committed in the contract. Therefore, the\nidentification of potentially unfair clauses in contracts becomes crucial. In\nthis work, we conduct an empirical study to analyze the perspectives of\ndifferent stakeholders regarding contractual fairness. We then investigate the\nability of Pre-trained Language Models (PLMs) to identify unfairness in\ncontractual sentences by comparing chain of thought prompting and\nsemi-supervised fine-tuning approaches. Using BERT-based fine-tuning, we\nachieved an accuracy of 84% on a dataset consisting of proprietary contracts.\nIt outperformed chain of thought prompting using Vicuna-13B by a margin of 9%.\n","authors":["Anmol Singhal","Preethu Rose Anish","Shirish Karande","Smita Ghaisas"],"pdf_url":"https://arxiv.org/pdf/2312.01398v1.pdf","comment":"9 pages, 2 figures, to be published in Natural Legal Language\n Processing Workshop at EMNLP 2023"},{"id":"http://arxiv.org/abs/2312.01356v1","updated":"2023-12-03T11:36:23Z","published":"2023-12-03T11:36:23Z","title":"CEScore: Simple and Efficient Confidence Estimation Model for Evaluating\n Split and Rephrase","summary":" The split and rephrase (SR) task aims to divide a long, complex sentence into\na set of shorter, simpler sentences that convey the same meaning. This\nchallenging problem in NLP has gained increased attention recently because of\nits benefits as a pre-processing step in other NLP tasks. Evaluating the quality of\nSR is challenging, as there is no automatic metric fit to evaluate this task. 
In\nthis work, we introduce CEScore, a novel statistical model to automatically\nevaluate the SR task. By mimicking the way humans evaluate SR, CEScore provides 4\nmetrics (Sscore, Gscore, Mscore, and CEscore) to assess simplicity,\ngrammaticality, meaning preservation, and overall quality, respectively. In\nexperiments with 26 models, CEScore correlates strongly with human evaluations,\nachieving a Spearman correlation of 0.98 at the model level. This underscores the\npotential of CEScore as a simple and effective metric for assessing the overall\nquality of SR models.\n","authors":["AlMotasem Bellah Al Ajlouni","Jinlong Li"],"pdf_url":"https://arxiv.org/pdf/2312.01356v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01339v1","updated":"2023-12-03T10:03:50Z","published":"2023-12-03T10:03:50Z","title":"AI-Powered Arabic Crossword Puzzle Generation for Educational\n Applications","summary":" This paper presents the first Arabic crossword puzzle generator driven by\nadvanced AI technology. Leveraging cutting-edge large language models including\nGPT4, GPT3-Davinci, GPT3-Curie, GPT3-Babbage, GPT3-Ada, and BERT, the system\ngenerates distinctive and challenging clues. Based on a dataset comprising over\n50,000 clue-answer pairs, the generator employs fine-tuning, few/zero-shot\nlearning strategies, and rigorous quality-checking protocols to enforce the\ngeneration of high-quality clue-answer pairs. Importantly, educational\ncrosswords contribute to enhancing memory, expanding vocabulary, and promoting\nproblem-solving skills, thereby augmenting the learning experience through a\nfun and engaging approach, reshaping the landscape of traditional learning\nmethods. The overall system can be exploited as a powerful educational tool\nthat amalgamates AI and innovative learning techniques, heralding a\ntransformative era for Arabic crossword puzzles and the intersection of\ntechnology and education.\n","authors":["Kamyar Zeinalipour","Mohamed Zaky Saad","Marco Maggini","Marco Gori"],"pdf_url":"https://arxiv.org/pdf/2312.01339v1.pdf","comment":"Accepted Paper for ArabicNLP 2023 - The First Arabic Natural Language\n Processing Conference - Co-located with EMNLP 2023 in Singapore"},{"id":"http://arxiv.org/abs/2312.01314v1","updated":"2023-12-03T08:09:45Z","published":"2023-12-03T08:09:45Z","title":"NLEBench+NorGLM: A Comprehensive Empirical Analysis and Benchmark\n Dataset for Generative Language Models in Norwegian","summary":" Recent advancements in Generative Language Models (GLMs) have transformed\nNatural Language Processing (NLP) by showcasing the effectiveness of the\n\"pre-train, prompt, and predict\" paradigm in utilizing pre-trained GLM\nknowledge for diverse applications. Despite their potential, these capabilities\nlack adequate quantitative characterization due to the absence of comprehensive\nbenchmarks, particularly for low-resource languages. Existing low-resource\nbenchmarks focus on discriminative language models like BERT, neglecting the\nevaluation of generative language models. Moreover, current benchmarks often\noverlook measuring generalization performance across multiple tasks, a crucial\nmetric for GLMs.\n To bridge these gaps, we introduce NLEBench, a comprehensive benchmark\ntailored for evaluating natural language generation capabilities in Norwegian,\na low-resource language. We use Norwegian as a case study to explore whether\ncurrent GLMs and benchmarks in mainstream languages like English can reveal the\nunique characteristics of underrepresented languages. 
NLEBench encompasses a\nsuite of real-world NLP tasks ranging from news storytelling, summarization,\nopen-domain conversation, natural language understanding, instruction\nfine-tuning, toxicity and bias evaluation, to self-curated Chain-of-Thought\ninvestigation. It features two high-quality, human-annotated datasets: an\ninstruction dataset covering traditional Norwegian cultures, idioms, slang, and\nspecial expressions, and a document-grounded multi-label dataset for topic\nclassification, question answering, and summarization. This paper also\nintroduces foundational Norwegian Generative Language Models (NorGLMs)\ndeveloped with diverse parameter scales and Transformer-based architectures.\nSystematic evaluations on the proposed benchmark suite provide insights into\nthe capabilities and scalability of NorGLMs across various downstream tasks.\n","authors":["Peng Liu","Lemei Zhang","Terje Nissen Farup","Even W. Lauvrak","Jon Espen Ingvaldsen","Simen Eide","Jon Atle Gulla","Zhirong Yang"],"pdf_url":"https://arxiv.org/pdf/2312.01314v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01308v1","updated":"2023-12-03T07:24:12Z","published":"2023-12-03T07:24:12Z","title":"Bridging Background Knowledge Gaps in Translation with Automatic\n Explicitation","summary":" Translations help people understand content written in another language.\nHowever, even correct literal translations do not fulfill that goal when people\nlack the necessary background to understand them. Professional translators\nincorporate explicitations to explain the missing context by considering\ncultural differences between source and target audiences. Despite its potential\nto help users, NLP research on explicitation is limited because of the dearth\nof adequate evaluation methods. This work introduces techniques for\nautomatically generating explicitations, motivated by WikiExpl: a dataset that\nwe collect from Wikipedia and annotate with human translators. The resulting\nexplicitations are useful as they help answer questions more accurately in a\nmultilingual question answering framework.\n","authors":["HyoJung Han","Jordan Lee Boyd-Graber","Marine Carpuat"],"pdf_url":"https://arxiv.org/pdf/2312.01308v1.pdf","comment":"EMNLP2023"},{"id":"http://arxiv.org/abs/2312.01306v1","updated":"2023-12-03T06:53:53Z","published":"2023-12-03T06:53:53Z","title":"On Significance of Subword tokenization for Low Resource and Efficient\n Named Entity Recognition: A case study in Marathi","summary":" Named Entity Recognition (NER) systems play a vital role in NLP applications\nsuch as machine translation, summarization, and question-answering. These\nsystems identify named entities, which encompass real-world concepts like\nlocations, persons, and organizations. Despite extensive research on NER\nsystems for the English language, they have not received adequate attention in\nthe context of low resource languages. In this work, we focus on NER for\nlow-resource language and present our case study in the context of the Indian\nlanguage Marathi. The advancement of NLP research revolves around the\nutilization of pre-trained transformer models such as BERT for the development\nof NER models. However, we focus on improving the performance of shallow models\nbased on CNN, and LSTM by combining the best of both worlds. In the era of\ntransformers, these traditional deep learning models are still relevant because\nof their high computational efficiency. 
We propose a hybrid approach for\nefficient NER by integrating a BERT-based subword tokenizer into vanilla\nCNN/LSTM models. We show that this simple approach of replacing a traditional\nword-based tokenizer with a BERT-tokenizer brings the accuracy of vanilla\nsingle-layer models closer to that of deep pre-trained models like BERT. We\nshow the importance of using sub-word tokenization for NER and present our\nstudy toward building efficient NLP systems. The evaluation is performed on\nL3Cube-MahaNER dataset using tokenizers from MahaBERT, MahaGPT, IndicBERT, and\nmBERT.\n","authors":["Harsh Chaudhari","Anuja Patil","Dhanashree Lavekar","Pranav Khairnar","Raviraj Joshi","Sachin Pande"],"pdf_url":"https://arxiv.org/pdf/2312.01306v1.pdf","comment":"Accepted at ICDAM 2023"},{"id":"http://arxiv.org/abs/2311.10777v3","updated":"2023-12-03T05:50:36Z","published":"2023-11-16T06:01:47Z","title":"A Systematic Review of Aspect-based Sentiment Analysis (ABSA): Domains,\n Methods, and Trends","summary":" Aspect-based Sentiment Analysis (ABSA) is a type of fine-grained sentiment\nanalysis (SA) that identifies aspects and the associated opinions from a given\ntext. In the digital era, ABSA gained increasing popularity and applications in\nmining opinionated text data to obtain insights and support decisions. ABSA\nresearch employs linguistic, statistical, and machine-learning approaches and\nutilises resources such as labelled datasets, aspect and sentiment lexicons and\nontology. By its nature, ABSA is domain-dependent and can be sensitive to the\nimpact of misalignment between the resource and application domains. However,\nto our knowledge, this topic has not been explored by the existing ABSA\nliterature reviews. In this paper, we present a Systematic Literature Review\n(SLR) of ABSA studies with a focus on the research application domain, dataset\ndomain, and the research methods to examine their relationships and identify\ntrends over time. Our results suggest a number of potential systemic issues in\nthe ABSA research literature, including the predominance of the\n``product/service review'' dataset domain among the majority of studies that\ndid not have a specific research application domain, coupled with the\nprevalence of dataset-reliant methods such as supervised machine learning. This\nreview makes a number of unique contributions to the ABSA research field: 1) To\nour knowledge, it is the first SLR that links the research domain, dataset\ndomain, and research method through a systematic perspective; 2) it is one of\nthe largest scoped SLR on ABSA, with 519 eligible studies filtered from 4191\nsearch results without time constraint; and 3) our review methodology adopted\nan innovative automatic filtering process based on PDF-mining, which enhanced\nscreening quality and reliability. Suggestions and our review limitations are\nalso discussed.\n","authors":["Yan Cathy Hua","Paul Denny","Katerina Taskova","Jörg Wicker"],"pdf_url":"https://arxiv.org/pdf/2311.10777v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01279v1","updated":"2023-12-03T04:35:04Z","published":"2023-12-03T04:35:04Z","title":"TextGenSHAP: Scalable Post-hoc Explanations in Text Generation with Long\n Documents","summary":" Large language models (LLMs) have attracted huge interest in practical\napplications given their increasingly accurate responses and coherent reasoning\nabilities. 
Given their nature as black-boxes using complex reasoning processes\non their inputs, it is inevitable that the demand for scalable and faithful\nexplanations for LLMs' generated content will continue to grow. There have been\nmajor developments in the explainability of neural network models over the past\ndecade. Among them, post-hoc explainability methods, especially Shapley values,\nhave proven effective for interpreting deep learning models. However, there are\nmajor challenges in scaling up Shapley values for LLMs, particularly when\ndealing with long input contexts containing thousands of tokens and\nautoregressively generated output sequences. Furthermore, it is often unclear\nhow to effectively utilize generated explanations to improve the performance of\nLLMs. In this paper, we introduce TextGenSHAP, an efficient post-hoc\nexplanation method incorporating LM-specific techniques. We demonstrate that\nthis leads to significant increases in speed compared to conventional Shapley\nvalue computations, reducing processing times from hours to minutes for\ntoken-level explanations, and to just seconds for document-level explanations.\nIn addition, we demonstrate how real-time Shapley values can be utilized in two\nimportant scenarios, providing better understanding of long-document question\nanswering by localizing important words and sentences; and improving existing\ndocument retrieval systems through enhancing the accuracy of selected passages\nand ultimately the final responses.\n","authors":["James Enouen","Hootan Nakhost","Sayna Ebrahimi","Sercan O Arik","Yan Liu","Tomas Pfister"],"pdf_url":"https://arxiv.org/pdf/2312.01279v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01276v1","updated":"2023-12-03T04:28:19Z","published":"2023-12-03T04:28:19Z","title":"Running cognitive evaluations on large language models: The do's and the\n don'ts","summary":" In this paper, I describe methodological considerations for studies that aim\nto evaluate the cognitive capacities of large language models (LLMs) using\nlanguage-based behavioral assessments. Drawing on three case studies from the\nliterature (a commonsense knowledge benchmark, a theory of mind evaluation, and\na test of syntactic agreement), I describe common pitfalls that might arise\nwhen applying a cognitive test to an LLM. I then list 10 do's and don'ts that\nshould help design high-quality cognitive evaluations for AI systems. I\nconclude by discussing four areas where the do's and don'ts are currently under\nactive discussion -- prompt sensitivity, cultural and linguistic diversity,\nusing LLMs as research assistants, and running evaluations on open vs. closed\nLLMs. Overall, the goal of the paper is to contribute to the broader discussion\nof best practices in the rapidly growing field of AI Psychology.\n","authors":["Anna A. Ivanova"],"pdf_url":"https://arxiv.org/pdf/2312.01276v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06744v4","updated":"2023-12-03T03:29:37Z","published":"2023-08-13T11:07:55Z","title":"Token-Scaled Logit Distillation for Ternary Weight Generative Language\n Models","summary":" Generative Language Models (GLMs) have shown impressive performance in tasks\nsuch as text generation, understanding, and reasoning. However, the large model\nsize poses challenges for practical deployment. To solve this problem,\nQuantization-Aware Training (QAT) has become increasingly popular. However,\ncurrent QAT methods for generative models have resulted in a noticeable loss of\naccuracy. 
To counteract this issue, we propose a novel knowledge distillation\nmethod specifically designed for GLMs. Our method, called token-scaled logit\ndistillation, prevents overfitting and provides superior learning from the\nteacher model and ground truth. This research marks the first evaluation of\nternary weight quantization-aware training of large-scale GLMs with less than\n1.0 degradation in perplexity and achieves enhanced accuracy in tasks like\ncommon-sense QA and arithmetic reasoning as well as natural language\nunderstanding. Our code is available at https://github.com/aiha-lab/TSLD.\n","authors":["Minsoo Kim","Sihwa Lee","Janghwan Lee","Sukjin Hong","Du-Seong Chang","Wonyong Sung","Jungwook Choi"],"pdf_url":"https://arxiv.org/pdf/2308.06744v4.pdf","comment":"NeurIPS 2023 Camera Ready"},{"id":"http://arxiv.org/abs/2311.00176v3","updated":"2023-12-03T00:56:36Z","published":"2023-10-31T22:35:58Z","title":"ChipNeMo: Domain-Adapted LLMs for Chip Design","summary":" ChipNeMo aims to explore the applications of large language models (LLMs) for\nindustrial chip design. Instead of directly deploying off-the-shelf commercial\nor open-source LLMs, we instead adopt the following domain adaptation\ntechniques: custom tokenizers, domain-adaptive continued pretraining,\nsupervised fine-tuning (SFT) with domain-specific instructions, and\ndomain-adapted retrieval models. We evaluate these methods on three selected\nLLM applications for chip design: an engineering assistant chatbot, EDA script\ngeneration, and bug summarization and analysis. Our results show that these\ndomain adaptation techniques enable significant LLM performance improvements\nover general-purpose base models across the three evaluated applications,\nenabling up to 5x model size reduction with similar or better performance on a\nrange of design tasks. Our findings also indicate that there's still room for\nimprovement between our current results and ideal outcomes. We believe that\nfurther investigation of domain-adapted LLM approaches will help close this gap\nin the future.\n","authors":["Mingjie Liu","Teodor-Dumitru Ene","Robert Kirby","Chris Cheng","Nathaniel Pinckney","Rongjian Liang","Jonah Alben","Himyanshu Anand","Sanmitra Banerjee","Ismet Bayraktaroglu","Bonita Bhaskaran","Bryan Catanzaro","Arjun Chaudhuri","Sharon Clay","Bill Dally","Laura Dang","Parikshit Deshpande","Siddhanth Dhodhi","Sameer Halepete","Eric Hill","Jiashang Hu","Sumit Jain","Brucek Khailany","George Kokai","Kishor Kunal","Xiaowei Li","Charley Lind","Hao Liu","Stuart Oberman","Sujeet Omar","Sreedhar Pratty","Jonathan Raiman","Ambar Sarkar","Zhengjiang Shao","Hanfei Sun","Pratik P Suthar","Varun Tej","Walker Turner","Kaizhe Xu","Haoxing Ren"],"pdf_url":"https://arxiv.org/pdf/2311.00176v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02219v1","updated":"2023-12-03T16:39:36Z","published":"2023-12-03T16:39:36Z","title":"Behind the Magic, MERLIM: Multi-modal Evaluation Benchmark for Large\n Image-Language Models","summary":" Large Vision and Language Models have enabled significant advances in fully\nsupervised and zero-shot vision tasks. These large pre-trained architectures\nserve as the baseline to what is currently known as Instruction Tuning Large\nVision and Language models (IT-LVLMs). IT-LVLMs are general-purpose multi-modal\nassistants whose responses are modulated by natural language instructions and\narbitrary visual data. 
Despite this versatility, IT-LVLM effectiveness in\nfundamental computer vision problems remains unclear, primarily due to the\nabsence of a standardized evaluation benchmark. This paper introduces a\nMulti-modal Evaluation Benchmark named MERLIM, a scalable test-bed to assess\nthe performance of IT-LVLMs on fundamental computer vision tasks. MERLIM\ncontains over 279K image-question pairs, and has a strong focus on detecting\ncross-modal \"hallucination\" events in IT-LVLMs, where the language output\nrefers to visual concepts that lack any effective grounding in the image. Our\nresults show that state-of-the-art IT-LVMLs are still limited at identifying\nfine-grained visual concepts, object hallucinations are common across tasks,\nand their results are strongly biased by small variations in the input query,\neven if the queries have the very same semantics. Our findings also suggest\nthat these models have weak visual groundings but they can still make adequate\nguesses by global visual patterns or textual biases contained in the LLM\ncomponent.\n","authors":["Andrés Villa","Juan Carlos León Alcázar","Alvaro Soto","Bernard Ghanem"],"pdf_url":"https://arxiv.org/pdf/2312.02219v1.pdf","comment":"11 pages, 7 figures, 3 tables"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2312.01541v1","updated":"2023-12-03T23:59:03Z","published":"2023-12-03T23:59:03Z","title":"Revisiting Non-separable Binary Classification and its Applications in\n Anomaly Detection","summary":" The inability to linearly classify XOR has motivated much of deep learning.\nWe revisit this age-old problem and show that linear classification of XOR is\nindeed possible. Instead of separating data between halfspaces, we propose a\nslightly different paradigm, equality separation, that adapts the SVM objective\nto distinguish data within or outside the margin. Our classifier can then be\nintegrated into neural network pipelines with a smooth approximation. From its\nproperties, we intuit that equality separation is suitable for anomaly\ndetection. To formalize this notion, we introduce closing numbers, a\nquantitative measure on the capacity for classifiers to form closed decision\nregions for anomaly detection. Springboarding from this theoretical connection\nbetween binary classification and anomaly detection, we test our hypothesis on\nsupervised anomaly detection experiments, showing that equality separation can\ndetect both seen and unseen anomalies.\n","authors":["Matthew Lau","Ismaila Seck","Athanasios P Meliopoulos","Wenke Lee","Eugene Ndiaye"],"pdf_url":"https://arxiv.org/pdf/2312.01541v1.pdf","comment":"Code: https://github.com/mattlaued/XOR-is-Linearly-Classifiable"},{"id":"http://arxiv.org/abs/2010.08158v2","updated":"2023-12-03T23:41:04Z","published":"2020-10-16T04:29:09Z","title":"An Accurate and Fully-Automated Ensemble Model for Weekly Time Series\n Forecasting","summary":" Many businesses and industries require accurate forecasts for weekly time\nseries nowadays. However, the forecasting literature does not currently provide\neasy-to-use, automatic, reproducible and accurate approaches dedicated to this\ntask. We propose a forecasting method in this domain to fill this gap,\nleveraging state-of-the-art forecasting techniques, such as forecast\ncombination, meta-learning, and global modelling. We consider different\nmeta-learning architectures, algorithms, and base model pools. 
Based on all\nconsidered model variants, we propose to use a stacking approach with lasso\nregression which optimally combines the forecasts of four base models: a global\nRecurrent Neural Network model (RNN), Theta, Trigonometric Box-Cox ARMA Trend\nSeasonal (TBATS) and Dynamic Harmonic Regression ARIMA (DHR-ARIMA), as it shows\nthe overall best performance across seven experimental weekly datasets on four\nevaluation metrics. Our proposed method also consistently outperforms a set of\nbenchmarks and state-of-the-art weekly forecasting models by a considerable\nmargin with statistical significance. Our method can produce the most accurate\nforecasts, in terms of mean sMAPE, for the M4 weekly dataset among all\nbenchmarks and all original competition participants.\n","authors":["Rakshitha Godahewa","Christoph Bergmeir","Geoffrey I. Webb","Pablo Montero-Manso"],"pdf_url":"https://arxiv.org/pdf/2010.08158v2.pdf","comment":"1 figure, 9 tables"},{"id":"http://arxiv.org/abs/2312.01538v1","updated":"2023-12-03T23:36:16Z","published":"2023-12-03T23:36:16Z","title":"Recurrent Distance-Encoding Neural Networks for Graph Representation\n Learning","summary":" Graph neural networks based on iterative one-hop message passing have been\nshown to struggle in harnessing information from distant nodes effectively.\nConversely, graph transformers allow each node to attend to all other nodes\ndirectly, but suffer from high computational complexity and have to rely on\nad-hoc positional encoding to bake in the graph inductive bias. In this paper,\nwe propose a new architecture to reconcile these challenges. Our approach stems\nfrom the recent breakthroughs in long-range modeling provided by deep\nstate-space models on sequential data: for a given target node, our model\naggregates other nodes by their shortest distances to the target and uses a\nparallelizable linear recurrent network over the chain of distances to provide\na natural encoding of its neighborhood structure. With no need for positional\nencoding, we empirically show that the performance of our model is highly\ncompetitive compared with that of state-of-the-art graph transformers on\nvarious benchmarks, at a drastically reduced computational complexity. In\naddition, we show that our model is theoretically more expressive than one-hop\nmessage passing neural networks.\n","authors":["Yuhui Ding","Antonio Orvieto","Bobby He","Thomas Hofmann"],"pdf_url":"https://arxiv.org/pdf/2312.01538v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01537v1","updated":"2023-12-03T23:30:48Z","published":"2023-12-03T23:30:48Z","title":"Unlocking the Potential of Federated Learning: The Symphony of Dataset\n Distillation via Deep Generative Latents","summary":" Data heterogeneity presents significant challenges for federated learning\n(FL). Recently, dataset distillation techniques have been introduced, and\nperformed at the client level, to attempt to mitigate some of these challenges.\nIn this paper, we propose a highly efficient FL dataset distillation framework\non the server side, significantly reducing both the computational and\ncommunication demands on local devices while enhancing the clients' privacy.\nUnlike previous strategies that perform dataset distillation on local devices\nand upload synthetic data to the server, our technique enables the server to\nleverage prior knowledge from pre-trained deep generative models to synthesize\nessential data representations from a heterogeneous model architecture. 
This\nprocess allows local devices to train smaller surrogate models while enabling\nthe training of a larger global model on the server, effectively minimizing\nresource utilization. We substantiate our claim with a theoretical analysis,\ndemonstrating the asymptotic resemblance of the process to the hypothetical\nideal of completely centralized training on a heterogeneous dataset. Empirical\nevidence from our comprehensive experiments indicates our method's superiority,\ndelivering an accuracy enhancement of up to 40% over non-dataset-distillation\ntechniques in highly heterogeneous FL contexts, and surpassing existing\ndataset-distillation methods by 18%. In addition to the high accuracy, our\nframework converges faster than the baselines because rather than the server\ntrains on several sets of heterogeneous data distributions, it trains on a\nmulti-modal distribution. Our code is available at\nhttps://github.com/FedDG23/FedDG-main.git\n","authors":["Yuqi Jia","Saeed Vahidian","Jingwei Sun","Jianyi Zhang","Vyacheslav Kungurtsev","Neil Zhenqiang Gong","Yiran Chen"],"pdf_url":"https://arxiv.org/pdf/2312.01537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01530v1","updated":"2023-12-03T23:08:29Z","published":"2023-12-03T23:08:29Z","title":"Evaluation of Active Feature Acquisition Methods for Time-varying\n Feature Settings","summary":" Machine learning methods often assume input features are available at no\ncost. However, in domains like healthcare, where acquiring features could be\nexpensive or harmful, it is necessary to balance a feature's acquisition cost\nagainst its predictive value. The task of training an AI agent to decide which\nfeatures to acquire is called active feature acquisition (AFA). By deploying an\nAFA agent, we effectively alter the acquisition strategy and trigger a\ndistribution shift. To safely deploy AFA agents under this distribution shift,\nwe present the problem of active feature acquisition performance evaluation\n(AFAPE). We examine AFAPE under i) a no direct effect (NDE) assumption, stating\nthat acquisitions don't affect the underlying feature values; and ii) a no\nunobserved confounding (NUC) assumption, stating that retrospective feature\nacquisition decisions were only based on observed features. We show that one\ncan apply offline reinforcement learning under the NUC assumption and missing\ndata methods under the NDE assumption. When NUC and NDE hold, we propose a\nnovel semi-offline reinforcement learning framework, which requires a weaker\npositivity assumption and yields more data-efficient estimators. We introduce\nthree novel estimators: a direct method (DM), an inverse probability weighting\n(IPW), and a double reinforcement learning (DRL) estimator.\n","authors":["Henrik von Kleist","Alireza Zamanian","Ilya Shpitser","Narges Ahmidi"],"pdf_url":"https://arxiv.org/pdf/2312.01530v1.pdf","comment":"43 pages, 3 tables, 8 Figures"},{"id":"http://arxiv.org/abs/2312.01529v1","updated":"2023-12-03T23:03:22Z","published":"2023-12-03T23:03:22Z","title":"T3D: Towards 3D Medical Image Understanding through Vision-Language\n Pre-training","summary":" Expert annotation of 3D medical image for downstream analysis is\nresource-intensive, posing challenges in clinical applications. Visual\nself-supervised learning (vSSL), though effective for learning visual\ninvariance, neglects the incorporation of domain knowledge from medicine. 
To\nincorporate medical knowledge into visual representation learning,\nvision-language pre-training (VLP) has shown promising results in 2D image.\nHowever, existing VLP approaches become generally impractical when applied to\nhigh-resolution 3D medical images due to GPU hardware constraints and the\npotential loss of critical details caused by downsampling, which is the\nintuitive solution to hardware constraints. To address the above limitations,\nwe introduce T3D, the first VLP framework designed for high-resolution 3D\nmedical images. T3D incorporates two text-informed pretext tasks:\n(\\lowerromannumeral{1}) text-informed contrastive learning;\n(\\lowerromannumeral{2}) text-informed image restoration. These tasks focus on\nlearning 3D visual representations from high-resolution 3D medical images and\nintegrating clinical knowledge from radiology reports, without distorting\ninformation through forced alignment of downsampled volumes with detailed\nanatomical text. Trained on a newly curated large-scale dataset of 3D medical\nimages and radiology reports, T3D significantly outperforms current vSSL\nmethods in tasks like organ and tumor segmentation, as well as disease\nclassification. This underlines T3D's potential in representation learning for\n3D medical image analysis. All data and code will be available upon acceptance.\n","authors":["Che Liu","Cheng Ouyang","Yinda Chen","Cesar César Quilodrán-Casas","Lei Ma","Jie Fu","Yike Guo","Anand Shah","Wenjia Bai","Rossella Arcucci"],"pdf_url":"https://arxiv.org/pdf/2312.01529v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13289v3","updated":"2023-12-03T23:00:11Z","published":"2023-05-22T17:50:18Z","title":"Achieving the Minimax Optimal Sample Complexity of Offline Reinforcement\n Learning: A DRO-Based Approach","summary":" Offline reinforcement learning aims to learn from pre-collected datasets\nwithout active exploration. This problem faces significant challenges,\nincluding limited data availability and distributional shifts. Existing\napproaches adopt a pessimistic stance towards uncertainty by penalizing rewards\nof under-explored state-action pairs to estimate value functions\nconservatively. In this paper, we show that the distributionally robust\noptimization (DRO) based approach can also address these challenges and is\nminimax optimal. Specifically, we directly model the uncertainty in the\ntransition kernel and construct an uncertainty set of statistically plausible\ntransition kernels. We then find the policy that optimizes the worst-case\nperformance over this uncertainty set. We first design a metric-based\nHoeffding-style uncertainty set such that with high probability the true\ntransition kernel is in this set. We prove that to achieve a sub-optimality gap\nof $\\epsilon$, the sample complexity is\n$\\mathcal{O}(S^2C^{\\pi^*}\\epsilon^{-2}(1-\\gamma)^{-4})$, where $\\gamma$ is the\ndiscount factor, $S$ is the number of states, and $C^{\\pi^*}$ is the\nsingle-policy clipped concentrability coefficient which quantifies the\ndistribution shift. To achieve the optimal sample complexity, we further\npropose a less conservative Bernstein-style uncertainty set, which, however,\ndoes not necessarily include the true transition kernel. 
We show that an\nimproved sample complexity of\n$\\mathcal{O}(SC^{\\pi^*}\\epsilon^{-2}(1-\\gamma)^{-3})$ can be obtained, which\nmatches with the minimax lower bound for offline reinforcement learning, and\nthus is minimax optimal.\n","authors":["Yue Wang","Jinjun Xiong","Shaofeng Zou"],"pdf_url":"https://arxiv.org/pdf/2305.13289v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.10601v2","updated":"2023-12-03T22:50:35Z","published":"2023-05-17T23:16:17Z","title":"Tree of Thoughts: Deliberate Problem Solving with Large Language Models","summary":" Language models are increasingly being deployed for general problem solving\nacross a wide range of tasks, but are still confined to token-level,\nleft-to-right decision-making processes during inference. This means they can\nfall short in tasks that require exploration, strategic lookahead, or where\ninitial decisions play a pivotal role. To surmount these challenges, we\nintroduce a new framework for language model inference, Tree of Thoughts (ToT),\nwhich generalizes over the popular Chain of Thought approach to prompting\nlanguage models, and enables exploration over coherent units of text (thoughts)\nthat serve as intermediate steps toward problem solving. ToT allows LMs to\nperform deliberate decision making by considering multiple different reasoning\npaths and self-evaluating choices to decide the next course of action, as well\nas looking ahead or backtracking when necessary to make global choices. Our\nexperiments show that ToT significantly enhances language models'\nproblem-solving abilities on three novel tasks requiring non-trivial planning\nor search: Game of 24, Creative Writing, and Mini Crosswords. For instance, in\nGame of 24, while GPT-4 with chain-of-thought prompting only solved 4% of\ntasks, our method achieved a success rate of 74%. Code repo with all prompts:\nhttps://github.com/princeton-nlp/tree-of-thought-llm.\n","authors":["Shunyu Yao","Dian Yu","Jeffrey Zhao","Izhak Shafran","Thomas L. Griffiths","Yuan Cao","Karthik Narasimhan"],"pdf_url":"https://arxiv.org/pdf/2305.10601v2.pdf","comment":"NeurIPS 2023 camera ready version. Code repo with all prompts:\n https://github.com/princeton-nlp/tree-of-thought-llm"},{"id":"http://arxiv.org/abs/2312.01523v1","updated":"2023-12-03T22:44:58Z","published":"2023-12-03T22:44:58Z","title":"SymNoise: Advancing Language Model Fine-tuning with Symmetric Noise","summary":" In this paper, we introduce a novel fine-tuning technique for language\nmodels, which involves incorporating symmetric noise into the embedding\nprocess. This method aims to enhance the model's function by more stringently\nregulating its local curvature, demonstrating superior performance over the\ncurrent method, NEFTune. When fine-tuning the LLaMA-2-7B model using Alpaca,\nstandard techniques yield a 29.79% score on AlpacaEval. However, our approach,\nSymNoise, increases this score significantly to 69.04%, using symmetric noisy\nembeddings. This is a 6.7% improvement over the state-of-the-art method,\nNEFTune~(64.69%). Furthermore, when tested on various models and stronger\nbaseline instruction datasets, such as Evol-Instruct, ShareGPT, OpenPlatypus,\nSymNoise consistently outperforms NEFTune. 
The current literature, including\nNEFTune, has underscored the importance of more in-depth research into the\napplication of noise-based strategies in the fine-tuning of language models.\nOur approach, SymNoise, is another significant step towards this direction,\nshowing notable improvement over the existing state-of-the-art method.\n","authors":["Arjun Singh","Abhay Kumar Yadav"],"pdf_url":"https://arxiv.org/pdf/2312.01523v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.03320v3","updated":"2023-12-03T22:44:05Z","published":"2022-09-07T17:27:08Z","title":"What does a platypus look like? Generating customized prompts for\n zero-shot image classification","summary":" Open-vocabulary models are a promising new paradigm for image classification.\nUnlike traditional classification models, open-vocabulary models classify among\nany arbitrary set of categories specified with natural language during\ninference. This natural language, called \"prompts\", typically consists of a set\nof hand-written templates (e.g., \"a photo of a {}\") which are completed with\neach of the category names. This work introduces a simple method to generate\nhigher accuracy prompts, without relying on any explicit knowledge of the task\ndomain and with far fewer hand-constructed sentences. To achieve this, we\ncombine open-vocabulary models with large language models (LLMs) to create\nCustomized Prompts via Language models (CuPL, pronounced \"couple\"). In\nparticular, we leverage the knowledge contained in LLMs in order to generate\nmany descriptive sentences that contain important discriminating\ncharacteristics of the image categories. This allows the model to place a\ngreater importance on these regions in the image when making predictions. We\nfind that this straightforward and general approach improves accuracy on a\nrange of zero-shot image classification benchmarks, including over one\npercentage point gain on ImageNet. Finally, this simple baseline requires no\nadditional training and remains completely zero-shot. Code available at\nhttps://github.com/sarahpratt/CuPL.\n","authors":["Sarah Pratt","Ian Covert","Rosanne Liu","Ali Farhadi"],"pdf_url":"https://arxiv.org/pdf/2209.03320v3.pdf","comment":"ICCV 2023"},{"id":"http://arxiv.org/abs/2312.01522v1","updated":"2023-12-03T22:44:04Z","published":"2023-12-03T22:44:04Z","title":"G2D: From Global to Dense Radiography Representation Learning via\n Vision-Language Pre-training","summary":" Recently, medical vision-language pre-training (VLP) has reached substantial\nprogress to learn global visual representation from medical images and their\npaired radiology reports. However, medical imaging tasks in real world usually\nrequire finer granularity in visual features. These tasks include visual\nlocalization tasks (e.g., semantic segmentation, object detection) and visual\ngrounding task. Yet, current medical VLP methods face challenges in learning\nthese fine-grained features, as they primarily focus on brute-force alignment\nbetween image patches and individual text tokens for local visual feature\nlearning, which is suboptimal for downstream dense prediction tasks. In this\nwork, we propose a new VLP framework, named \\textbf{G}lobal to \\textbf{D}ense\nlevel representation learning (G2D) that achieves significantly improved\ngranularity and more accurate grounding for the learned features, compared to\nexisting medical VLP approaches. 
In particular, G2D learns dense and\nsemantically-grounded image representations via a pseudo segmentation task\nparallel with the global vision-language alignment. Notably, generating pseudo\nsegmentation targets does not incur extra trainable parameters: they are\nobtained on the fly during VLP with a parameter-free processor. G2D achieves\nsuperior performance across 6 medical imaging tasks and 25 diseases,\nparticularly in semantic segmentation, which necessitates fine-grained,\nsemantically-grounded image features. In this task, G2D surpasses peer models\neven when fine-tuned with just 1\\% of the training data, compared to the 100\\%\nused by these models. The code will be released upon acceptance.\n","authors":["Che Liu","Cheng Ouyang","Sibo Cheng","Anand Shah","Wenjia Bai","Rossella Arcucci"],"pdf_url":"https://arxiv.org/pdf/2312.01522v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.06470v3","updated":"2023-12-03T22:29:11Z","published":"2023-03-11T17:44:28Z","title":"Prefix-Tree Decoding for Predicting Mass Spectra from Molecules","summary":" Computational predictions of mass spectra from molecules have enabled the\ndiscovery of clinically relevant metabolites. However, such predictive tools\nare still limited as they occupy one of two extremes, either operating (a) by\nfragmenting molecules combinatorially with overly rigid constraints on\npotential rearrangements and poor time complexity or (b) by decoding lossy and\nnonphysical discretized spectra vectors. In this work, we use a new\nintermediate strategy for predicting mass spectra from molecules by treating\nmass spectra as sets of molecular formulae, which are themselves multisets of\natoms. After first encoding an input molecular graph, we decode a set of\nmolecular subformulae, each of which specify a predicted peak in the mass\nspectrum, the intensities of which are predicted by a second model. Our key\ninsight is to overcome the combinatorial possibilities for molecular\nsubformulae by decoding the formula set using a prefix tree structure,\natom-type by atom-type, representing a general method for ordered multiset\ndecoding. We show promising empirical results on mass spectra prediction tasks.\n","authors":["Samuel Goldman","John Bradshaw","Jiayi Xin","Connor W. Coley"],"pdf_url":"https://arxiv.org/pdf/2303.06470v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01507v1","updated":"2023-12-03T21:05:50Z","published":"2023-12-03T21:05:50Z","title":"Learn2Extend: Extending sequences by retaining their statistical\n properties with mixture models","summary":" This paper addresses the challenge of extending general finite sequences of\nreal numbers within a subinterval of the real line, maintaining their inherent\nstatistical properties by employing machine learning. Our focus lies on\npreserving the gap distribution and pair correlation function of these point\nsets. Leveraging advancements in deep learning applied to point processes, this\npaper explores the use of an auto-regressive \\textit{Sequence Extension Mixture\nModel} (SEMM) for extending finite sequences, by estimating directly the\nconditional density, instead of the intensity function. We perform comparative\nexperiments on multiple types of point processes, including Poisson, locally\nattractive, and locally repelling sequences, and we perform a case study on the\nprediction of Riemann $\\zeta$ function zeroes. 
The results indicate that the\nproposed mixture model outperforms traditional neural network architectures in\nsequence extension with the retention of statistical properties. Given this\nmotivation, we showcase the capabilities of a mixture model to extend\nsequences, maintaining specific statistical properties, i.e. the gap\ndistribution, and pair correlation indicators.\n","authors":["Dimitris Vartziotis","George Dasoulas","Florian Pausinger"],"pdf_url":"https://arxiv.org/pdf/2312.01507v1.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2312.01504v1","updated":"2023-12-03T20:42:38Z","published":"2023-12-03T20:42:38Z","title":"Effectively Fine-tune to Improve Large Multimodal Models for Radiology\n Report Generation","summary":" Writing radiology reports from medical images requires a high level of domain\nexpertise. It is time-consuming even for trained radiologists and can be\nerror-prone for inexperienced radiologists. It would be appealing to automate\nthis task by leveraging generative AI, which has shown drastic progress in\nvision and language understanding. In particular, Large Language Models (LLM)\nhave demonstrated impressive capabilities recently and continued to set new\nstate-of-the-art performance on almost all natural language tasks. While many\nhave proposed architectures to combine vision models with LLMs for multimodal\ntasks, few have explored practical fine-tuning strategies. In this work, we\nproposed a simple yet effective two-stage fine-tuning protocol to align visual\nfeatures to LLM's text embedding space as soft visual prompts. Our framework\nwith OpenLLaMA-7B achieved state-of-the-art level performance without\ndomain-specific pretraining. Moreover, we provide detailed analyses of soft\nvisual prompts and attention mechanisms, shedding light on future research\ndirections.\n","authors":["Yuzhe Lu","Sungmin Hong","Yash Shah","Panpan Xu"],"pdf_url":"https://arxiv.org/pdf/2312.01504v1.pdf","comment":"Accepted to Deep Generative Models for Health Workshop at NeurIPS\n 2023"},{"id":"http://arxiv.org/abs/2307.01708v2","updated":"2023-12-03T20:39:10Z","published":"2023-07-04T13:23:21Z","title":"Distributional Model Equivalence for Risk-Sensitive Reinforcement\n Learning","summary":" We consider the problem of learning models for risk-sensitive reinforcement\nlearning. We theoretically demonstrate that proper value equivalence, a method\nof learning models which can be used to plan optimally in the risk-neutral\nsetting, is not sufficient to plan optimally in the risk-sensitive setting. We\nleverage distributional reinforcement learning to introduce two new notions of\nmodel equivalence, one which is general and can be used to plan for any risk\nmeasure, but is intractable; and a practical variation which allows one to\nchoose which risk measures they may plan optimally for. We demonstrate how our\nframework can be used to augment any model-free risk-sensitive algorithm, and\nprovide both tabular and large-scale experiments to demonstrate its ability.\n","authors":["Tyler Kastner","Murat A. Erdogdu","Amir-massoud Farahmand"],"pdf_url":"https://arxiv.org/pdf/2307.01708v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01502v1","updated":"2023-12-03T20:21:08Z","published":"2023-12-03T20:21:08Z","title":"Normed Spaces for Graph Embedding","summary":" Theoretical results from discrete geometry suggest that normed spaces can\nabstractly embed finite metric spaces with surprisingly low theoretical bounds\non distortion in low dimensions. 
In this paper, inspired by this theoretical\ninsight, we highlight normed spaces as a more flexible and computationally\nefficient alternative to several popular Riemannian manifolds for learning\ngraph embeddings. Normed space embeddings significantly outperform several\npopular manifolds on a large range of synthetic and real-world graph\nreconstruction benchmark datasets while requiring significantly fewer\ncomputational resources. We also empirically verify the superiority of normed\nspace embeddings on growing families of graphs associated with negative, zero,\nand positive curvature, further reinforcing the flexibility of normed spaces in\ncapturing diverse graph structures as graph sizes increase. Lastly, we\ndemonstrate the utility of normed space embeddings on two applied graph\nembedding tasks, namely, link prediction and recommender systems. Our work\nhighlights the potential of normed spaces for geometric graph representation\nlearning, raises new research questions, and offers a valuable tool for\nexperimental mathematics in the field of finite metric space embeddings. We\nmake our code and data publicly available.\n","authors":["Diaaeldin Taha","Wei Zhao","J. Maxwell Riestenberg","Michael Strube"],"pdf_url":"https://arxiv.org/pdf/2312.01502v1.pdf","comment":"23 pages, 7 figures, 9 tables | The first two authors contributed\n equally"}],"Multimedia":[{"id":"http://arxiv.org/abs/2310.01596v3","updated":"2023-12-03T20:25:18Z","published":"2023-10-02T19:41:42Z","title":"ImagenHub: Standardizing the evaluation of conditional image generation\n models","summary":" Recently, a myriad of conditional image generation and editing models have\nbeen developed to serve different downstream tasks, including text-to-image\ngeneration, text-guided image editing, subject-driven image generation,\ncontrol-guided image generation, etc. However, we observe huge inconsistencies\nin experimental conditions (datasets, inference, and evaluation metrics) that\nrender fair comparisons difficult. This paper proposes ImagenHub, which is a\none-stop library to standardize the inference and evaluation of all the\nconditional image generation models. Firstly, we define seven prominent tasks\nand curate high-quality evaluation datasets for them. Secondly, we build a\nunified inference pipeline to ensure fair comparison. Thirdly, we design two\nhuman evaluation scores, i.e. Semantic Consistency and Perceptual Quality,\nalong with comprehensive guidelines to evaluate generated images. We train\nexpert raters to evaluate the model outputs based on the proposed metrics. Our\nhuman evaluation achieves high inter-worker agreement, with a Krippendorff's\nalpha above 0.4 on 76% of the models. We comprehensively evaluated a\ntotal of around 30 models and observed three key takeaways: (1) the existing\nmodels' performance is generally unsatisfying except for Text-guided Image\nGeneration and Subject-driven Image Generation, with 74% of models achieving an\noverall score lower than 0.5. (2) we examined the claims from published papers\nand found 83% of them hold with a few exceptions. (3) None of the existing\nautomatic metrics has a Spearman's correlation higher than 0.2 except on\nsubject-driven image generation. 
Moving forward, we will continue our efforts\nto evaluate newly published models and update our leaderboard to keep track of\nthe progress in conditional image generation.\n","authors":["Max Ku","Tianle Li","Kai Zhang","Yujie Lu","Xingyu Fu","Wenwen Zhuang","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2310.01596v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.13397v3","updated":"2023-12-03T20:12:46Z","published":"2023-03-23T16:15:18Z","title":"DiffMesh: A Motion-aware Diffusion-like Framework for Human Mesh\n Recovery from Videos","summary":" Human mesh recovery (HMR) provides rich human body information for various\nreal-world applications. While image-based HMR methods have achieved impressive\nresults, they often struggle to recover humans in dynamic scenarios, leading to\ntemporal inconsistencies and non-smooth 3D motion predictions due to the\nabsence of human motion. In contrast, video-based approaches leverage temporal\ninformation to mitigate this issue. In this paper, we present DiffMesh, an\ninnovative motion-aware Diffusion-like framework for video-based HMR. DiffMesh\nestablishes a bridge between diffusion models and human motion, efficiently\ngenerating accurate and smooth output mesh sequences by incorporating human\nmotion within the forward process and reverse process in the diffusion model.\nExtensive experiments are conducted on the widely used datasets (Human3.6M\n\\cite{h36m_pami} and 3DPW \\cite{pw3d2018}), which demonstrate the effectiveness\nand efficiency of our DiffMesh. Visual comparisons in real-world scenarios\nfurther highlight DiffMesh's suitability for practical applications.\n","authors":["Ce Zheng","Xianpeng Liu","Mengyuan Liu","Tianfu Wu","Guo-Jun Qi","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2303.13397v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.11300v3","updated":"2023-12-03T16:33:58Z","published":"2023-06-20T05:30:59Z","title":"RS5M: A Large Scale Vision-Language Dataset for Remote Sensing\n Vision-Language Foundation Model","summary":" Pre-trained Vision-Language Models (VLMs) utilizing extensive image-text\npaired data have demonstrated unprecedented image-text association\ncapabilities, achieving remarkable results across various downstream tasks. A\ncritical challenge is how to make use of existing large-scale pre-trained VLMs,\nwhich are trained on common objects, to perform the domain-specific transfer\nfor accomplishing domain-related downstream tasks. In this paper, we propose a new framework that\nincludes the Domain pre-trained Vision-Language Model (DVLM), bridging the gap\nbetween the General Vision-Language Model (GVLM) and domain-specific downstream\ntasks. Moreover, we present an image-text paired dataset in the field of remote\nsensing (RS), RS5M, which has 5 million RS images with English descriptions.\nThe dataset is obtained from filtering publicly available image-text paired\ndatasets and captioning label-only RS datasets with pre-trained VLM. These\nconstitute the first large-scale RS image-text paired dataset. Additionally, we\nfine-tuned the CLIP model and tried several Parameter-Efficient Fine-Tuning\nmethods on RS5M to implement the DVLM. 
Experimental results show that our\nproposed dataset is highly effective for various tasks, and our model GeoRSCLIP\nimproves upon the baseline or previous state-of-the-art model by $3\\%\\sim20\\%$\nin Zero-shot Classification (ZSC), $3\\%\\sim6\\%$ in Remote Sensing Cross-Modal\nText-Image Retrieval (RSCTIR) and $4\\%\\sim5\\%$ in Semantic Localization (SeLo)\ntasks. Dataset and models have been released in:\n\\url{https://github.com/om-ai-lab/RS5M}.\n","authors":["Zilun Zhang","Tiancheng Zhao","Yulong Guo","Jianwei Yin"],"pdf_url":"https://arxiv.org/pdf/2306.11300v3.pdf","comment":"RS5M dataset v5"},{"id":"http://arxiv.org/abs/2312.01295v1","updated":"2023-12-03T06:02:27Z","published":"2023-12-03T06:02:27Z","title":"Two-stage dynamic creative optimization under sparse ambiguous samples\n for e-commerce advertising","summary":" Ad creative is one of the main mediums for e-commerce advertising. In our\napproach, we decouple dynamic creative optimization into two stages, a\ncascaded structure that can trade off between effectiveness and efficiency. In\nthe first stage, we train an automatic creative optimization architecture based\non autoco to simulate complex interactions between creative elements. Although\nwe obtain the ranking of different creatives under a sku, bucketing\nand merging historical data according to periods obscures the ctr\ndiversity of the same ad creatives on different days and weakens the ability to\nseparate ambiguous samples. Therefore, we propose a transformer-based rerank\nmodel. With the help of the rank model, we propose a distillation method to\nlearn the relative order of ideas and extract the ranking knowledge to guide\nthe rerank learning. The creative order soft labels under each sku are\ngenerated by the rank model to alleviate the dilemma that a large number of\nunder-represented creatives cannot obtain real labels. Through the knowledge\ndiffusion of rerank, the ambiguous samples are associated with the positive and\nnegative samples. We then cascade the rerank model and autoco to output the estimated value of\nthe synthetic ad image. In the second stage, we design a bandit model, and\nthe bandit selects one of the output ads of the first stage for timely\ndelivery. Experimental results show that our method can outperform competing\nbaselines in terms of sctr. Online A/B testing shows that our method improves\nctr by 10% compared to the baseline.\n","authors":["Guandong Li","Xian Yang"],"pdf_url":"https://arxiv.org/pdf/2312.01295v1.pdf","comment":null}]},"2023-12-02T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2312.01244v1","updated":"2023-12-02T23:05:24Z","published":"2023-12-02T23:05:24Z","title":"Challenges and Applications of Automated Extraction of Socio-political\n Events from Text (CASE 2023): Workshop and Shared Task Report","summary":" We provide a summary of the sixth edition of the CASE workshop that is held\nin the scope of RANLP 2023. The workshop consists of regular papers, three\nkeynotes, working papers of shared task participants, and shared task overview\npapers. This workshop series has been bringing together all aspects of event\ninformation collection across technical and social science fields. 
In addition\nto contributing to the progress in text based event extraction, the workshop\nprovides a space for the organization of a multimodal event information\ncollection task.\n","authors":["Ali Hürriyetoğlu","Hristo Tanev","Osman Mutlu","Surendrabikram Thapa","Fiona Anting Tan","Erdem Yörük"],"pdf_url":"https://arxiv.org/pdf/2312.01244v1.pdf","comment":"https://aclanthology.org/2023.case-1.22"},{"id":"http://arxiv.org/abs/2305.03047v2","updated":"2023-12-02T21:36:10Z","published":"2023-05-04T17:59:28Z","title":"Principle-Driven Self-Alignment of Language Models from Scratch with\n Minimal Human Supervision","summary":" Recent AI-assistant agents, such as ChatGPT, predominantly rely on supervised\nfine-tuning (SFT) with human annotations and reinforcement learning from human\nfeedback (RLHF) to align the output of large language models (LLMs) with human\nintentions, ensuring they are helpful, ethical, and reliable. However, this\ndependence can significantly constrain the true potential of AI-assistant\nagents due to the high cost of obtaining human supervision and the related\nissues on quality, reliability, diversity, self-consistency, and undesirable\nbiases. To address these challenges, we propose a novel approach called\nSELF-ALIGN, which combines principle-driven reasoning and the generative power\nof LLMs for the self-alignment of AI agents with minimal human supervision. Our\napproach encompasses four stages: first, we use an LLM to generate synthetic\nprompts, and a topic-guided method to augment the prompt diversity; second, we\nuse a small set of human-written principles for AI models to follow, and guide\nthe LLM through in-context learning from demonstrations (of principles\napplication) to produce helpful, ethical, and reliable responses to user's\nqueries; third, we fine-tune the original LLM with the high-quality\nself-aligned responses so that the resulting model can generate desirable\nresponses for each query directly without the principle set and the\ndemonstrations anymore; and finally, we offer a refinement step to address the\nissues of overly-brief or indirect responses. Applying SELF-ALIGN to the\nLLaMA-65b base language model, we develop an AI assistant named Dromedary. With\nfewer than 300 lines of human annotations (including < 200 seed prompts, 16\ngeneric principles, and 5 exemplars for in-context learning). Dromedary\nsignificantly surpasses the performance of several state-of-the-art AI systems,\nincluding Text-Davinci-003 and Alpaca, on benchmark datasets with various\nsettings.\n","authors":["Zhiqing Sun","Yikang Shen","Qinhong Zhou","Hongxin Zhang","Zhenfang Chen","David Cox","Yiming Yang","Chuang Gan"],"pdf_url":"https://arxiv.org/pdf/2305.03047v2.pdf","comment":"Accepted at NeurIPS 2023 (Spotlight). Project page:\n https://github.com/IBM/Dromedary"},{"id":"http://arxiv.org/abs/2311.15565v2","updated":"2023-12-02T21:06:50Z","published":"2023-11-27T06:26:53Z","title":"Evaluating the Efficacy of Hybrid Deep Learning Models in Distinguishing\n AI-Generated Text","summary":" My research investigates the use of cutting-edge hybrid deep learning models\nto accurately differentiate between AI-generated text and human writing. I\napplied a robust methodology, utilising a carefully selected dataset comprising\nAI and human texts from various sources, each tagged with instructions.\nAdvanced natural language processing techniques facilitated the analysis of\ntextual features. 
Combining sophisticated neural networks, the custom model\nenabled it to detect nuanced differences between AI and human content.\n","authors":["Finbarrs Oketunji"],"pdf_url":"https://arxiv.org/pdf/2311.15565v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.01196v4","updated":"2023-12-02T21:05:22Z","published":"2023-04-03T17:59:09Z","title":"Baize: An Open-Source Chat Model with Parameter-Efficient Tuning on\n Self-Chat Data","summary":" Chat models, such as ChatGPT, have shown impressive capabilities and have\nbeen rapidly adopted across numerous domains. However, these models are only\naccessible through a restricted API, creating barriers for new research and\nprogress in the field. We propose a pipeline that can automatically generate a\nhigh-quality multi-turn chat corpus by leveraging ChatGPT to engage in a\nconversation with itself. Subsequently, we employ parameter-efficient tuning to\nenhance LLaMA, an open-source large language model. The resulting model, named\nBaize, demonstrates good performance in multi-turn dialogues with guardrails\nthat minimize potential risks. Furthermore, we propose a new technique called\nSelf-Distill with Feedback, to further improve the performance of the Baize\nmodels with feedback from ChatGPT. The Baize models and data are released for\nresearch purposes only at https://github.com/project-baize/baize-chatbot. An\nonline demo is also available at\nhttps://huggingface.co/spaces/project-baize/chat-with-baize.\n","authors":["Canwen Xu","Daya Guo","Nan Duan","Julian McAuley"],"pdf_url":"https://arxiv.org/pdf/2304.01196v4.pdf","comment":"Baize v2; EMNLP 2023"},{"id":"http://arxiv.org/abs/2312.01225v1","updated":"2023-12-02T21:03:23Z","published":"2023-12-02T21:03:23Z","title":"UCE-FID: Using Large Unlabeled, Medium Crowdsourced-Labeled, and Small\n Expert-Labeled Tweets for Foodborne Illness Detection","summary":" Foodborne illnesses significantly impact public health. Deep learning\nsurveillance applications using social media data aim to detect early warning\nsignals. However, labeling foodborne illness-related tweets for model training\nrequires extensive human resources, making it challenging to collect a\nsufficient number of high-quality labels for tweets within a limited budget.\nThe severe class imbalance resulting from the scarcity of foodborne\nillness-related tweets among the vast volume of social media further\nexacerbates the problem. Classifiers trained on a class-imbalanced dataset are\nbiased towards the majority class, making accurate detection difficult. To\novercome these challenges, we propose EGAL, a deep learning framework for\nfoodborne illness detection that uses small expert-labeled tweets augmented by\ncrowdsourced-labeled and massive unlabeled data. Specifically, by leveraging\ntweets labeled by experts as a reward set, EGAL learns to assign a weight of\nzero to incorrectly labeled tweets to mitigate their negative influence. Other\ntweets receive proportionate weights to counter-balance the unbalanced class\ndistribution. Extensive experiments on real-world \\textit{TWEET-FID} data show\nthat EGAL outperforms strong baseline models across different settings,\nincluding varying expert-labeled set sizes and class imbalance ratios. A case\nstudy on a multistate outbreak of Salmonella Typhimurium infection linked to\npackaged salad greens demonstrates how the trained model captures relevant\ntweets offering valuable outbreak insights. EGAL, funded by the U.S. 
Department\nof Agriculture (USDA), has the potential to be deployed for real-time analysis\nof tweet streaming, contributing to foodborne illness outbreak surveillance\nefforts.\n","authors":["Ruofan Hu","Dongyu Zhang","Dandan Tao","Huayi Zhang","Hao Feng","Elke Rundensteiner"],"pdf_url":"https://arxiv.org/pdf/2312.01225v1.pdf","comment":"2023 IEEE International Conference on Big Data (BigData)"},{"id":"http://arxiv.org/abs/2311.14836v2","updated":"2023-12-02T20:42:10Z","published":"2023-11-24T20:16:29Z","title":"Custom Data Augmentation for low resource ASR using Bark and\n Retrieval-Based Voice Conversion","summary":" This paper proposes two innovative methodologies to construct customized\nCommon Voice datasets for low-resource languages like Hindi. The first\nmethodology leverages Bark, a transformer-based text-to-audio model developed\nby Suno, and incorporates Meta's enCodec and a pre-trained HuBert model to\nenhance Bark's performance. The second methodology employs Retrieval-Based\nVoice Conversion (RVC) and uses the Ozen toolkit for data preparation. Both\nmethodologies contribute to the advancement of ASR technology and offer\nvaluable insights into addressing the challenges of constructing customized\nCommon Voice datasets for under-resourced languages. Furthermore, they provide\na pathway to achieving high-quality, personalized voice generation for a range\nof applications.\n","authors":["Anand Kamble","Aniket Tathe","Suyash Kumbharkar","Atharva Bhandare","Anirban C. Mitra"],"pdf_url":"https://arxiv.org/pdf/2311.14836v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01221v1","updated":"2023-12-02T20:19:11Z","published":"2023-12-02T20:19:11Z","title":"Enabling Quantum Natural Language Processing for Hindi Language","summary":" Quantum Natural Language Processing (QNLP) is taking huge leaps in solving\nthe shortcomings of classical Natural Language Processing (NLP) techniques and\nmoving towards a more \"Explainable\" NLP system. The current literature around\nQNLP focuses primarily on implementing QNLP techniques in sentences in the\nEnglish language. In this paper, we propose to enable the QNLP approach to\nHINDI, which is the third most spoken language in South Asia. We present the\nprocess of building the parameterized quantum circuits required to undertake\nQNLP on Hindi sentences. We use the pregroup representation of Hindi and the\nDisCoCat framework to draw sentence diagrams. Later, we translate these\ndiagrams to Parameterised Quantum Circuits based on Instantaneous Quantum\nPolynomial (IQP) style ansatz. Using these parameterized quantum circuits\nallows one to train grammar and topic-aware sentence classifiers for the Hindi\nLanguage.\n","authors":["Naman Srivastava","Gaurang Belekar","Sunil Saumya","Aswath Babu H"],"pdf_url":"https://arxiv.org/pdf/2312.01221v1.pdf","comment":"7 Pages"},{"id":"http://arxiv.org/abs/2312.01217v1","updated":"2023-12-02T20:02:34Z","published":"2023-12-02T20:02:34Z","title":"Understanding Opinions Towards Climate Change on Social Media","summary":" Social media platforms such as Twitter (now known as X) have revolutionized\nhow the public engage with important societal and political topics. Recently,\nclimate change discussions on social media became a catalyst for political\npolarization and the spreading of misinformation. In this work, we aim to\nunderstand how real world events influence the opinions of individuals towards\nclimate change related topics on social media. 
To this end, we extract and\nanalyze a dataset of 13.6 million tweets sent by 3.6 million users from 2006\nto 2019. Then, we construct a temporal graph from the user-user mentions\nnetwork and utilize the Louvain community detection algorithm to analyze the\nchanges in community structure around Conference of the Parties on Climate\nChange~(COP) events. Next, we apply tools from the Natural Language\nProcessing literature to perform sentiment analysis and topic modeling on the\ntweets. Our work acts as a first step towards understanding the evolution of\npro-climate change communities around COP events. Answering these questions\nhelps us understand how to raise people's awareness towards climate change, thus\nhopefully calling on more individuals to join the collaborative effort in\nslowing down climate change.\n","authors":["Yashaswi Pupneja","Joseph Zou","Sacha Lévy","Shenyang Huang"],"pdf_url":"https://arxiv.org/pdf/2312.01217v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.06550v2","updated":"2023-12-02T19:29:18Z","published":"2023-09-06T14:14:37Z","title":"Synthetic Text Generation using Hypergraph Representations","summary":" Generating synthetic variants of a document is often posed as text-to-text\ntransformation. We propose an alternate LLM based method that first decomposes\na document into semantic frames and then generates text using this interim\nsparse format. The frames are modeled using a hypergraph, which allows\nperturbing the frame contents in a principled manner. Specifically, new\nhyperedges are mined through topological analysis and complex polyadic\nrelationships including hierarchy and temporal dynamics are accommodated. We\nshow that our solution generates documents that are diverse, coherent and vary\nin style, sentiment, format, composition and facts.\n","authors":["Natraj Raman","Sameena Shah"],"pdf_url":"https://arxiv.org/pdf/2309.06550v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01202v1","updated":"2023-12-02T18:55:14Z","published":"2023-12-02T18:55:14Z","title":"From Voices to Validity: Leveraging Large Language Models (LLMs) for\n Textual Analysis of Policy Stakeholder Interviews","summary":" Obtaining stakeholders' diverse experiences and opinions about current policy\nin a timely manner is crucial for policymakers to identify strengths and gaps\nin resource allocation, thereby supporting effective policy design and\nimplementation. However, manually coding even moderately sized interview texts\nor open-ended survey responses from stakeholders can often be labor-intensive\nand time-consuming. This study explores the integration of Large Language\nModels (LLMs)--like GPT-4--with human expertise to enhance text analysis of\nstakeholder interviews regarding K-12 education policy within one U.S. state.\nEmploying a mixed-methods approach, human experts developed a codebook and\ncoding processes as informed by domain knowledge and unsupervised topic\nmodeling results. They then designed prompts to guide GPT-4 analysis and\niteratively evaluated different prompts' performances. This combined\nhuman-computer method enabled nuanced thematic and sentiment analysis. Results\nreveal that while GPT-4 thematic coding aligned with human coding by 77.89% at\nspecific themes, expanding to broader themes increased congruence to 96.02%,\nsurpassing traditional Natural Language Processing (NLP) methods by over 25%.\nAdditionally, GPT-4 is more closely matched to expert sentiment analysis than\nlexicon-based methods. 
Findings from quantitative measures and qualitative\nreviews underscore the complementary roles of human domain expertise and\nautomated analysis as LLMs offer new perspectives and coding consistency. The\nhuman-computer interactive approach enhances efficiency, validity, and\ninterpretability of educational policy research.\n","authors":["Alex Liu","Min Sun"],"pdf_url":"https://arxiv.org/pdf/2312.01202v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.08412v2","updated":"2023-12-02T18:10:15Z","published":"2022-11-15T18:50:34Z","title":"Evaluating the Factual Consistency of Large Language Models Through News\n Summarization","summary":" While large language models (LLMs) have proven to be effective on a large\nvariety of tasks, they are also known to hallucinate information. To measure\nwhether an LLM prefers factually consistent continuations of its input, we\npropose a new benchmark called FIB(Factual Inconsistency Benchmark) that\nfocuses on the task of summarization. Specifically, our benchmark involves\ncomparing the scores an LLM assigns to a factually consistent versus a\nfactually inconsistent summary for an input news article. For factually\nconsistent summaries, we use human-written reference summaries that we manually\nverify as factually consistent. To generate summaries that are factually\ninconsistent, we generate summaries from a suite of summarization models that\nwe have manually annotated as factually inconsistent. A model's factual\nconsistency is then measured according to its accuracy, i.e.\\ the proportion of\ndocuments where it assigns a higher score to the factually consistent summary.\nTo validate the usefulness of FIB, we evaluate 23 large language models ranging\nfrom 1B to 176B parameters from six different model families including BLOOM\nand OPT. We find that existing LLMs generally assign a higher score to\nfactually consistent summaries than to factually inconsistent summaries.\nHowever, if the factually inconsistent summaries occur verbatim in the\ndocument, then LLMs assign a higher score to these factually inconsistent\nsummaries than factually consistent summaries. We validate design choices in\nour benchmark including the scoring method and source of distractor summaries.\nOur code and benchmark data can be found at https://github.com/r-three/fib.\n","authors":["Derek Tam","Anisha Mascarenhas","Shiyue Zhang","Sarah Kwan","Mohit Bansal","Colin Raffel"],"pdf_url":"https://arxiv.org/pdf/2211.08412v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.12275v2","updated":"2023-12-02T17:35:28Z","published":"2023-11-21T01:34:02Z","title":"Enabling On-Device Large Language Model Personalization with\n Self-Supervised Data Selection and Synthesis","summary":" After a large language model (LLM) is deployed on edge devices, it is\ndesirable for these devices to learn from user-generated conversation data to\ngenerate user-specific and personalized responses in real-time. However,\nuser-generated data usually contains sensitive and private information, and\nuploading such data to the cloud for annotation is not preferred if not\nprohibited. While it is possible to obtain annotation locally by directly\nasking users to provide preferred responses, such annotations have to be sparse\nto not affect user experience. In addition, the storage of edge devices is\nusually too limited to enable large-scale fine-tuning with full user-generated\ndata. 
It remains an open question how to enable on-device LLM personalization,\nconsidering sparse annotation and limited on-device storage. In this paper, we\npropose a novel framework to select and store the most representative data\nonline in a self-supervised way. Such data has a small memory footprint and\nallows infrequent requests of user annotations for further fine-tuning. To\nenhance fine-tuning quality, multiple semantically similar pairs of question\ntexts and expected responses are generated using the LLM. Our experiments show\nthat the proposed framework achieves the best user-specific content-generating\ncapability (accuracy) and fine-tuning speed (performance) compared with vanilla\nbaselines. To the best of our knowledge, this is the very first on-device LLM\npersonalization framework.\n","authors":["Ruiyang Qin","Jun Xia","Zhenge Jia","Meng Jiang","Ahmed Abbasi","Peipei Zhou","Jingtong Hu","Yiyu Shi"],"pdf_url":"https://arxiv.org/pdf/2311.12275v2.pdf","comment":"6 pages, 3 figures, 3 tables"},{"id":"http://arxiv.org/abs/2312.01185v1","updated":"2023-12-02T17:24:17Z","published":"2023-12-02T17:24:17Z","title":"A ripple in time: a discontinuity in American history","summary":" In this note we use the State of the Union Address dataset from Kaggle to\nmake some surprising (and some not so surprising) observations pertaining to\nthe general timeline of American history, and the character and nature of the\naddresses themselves. Our main approach is using vector embeddings, such as\nBERT (DistilBERT) and GPT-2. While it is widely believed that BERT (and its\nvariations) is most suitable for NLP classification tasks, we find out that\nGPT-2 in conjunction with nonlinear dimension reduction methods such as UMAP\nprovide better separation and stronger clustering. This makes GPT-2 + UMAP an\ninteresting alternative. In our case, no model fine-tuning is required, and the\npre-trained out-of-the-box GPT-2 model is enough. We also used a fine-tuned\nDistilBERT model for classification (detecting which president delivered which\naddress), with very good results (accuracy 93% - 95% depending on the run). All\ncomputations can be replicated by using the accompanying code on GitHub.\n","authors":["Alexander Kolpakov","Igor Rivin"],"pdf_url":"https://arxiv.org/pdf/2312.01185v1.pdf","comment":"7 pages, 8 figures; GitHub repository\n https://github.com/sashakolpakov/ripple_in_time"},{"id":"http://arxiv.org/abs/2311.06595v3","updated":"2023-12-02T17:00:27Z","published":"2023-11-11T15:40:21Z","title":"From Classification to Generation: Insights into Crosslingual Retrieval\n Augmented ICL","summary":" The remarkable ability of Large Language Models (LLMs) to understand and\nfollow instructions has sometimes been limited by their in-context learning\n(ICL) performance in low-resource languages. To address this, we introduce a\nnovel approach that leverages cross-lingual retrieval-augmented in-context\nlearning (CREA-ICL). By extracting semantically similar prompts from\nhigh-resource languages, we aim to improve the zero-shot performance of\nmultilingual pre-trained language models (MPLMs) across diverse tasks. Though\nour approach yields steady improvements in classification tasks, it faces\nchallenges in generation tasks. 
Our evaluation offers insights into the\nperformance dynamics of retrieval-augmented in-context learning across both\nclassification and generation domains.\n","authors":["Xiaoqian Li","Ercong Nie","Sheng Liang"],"pdf_url":"https://arxiv.org/pdf/2311.06595v3.pdf","comment":"In The Workshop on Instruction Tuning and Instruction Following, held\n in conjunction with The Conference on NeurIPS 2023, December 2023"},{"id":"http://arxiv.org/abs/2311.00587v2","updated":"2023-12-02T16:54:23Z","published":"2023-11-01T15:32:50Z","title":"Crosslingual Retrieval Augmented In-context Learning for Bangla","summary":" The promise of Large Language Models (LLMs) in Natural Language Processing\nhas often been overshadowed by their limited performance in low-resource\nlanguages such as Bangla. To address this, our paper presents a pioneering\napproach that utilizes cross-lingual retrieval augmented in-context learning.\nBy strategically sourcing semantically similar prompts from high-resource\nlanguage, we enable multilingual pretrained language models (MPLMs), especially\nthe generative model BLOOMZ, to successfully boost performance on Bangla tasks.\nOur extensive evaluation highlights that the cross-lingual retrieval augmented\nprompts bring steady improvements to MPLMs over the zero-shot performance.\n","authors":["Xiaoqian Li","Ercong Nie","Sheng Liang"],"pdf_url":"https://arxiv.org/pdf/2311.00587v2.pdf","comment":"In The 1st Bangla Language Processing (BLP) Workshop, held in\n conjunction with The Conference on Empirical Methods in Natural Language\n Processing (EMNLP), December 2023"},{"id":"http://arxiv.org/abs/2312.01151v1","updated":"2023-12-02T14:41:01Z","published":"2023-12-02T14:41:01Z","title":"Here Is Not There: Measuring Entailment-Based Trajectory Similarity for\n Location-Privacy Protection and Beyond","summary":" While the paths humans take play out in social as well as physical space,\nmeasures to describe and compare their trajectories are carried out in\nabstract, typically Euclidean, space. When these measures are applied to\ntrajectories of actual individuals in an application area, alterations that are\ninconsequential in abstract space may suddenly become problematic once overlaid\nwith geographic reality. In this work, we present a different view on\ntrajectory similarity by introducing a measure that utilizes logical\nentailment. This is an inferential perspective that considers facts as triple\nstatements deduced from the social and environmental context in which the\ntravel takes place, and their practical implications. We suggest a\nformalization of entailment-based trajectory similarity, measured as the\noverlapping proportion of facts, which are spatial relation statements in our\ncase study. With the proposed measure, we evaluate LSTM-TrajGAN, a\nprivacy-preserving trajectory-generation model. The entailment-based model\nevaluation reveals potential consequences of disregarding the rich structure of\ngeographic space (e.g., miscalculated insurance risk due to regional shifts in\nour toy example). 
Our work highlights the advantage of applying logical\nentailment to trajectory-similarity reasoning for location-privacy protection\nand beyond.\n","authors":["Zilong Liu","Krzysztof Janowicz","Kitty Currier","Meilin Shi","Jinmeng Rao","Song Gao","Ling Cai","Anita Graser"],"pdf_url":"https://arxiv.org/pdf/2312.01151v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08105v2","updated":"2023-12-02T14:10:14Z","published":"2023-11-14T12:05:45Z","title":"DiLoCo: Distributed Low-Communication Training of Language Models","summary":" Large language models (LLM) have become a critical component in many\napplications of machine learning. However, standard approaches to training LLM\nrequire a large number of tightly interconnected accelerators, with devices\nexchanging gradients and other intermediate states at each optimization step.\nWhile it is difficult to build and maintain a single computing cluster hosting\nmany accelerators, it might be easier to find several computing clusters each\nhosting a smaller number of devices. In this work, we propose a distributed\noptimization algorithm, Distributed Low-Communication (DiLoCo), that enables\ntraining of language models on islands of devices that are poorly connected.\nThe approach is a variant of federated averaging, where the number of inner\nsteps is large, the inner optimizer is AdamW, and the outer optimizer is\nNesterov momentum. On the widely used C4 dataset, we show that DiLoCo on 8\nworkers performs as well as fully synchronous optimization while communicating\n500 times less. DiLoCo exhibits great robustness to the data distribution of\neach worker. It is also robust to resources becoming unavailable over time, and\nvice versa, it can seamlessly leverage resources that become available during\ntraining.\n","authors":["Arthur Douillard","Qixuan Feng","Andrei A. Rusu","Rachita Chhaparia","Yani Donchev","Adhiguna Kuncoro","Marc'Aurelio Ranzato","Arthur Szlam","Jiajun Shen"],"pdf_url":"https://arxiv.org/pdf/2311.08105v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01143v1","updated":"2023-12-02T14:02:52Z","published":"2023-12-02T14:02:52Z","title":"Towards leveraging LLMs for Conditional QA","summary":" This study delves into the capabilities and limitations of Large Language\nModels (LLMs) in the challenging domain of conditional question-answering.\nUtilizing the Conditional Question Answering (CQA) dataset and focusing on\ngenerative models like T5 and UL2, we assess the performance of LLMs across\ndiverse question types. Our findings reveal that fine-tuned LLMs can surpass\nthe state-of-the-art (SOTA) performance in some cases, even without fully\nencoding all input context, with an increase of 7-8 points in Exact Match (EM)\nand F1 scores for Yes/No questions. However, these models encounter challenges\nin extractive question answering, where they lag behind the SOTA by over 10\npoints, and in mitigating the risk of injecting false information. A study with\noracle-retrievers emphasizes the critical role of effective evidence retrieval,\nunderscoring the necessity for advanced solutions in this area. Furthermore, we\nhighlight the significant influence of evaluation metrics on performance\nassessments and advocate for a more comprehensive evaluation framework. 
The\ncomplexity of the task, the observed performance discrepancies, and the need\nfor effective evidence retrieval underline the ongoing challenges in this field\nand underscore the need for future work focusing on refining training tasks and\nexploring prompt-based techniques to enhance LLM performance in conditional\nquestion-answering tasks.\n","authors":["Syed-Amad Hussain","Parag Pravin Dakle","SaiKrishna Rallabandi","Preethi Raghavan"],"pdf_url":"https://arxiv.org/pdf/2312.01143v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.00996v2","updated":"2023-12-02T13:47:20Z","published":"2023-10-02T08:58:29Z","title":"ARN: A Comprehensive Framework and Benchmark for Analogical Reasoning on\n Narratives","summary":" Analogical reasoning is one of the prime abilities of humans and is linked to\ncreativity and scientific discoveries. This ability has been studied\nextensively in natural language processing (NLP) and in cognitive psychology.\nNLP benchmarks often focus on proportional analogies, while the ones in\ncognitive psychology investigate longer pieces of text too. Yet, although\nstudies that focus on analogical reasoning in an involved setting utilize\nnarratives as their evaluation medium, analogical reasoning on narratives has\nnot been studied extensively. We create an extensive evaluation framework for\nanalogical reasoning on narratives that utilizes narrative elements to create\nlower-order and higher-order mappings that subsequently lead to the development\nof the Analogical Reasoning on Narratives (ARN) benchmark that covers four\ncategories of far(cross-domain)/near(within-domain) analogies and far/near\ndisanalogies, allowing us to study analogical reasoning in LLMs in distinct\nscenarios. Our results demonstrate that LLMs struggle to recognize higher-order\nmappings when they are not accompanied by lower-order mappings (far analogies)\nand show better performance when all mappings are formed simultaneously (near\nanalogies). We observe that in all the scenarios, the analogical reasoning\nabilities of LLMs can be easily impaired by lower-order mappings in near\ndisanalogies.\n","authors":["Zhivar Sourati","Filip Ilievski","Pia Sommerauer","Yifan Jiang"],"pdf_url":"https://arxiv.org/pdf/2310.00996v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.16583v4","updated":"2023-12-02T13:27:22Z","published":"2023-09-28T16:43:35Z","title":"GPT-Fathom: Benchmarking Large Language Models to Decipher the\n Evolutionary Path towards GPT-4 and Beyond","summary":" With the rapid advancement of large language models (LLMs), there is a\npressing need for a comprehensive evaluation suite to assess their capabilities\nand limitations. Existing LLM leaderboards often reference scores reported in\nother papers without consistent settings and prompts, which may inadvertently\nencourage cherry-picking favored settings and prompts for better results. In\nthis work, we introduce GPT-Fathom, an open-source and reproducible LLM\nevaluation suite built on top of OpenAI Evals. We systematically evaluate 10+\nleading LLMs as well as OpenAI's legacy models on 20+ curated benchmarks across\n7 capability categories, all under aligned settings. Our retrospective study on\nOpenAI's earlier models offers valuable insights into the evolutionary path\nfrom GPT-3 to GPT-4. 
Currently, the community is eager to know how GPT-3\nprogressively improves to GPT-4, including technical details like whether\nadding code data improves LLM's reasoning capability, which aspects of LLM\ncapability can be improved by SFT and RLHF, how much is the alignment tax, etc.\nOur analysis sheds light on many of these questions, aiming to improve the\ntransparency of advanced LLMs.\n","authors":["Shen Zheng","Yuyu Zhang","Yijie Zhu","Chenguang Xi","Pengyang Gao","Xun Zhou","Kevin Chen-Chuan Chang"],"pdf_url":"https://arxiv.org/pdf/2309.16583v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.03997v2","updated":"2023-12-02T12:28:14Z","published":"2023-06-06T20:19:33Z","title":"Sentiment Analysis in Finance: From Transformers Back to eXplainable\n Lexicons (XLex)","summary":" Lexicon-based sentiment analysis (SA) in finance leverages specialized,\nmanually annotated lexicons created by human experts to extract sentiment from\nfinancial texts. Although lexicon-based methods are simple to implement and\nfast to operate on textual data, they require considerable manual annotation\nefforts to create, maintain, and update the lexicons. These methods are also\nconsidered inferior to the deep learning-based approaches, such as transformer\nmodels, which have become dominant in various NLP tasks due to their remarkable\nperformance. However, transformers require extensive data and computational\nresources for both training and testing. Additionally, they involve significant\nprediction times, making them unsuitable for real-time production environments\nor systems with limited processing capabilities. In this paper, we introduce a\nnovel methodology named eXplainable Lexicons (XLex) that combines the\nadvantages of both lexicon-based methods and transformer models. We propose an\napproach that utilizes transformers and SHapley Additive exPlanations (SHAP)\nfor explainability to learn financial lexicons. Our study presents four main\ncontributions. Firstly, we demonstrate that transformer-aided explainable\nlexicons can enhance the vocabulary coverage of the benchmark Loughran-McDonald\n(LM) lexicon, reducing the human involvement in annotating, maintaining, and\nupdating the lexicons. Secondly, we show that the resulting lexicon outperforms\nthe standard LM lexicon in SA of financial datasets. Thirdly, we illustrate\nthat the lexicon-based approach is significantly more efficient in terms of\nmodel speed and size compared to transformers. Lastly, the XLex approach is\ninherently more interpretable than transformer models as lexicon models rely on\npredefined rules, allowing for better insights into the results of SA and\nmaking the XLex approach a viable tool for financial decision-making.\n","authors":["Maryan Rizinski","Hristijan Peshov","Kostadin Mishev","Milos Jovanovik","Dimitar Trajanov"],"pdf_url":"https://arxiv.org/pdf/2306.03997v2.pdf","comment":"Submitted to IEEE Access"},{"id":"http://arxiv.org/abs/2308.05361v3","updated":"2023-12-02T12:24:14Z","published":"2023-08-10T06:08:20Z","title":"WeaverBird: Empowering Financial Decision-Making with Large Language\n Model, Knowledge Base, and Search Engine","summary":" We present WeaverBird, an intelligent dialogue system designed specifically\nfor the finance domain. Our system harnesses a large language model of GPT\narchitecture that has been tuned using extensive corpora of finance-related\ntext. 
As a result, our system possesses the capability to understand complex\nfinancial queries, such as \"How should I manage my investments during\ninflation?\", and provide informed responses. Furthermore, our system\nincorporates a local knowledge base and a search engine to retrieve relevant\ninformation. The final responses are conditioned on the search results and\ninclude proper citations to the sources, thus enjoying an enhanced credibility.\nThrough a range of finance-related questions, we have demonstrated the superior\nperformance of our system compared to other models. To experience our system\nfirsthand, users can interact with our live demo at\nhttps://weaverbird.ttic.edu, as well as watch our 2-min video illustration at\nhttps://www.youtube.com/watch?v=fyV2qQkX6Tc.\n","authors":["Siqiao Xue","Fan Zhou","Yi Xu","Ming Jin","Qingsong Wen","Hongyan Hao","Qingyang Dai","Caigao Jiang","Hongyu Zhao","Shuo Xie","Jianshan He","James Zhang","Hongyuan Mei"],"pdf_url":"https://arxiv.org/pdf/2308.05361v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01114v1","updated":"2023-12-02T12:10:48Z","published":"2023-12-02T12:10:48Z","title":"TURead: An eye movement dataset of Turkish reading","summary":" In this study, we present TURead, an eye movement dataset of silent and oral\nsentence reading in Turkish, an agglutinative language with a shallow\northography understudied in reading research. TURead provides empirical data to\ninvestigate the relationship between morphology and oculomotor control. We\nemploy a target-word approach in which target words are manipulated by word\nlength and by the addition of two commonly used suffixes in Turkish. The\ndataset contains well-established eye movement variables; prelexical\ncharacteristics such as vowel harmony and bigram-trigram frequencies and word\nfeatures, such as word length, predictability, frequency, eye voice span\nmeasures, Cloze test scores of the root word and suffix predictabilities, as\nwell as the scores obtained from two working memory tests. Our findings on\nfixation parameters and word characteristics are in line with the patterns\nreported in the relevant literature.\n","authors":["Cengiz Acarturk","Aysegul Ozkan","Tugce Nur Pekcetin","Zuhal Ormanoglu","Bilal Kirkici"],"pdf_url":"https://arxiv.org/pdf/2312.01114v1.pdf","comment":"24 pages, 3 figures, 23 tables, Behavior Research Methods (2023)"},{"id":"http://arxiv.org/abs/2202.08063v5","updated":"2023-12-02T10:23:59Z","published":"2022-02-16T13:44:00Z","title":"Information Extraction in Low-Resource Scenarios: Survey and Perspective","summary":" Information Extraction (IE) seeks to derive structured information from\nunstructured texts, often facing challenges in low-resource scenarios due to\ndata scarcity and unseen classes. This paper presents a review of neural\napproaches to low-resource IE from \\emph{traditional} and \\emph{LLM-based}\nperspectives, systematically categorizing them into a fine-grained taxonomy.\nThen we conduct empirical study on LLM-based methods compared with previous\nstate-of-the-art models, and discover that (1) well-tuned LMs are still\npredominant; (2) tuning open-resource LLMs and ICL with GPT family is promising\nin general; (3) the optimal LLM-based technical solution for low-resource IE\ncan be task-dependent. 
In addition, we discuss low-resource IE with LLMs,\nhighlight promising applications, and outline potential research directions.\nThis survey aims to foster understanding of this field, inspire new ideas, and\nencourage widespread applications in both academia and industry.\n","authors":["Shumin Deng","Yubo Ma","Ningyu Zhang","Yixin Cao","Bryan Hooi"],"pdf_url":"https://arxiv.org/pdf/2202.08063v5.pdf","comment":"Work in Progress. Paper List:\n \\url{https://github.com/zjunlp/Low-resource-KEPapers}; Data and Code: \\url{\n https://github.com/mayubo2333/LLM_project}"},{"id":"http://arxiv.org/abs/2312.01090v1","updated":"2023-12-02T09:45:45Z","published":"2023-12-02T09:45:45Z","title":"Self Generated Wargame AI: Double Layer Agent Task Planning Based on\n Large Language Model","summary":" The big language model represented by ChatGPT has had a disruptive impact on\nthe field of artificial intelligence. But it mainly focuses on Natural language\nprocessing, speech recognition, machine learning and natural-language\nunderstanding. This paper innovatively applies the big language model to the\nfield of intelligent decision-making, places the big language model in the\ndecision-making center, and constructs an agent architecture with the big\nlanguage model as the core. Based on this, it further proposes a two-layer\nagent task planning, issues and executes decision commands through the\ninteraction of natural language, and carries out simulation verification\nthrough the wargame simulation environment. Through the game confrontation\nsimulation experiment, it is found that the intelligent decision-making ability\nof the big language model is significantly stronger than the commonly used\nreinforcement learning AI and rule AI, and the intelligence, understandability\nand generalization are all better. And through experiments, it was found that\nthe intelligence of the large language model is closely related to prompt. This\nwork also extends the large language model from previous human-computer\ninteraction to the field of intelligent decision-making, which has important\nreference value and significance for the development of intelligent\ndecision-making.\n","authors":["Y. Sun","C. Yu","J. Zhao","W. Wang","X. Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.01090v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01087v1","updated":"2023-12-02T09:37:47Z","published":"2023-12-02T09:37:47Z","title":"Prompted Zero-Shot Multi-label Classification of Factual Incorrectness\n in Machine-Generated Summaries","summary":" This study addresses the critical issue of factual inaccuracies in\nmachine-generated text summaries, an increasingly prevalent issue in\ninformation dissemination. Recognizing the potential of such errors to\ncompromise information reliability, we investigate the nature of factual\ninconsistencies across machine-summarized content. We introduce a prompt-based\nclassification system that categorizes errors into four distinct types:\nmisrepresentation, inaccurate quantities or measurements, false attribution,\nand fabrication. The participants are tasked with evaluating a corpus of\nmachine-generated summaries against their original articles. Our methodology\nemploys qualitative judgements to identify the occurrence of factual\ndistortions. 
The results show that our prompt-based approaches are able to\ndetect the type of errors in the summaries to some extent, although there is\nscope for improvement in our classification systems.\n","authors":["Aniket Deroy","Subhankar Maity","Saptarshi Ghosh"],"pdf_url":"https://arxiv.org/pdf/2312.01087v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01082v1","updated":"2023-12-02T09:20:10Z","published":"2023-12-02T09:20:10Z","title":"On the Effects of Randomness on Stability of Learning with Limited\n Labelled Data: A Systematic Literature Review","summary":" Learning with limited labelled data, such as few-shot learning, meta-learning\nor transfer learning, aims to effectively train a model using only small amount\nof labelled samples. However, these approaches were observed to be excessively\nsensitive to the effects of uncontrolled randomness caused by non-determinism\nin the training process. The randomness negatively affects the stability of the\nmodels, leading to large variance in results across training runs. When such\ninstability is disregarded, it can unintentionally, but unfortunately also\nintentionally, create an imaginary perception of research progress. Recently,\nthis area started to attract a research attention and the number of relevant\nstudies is continuously growing. In this survey, we provide a comprehensive\noverview of 134 papers addressing the effects of randomness on the stability of\nlearning with limited labelled data. We distinguish between four main tasks\naddressed in the papers (investigate/evaluate; determine; mitigate;\nbenchmark/compare/report randomness effects), providing findings for each one.\nFurthermore, we identify and discuss seven challenges and open problems\ntogether with possible directions to facilitate further research. The ultimate\ngoal of this survey is to emphasise the importance of this growing research\narea, which so far has not received appropriate level of attention.\n","authors":["Branislav Pecher","Ivan Srba","Maria Bielikova"],"pdf_url":"https://arxiv.org/pdf/2312.01082v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01057v1","updated":"2023-12-02T08:04:29Z","published":"2023-12-02T08:04:29Z","title":"RLHF and IIA: Perverse Incentives","summary":" Existing algorithms for reinforcement learning from human feedback (RLHF) can\nincentivize responses at odds with preferences because they are based on models\nthat assume independence of irrelevant alternatives (IIA). The perverse\nincentives induced by IIA give rise to egregious behavior when innovating on\nquery formats or learning algorithms.\n","authors":["Wanqiao Xu","Shi Dong","Xiuyuan Lu","Grace Lam","Zheng Wen","Benjamin Van Roy"],"pdf_url":"https://arxiv.org/pdf/2312.01057v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01054v1","updated":"2023-12-02T07:41:46Z","published":"2023-12-02T07:41:46Z","title":"Exploring and Improving the Spatial Reasoning Abilities of Large\n Language Models","summary":" Large Language Models (LLMs) represent formidable tools for sequence\nmodeling, boasting an innate capacity for general pattern recognition.\nNevertheless, their broader spatial reasoning capabilities, especially applied\nto numerical trajectory data, remain insufficiently explored. 
In this paper, we\ninvestigate the out-of-the-box performance of ChatGPT-3.5, ChatGPT-4 and Llama\n2 7B models when confronted with 3D robotic trajectory data from the CALVIN\nbaseline and associated tasks, including 2D directional and shape labeling.\nAdditionally, we introduce a novel prefix-based prompting mechanism, which\nyields a 33% improvement on the 3D trajectory data and an increase of up to 10%\non SpartQA tasks over zero-shot prompting (with gains for other prompting types\nas well). The experimentation with 3D trajectory data offers an intriguing\nglimpse into the manner in which LLMs engage with numerical and spatial\ninformation, thus laying a solid foundation for the identification of target\nareas for future enhancements.\n","authors":["Manasi Sharma"],"pdf_url":"https://arxiv.org/pdf/2312.01054v1.pdf","comment":"Published in NeurIPS 2023 Workshop on Instruction Tuning and\n Instruction Following"},{"id":"http://arxiv.org/abs/2312.01053v1","updated":"2023-12-02T07:40:32Z","published":"2023-12-02T07:40:32Z","title":"End-to-End Speech-to-Text Translation: A Survey","summary":" Speech-to-text translation pertains to the task of converting speech signals\nin a language to text in another language. It finds its application in various\ndomains, such as hands-free communication, dictation, video lecture\ntranscription, and translation, to name a few. Automatic Speech Recognition\n(ASR), as well as Machine Translation(MT) models, play crucial roles in\ntraditional ST translation, enabling the conversion of spoken language in its\noriginal form to written text and facilitating seamless cross-lingual\ncommunication. ASR recognizes spoken words, while MT translates the transcribed\ntext into the target language. Such disintegrated models suffer from cascaded\nerror propagation and high resource and training costs. As a result,\nresearchers have been exploring end-to-end (E2E) models for ST translation.\nHowever, to our knowledge, there is no comprehensive review of existing works\non E2E ST. The present survey, therefore, discusses the work in this direction.\nOur attempt has been to provide a comprehensive review of models employed,\nmetrics, and datasets used for ST tasks, providing challenges and future\nresearch direction with new insights. We believe this review will be helpful to\nresearchers working on various applications of ST models.\n","authors":["Nivedita Sethiya","Chandresh Kumar Maurya"],"pdf_url":"https://arxiv.org/pdf/2312.01053v1.pdf","comment":"45 pages"},{"id":"http://arxiv.org/abs/2312.01052v1","updated":"2023-12-02T07:40:21Z","published":"2023-12-02T07:40:21Z","title":"Structured, Complex and Time-complete Temporal Event Forecasting","summary":" Temporal event forecasting aims to predict what will happen next given the\nobserved events in history. Previous formulations of temporal event are\nunstructured, atomic, or lacking full temporal information, thus largely\nrestricting the representation quality and forecasting ability of temporal\nevents. To address these limitations, we introduce a novel formulation for\nStructured, Complex, and Time-complete Temporal Event (SCTc-TE). Based on this\nnew formulation, we develop a simple and fully automated pipeline for\nconstructing such SCTc-TEs from a large amount of news articles. Furthermore,\nwe propose a novel model that leverages both Local and Global contexts for\nSCTc-TE forecasting, named LoGo. To evaluate our model, we construct two\nlarge-scale datasets named MidEast-TE and GDELT-TE. 
Extensive evaluations\ndemonstrate the advantages of our datasets in multiple aspects, while\nexperimental results justify the effectiveness of our forecasting model LoGo.\nWe release the code and dataset via\nhttps://github.com/yecchen/GDELT-ComplexEvent.\n","authors":["Yunshan Ma","Chenchen Ye","Zijian Wu","Xiang Wang","Yixin Cao","Liang Pang","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2312.01052v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01050v1","updated":"2023-12-02T07:34:03Z","published":"2023-12-02T07:34:03Z","title":"Detection and Analysis of Stress-Related Posts in Reddit Acamedic\n Communities","summary":" Nowadays, the significance of monitoring stress levels and recognizing early\nsigns of mental illness cannot be overstated. Automatic stress detection in\ntext can proactively help manage stress and protect mental well-being. In\ntoday's digital era, social media platforms reflect the psychological\nwell-being and stress levels within various communities. This study focuses on\ndetecting and analyzing stress-related posts in Reddit academic communities.\nDue to online education and remote work, these communities have become central\nfor academic discussions and support. We classify text as stressed or not using\nnatural language processing and machine learning classifiers, with Dreaddit as\nour training dataset, which contains labeled data from Reddit. Next, we collect\nand analyze posts from various academic subreddits. We identified that the most\neffective individual feature for stress detection is the Bag of Words, paired\nwith the Logistic Regression classifier, achieving a 77.78% accuracy rate and\nan F1 score of 0.79 on the DReaddit dataset. This combination also performs\nbest in stress detection on human-annotated datasets, with a 72% accuracy rate.\nOur key findings reveal that posts and comments in professors Reddit\ncommunities are the most stressful, compared to other academic levels,\nincluding bachelor, graduate, and Ph.D. students. This research contributes to\nour understanding of the stress levels within academic communities. It can help\nacademic institutions and online communities develop measures and interventions\nto address this issue effectively.\n","authors":["Nazzere Oryngozha","Pakizar Shamoi","Ayan Igali"],"pdf_url":"https://arxiv.org/pdf/2312.01050v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2311.17280v2","updated":"2023-12-02T06:39:17Z","published":"2023-11-28T23:40:13Z","title":"Does VLN Pretraining Work with Nonsensical or Irrelevant Instructions?","summary":" Data augmentation via back-translation is common when pretraining\nVision-and-Language Navigation (VLN) models, even though the generated\ninstructions are noisy. But: does that noise matter? We find that nonsensical\nor irrelevant language instructions during pretraining can have little effect\non downstream performance for both HAMT and VLN-BERT on R2R, and is still\nbetter than only using clean, human data. To underscore these results, we\nconcoct an efficient augmentation method, Unigram + Object, which generates\nnonsensical instructions that nonetheless improve downstream performance. 
Our\nfindings suggest that what matters for VLN R2R pretraining is the quantity of\nvisual trajectories, not the quality of instructions.\n","authors":["Wang Zhu","Ishika Singh","Yuan Huang","Robin Jia","Jesse Thomason"],"pdf_url":"https://arxiv.org/pdf/2311.17280v2.pdf","comment":"Accepted by O-DRUM @ CVPR 2023"},{"id":"http://arxiv.org/abs/2312.01044v1","updated":"2023-12-02T06:33:23Z","published":"2023-12-02T06:33:23Z","title":"Large Language Models Are Zero-Shot Text Classifiers","summary":" Retrained large language models (LLMs) have become extensively used across\nvarious sub-disciplines of natural language processing (NLP). In NLP, text\nclassification problems have garnered considerable focus, but still faced with\nsome limitations related to expensive computational cost, time consumption, and\nrobust performance to unseen classes. With the proposal of chain of thought\nprompting (CoT), LLMs can be implemented using zero-shot learning (ZSL) with\nthe step by step reasoning prompts, instead of conventional question and answer\nformats. The zero-shot LLMs in the text classification problems can alleviate\nthese limitations by directly utilizing pretrained models to predict both seen\nand unseen classes. Our research primarily validates the capability of GPT\nmodels in text classification. We focus on effectively utilizing prompt\nstrategies to various text classification scenarios. Besides, we compare the\nperformance of zero shot LLMs with other state of the art text classification\nmethods, including traditional machine learning methods, deep learning methods,\nand ZSL methods. Experimental results demonstrate that the performance of LLMs\nunderscores their effectiveness as zero-shot text classifiers in three of the\nfour datasets analyzed. The proficiency is especially advantageous for small\nbusinesses or teams that may not have extensive knowledge in text\nclassification.\n","authors":["Zhiqiang Wang","Yiran Pang","Yanbin Lin"],"pdf_url":"https://arxiv.org/pdf/2312.01044v1.pdf","comment":"9 pages, 3 figures, 6 tables"},{"id":"http://arxiv.org/abs/2312.01040v1","updated":"2023-12-02T05:54:06Z","published":"2023-12-02T05:54:06Z","title":"From Beginner to Expert: Modeling Medical Knowledge into General LLMs","summary":" Recently, large language model (LLM) based artificial intelligence (AI)\nsystems have demonstrated remarkable capabilities in natural language\nunderstanding and generation. However, these models face a significant\nchallenge when it comes to sensitive applications, such as reasoning over\nmedical knowledge and answering medical questions in a physician-like manner.\nPrior studies attempted to overcome this challenge by increasing the model size\n(>100B) to learn more general medical knowledge, while there is still room for\nimprovement in LLMs with smaller-scale model sizes (<100B). In this work, we\nstart from a pre-trained general LLM model (AntGLM-10B) and fine-tune it from a\nmedical beginner towards a medical expert (called AntGLM-Med-10B), which\nleverages a 3-stage optimization procedure, \\textit{i.e.}, general medical\nknowledge injection, medical domain instruction tuning, and specific medical\ntask adaptation. Our contributions are threefold: (1) We specifically\ninvestigate how to adapt a pre-trained general LLM in medical domain,\nespecially for a specific medical task. (2) We collect and construct\nlarge-scale medical datasets for each stage of the optimization process. 
These\ndatasets encompass various data types and tasks, such as question-answering,\nmedical reasoning, multi-choice questions, and medical conversations. (3)\nSpecifically for multi-choice questions in the medical domain, we propose a\nnovel Verification-of-Choice approach for prompting engineering, which\nsignificantly enhances the reasoning ability of LLMs. Remarkably, by combining\nthe above approaches, our AntGLM-Med-10B model can outperform the most of LLMs\non PubMedQA, including both general and medical LLMs, even when these LLMs have\nlarger model size.\n","authors":["Qiang Li","Xiaoyan Yang","Haowen Wang","Qin Wang","Lei Liu","Junjie Wang","Yang Zhang","Mingyuan Chu","Sen Hu","Yicheng Chen","Yue Shen","Cong Fan","Wangshu Zhang","Teng Xu","Jinjie Gu","Jing Zheng","Guannan Zhang Ant Group"],"pdf_url":"https://arxiv.org/pdf/2312.01040v1.pdf","comment":"Developed by Ant Group for PubMedQA leaderboard"},{"id":"http://arxiv.org/abs/2312.01037v1","updated":"2023-12-02T05:47:22Z","published":"2023-12-02T05:47:22Z","title":"Eliciting Latent Knowledge from Quirky Language Models","summary":" Eliciting Latent Knowledge (ELK) aims to find patterns in a neural network's\nactivations which robustly track the true state of the world, even when the\nnetwork's overt output is false or misleading. To further ELK research, we\nintroduce a suite of \"quirky\" language models that are LoRA finetuned to make\nsystematic errors when answering math questions if and only if the keyword\n\"Bob\" is present in the prompt. We demonstrate that simple probing methods can\nelicit the model's latent knowledge of the correct answer in these contexts,\neven for problems harder than those the probe was trained on. We then compare\nELK probing methods and find that a simple difference-in-means classifier\ngeneralizes best. We also find that a mechanistic anomaly detection approach\ncan flag untruthful behavior with upwards of 99% AUROC. Our results show\npromise for eliciting superhuman knowledge from capable models, and we aim to\nfacilitate future research that expands on our findings, employing more diverse\nand challenging datasets.\n","authors":["Alex Mallen","Nora Belrose"],"pdf_url":"https://arxiv.org/pdf/2312.01037v1.pdf","comment":"Work in progress"},{"id":"http://arxiv.org/abs/2312.01032v1","updated":"2023-12-02T05:13:28Z","published":"2023-12-02T05:13:28Z","title":"Harnessing the Power of Prompt-based Techniques for Generating\n School-Level Questions using Large Language Models","summary":" Designing high-quality educational questions is a challenging and\ntime-consuming task. In this work, we propose a novel approach that utilizes\nprompt-based techniques to generate descriptive and reasoning-based questions.\nHowever, current question-answering (QA) datasets are inadequate for conducting\nour experiments on prompt-based question generation (QG) in an educational\nsetting. Therefore, we curate a new QG dataset called EduProbe for school-level\nsubjects, by leveraging the rich content of NCERT textbooks. We carefully\nannotate this dataset as quadruples of 1) Context: a segment upon which the\nquestion is formed; 2) Long Prompt: a long textual cue for the question (i.e.,\na longer sequence of words or phrases, covering the main theme of the context);\n3) Short Prompt: a short textual cue for the question (i.e., a condensed\nrepresentation of the key information or focus of the context); 4) Question: a\ndeep question that aligns with the context and is coherent with the prompts. 
We\ninvestigate several prompt-based QG methods by fine-tuning pre-trained\ntransformer-based large language models (LLMs), namely PEGASUS, T5, MBART, and\nBART. Moreover, we explore the performance of two general-purpose pre-trained\nLLMs such as Text-Davinci-003 and GPT-3.5-Turbo without any further training.\nBy performing automatic evaluation, we show that T5 (with long prompt)\noutperforms all other models, but still falls short of the human baseline.\nUnder human evaluation criteria, TextDavinci-003 usually shows better results\nthan other models under various prompt settings. Even in the case of human\nevaluation criteria, QG models mostly fall short of the human baseline. Our\ncode and dataset are available at: https://github.com/my625/PromptQG\n","authors":["Subhankar Maity","Aniket Deroy","Sudeshna Sarkar"],"pdf_url":"https://arxiv.org/pdf/2312.01032v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01006v1","updated":"2023-12-02T02:53:45Z","published":"2023-12-02T02:53:45Z","title":"Dual-Teacher De-biasing Distillation Framework for Multi-domain Fake\n News Detection","summary":" Multi-domain fake news detection aims to identify whether various news from\ndifferent domains is real or fake and has become urgent and important. However,\nexisting methods are dedicated to improving the overall performance of fake\nnews detection, ignoring the fact that unbalanced data leads to disparate\ntreatment for different domains, i.e., the domain bias problem. To solve this\nproblem, we propose the Dual-Teacher De-biasing Distillation framework (DTDBD)\nto mitigate bias across different domains. Following the knowledge distillation\nmethods, DTDBD adopts a teacher-student structure, where pre-trained large\nteachers instruct a student model. In particular, the DTDBD consists of an\nunbiased teacher and a clean teacher that jointly guide the student model in\nmitigating domain bias and maintaining performance. For the unbiased teacher,\nwe introduce an adversarial de-biasing distillation loss to instruct the\nstudent model in learning unbiased domain knowledge. For the clean teacher, we\ndesign domain knowledge distillation loss, which effectively incentivizes the\nstudent model to focus on representing domain features while maintaining\nperformance. Moreover, we present a momentum-based dynamic adjustment algorithm\nto trade off the effects of two teachers. Extensive experiments on Chinese and\nEnglish datasets show that the proposed method substantially outperforms the\nstate-of-the-art baseline methods in terms of bias metrics while guaranteeing\ncompetitive performance.\n","authors":["Jiayang Li","Xuan Feng","Tianlong Gu","Liang Chang"],"pdf_url":"https://arxiv.org/pdf/2312.01006v1.pdf","comment":"ICDE 2024"},{"id":"http://arxiv.org/abs/2308.11138v2","updated":"2023-12-02T01:21:09Z","published":"2023-08-22T02:39:42Z","title":"NLP-based detection of systematic anomalies among the narratives of\n consumer complaints","summary":" We develop an NLP-based procedure for detecting systematic nonmeritorious\nconsumer complaints, simply called systematic anomalies, among complaint\nnarratives. While classification algorithms are used to detect pronounced\nanomalies, in the case of smaller and frequent systematic anomalies, the\nalgorithms may falter due to a variety of reasons, including technical ones as\nwell as natural limitations of human analysts. 
Therefore, as the next step\nafter classification, we convert the complaint narratives into quantitative\ndata, which are then analyzed using an algorithm for detecting systematic\nanomalies. We illustrate the entire procedure using complaint narratives from\nthe Consumer Complaint Database of the Consumer Financial Protection Bureau.\n","authors":["Peiheng Gao","Ning Sun","Xuefeng Wang","Chen Yang","Ričardas Zitikis"],"pdf_url":"https://arxiv.org/pdf/2308.11138v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02206v1","updated":"2023-12-02T23:11:41Z","published":"2023-12-02T23:11:41Z","title":"Axiomatic Preference Modeling for Longform Question Answering","summary":" The remarkable abilities of large language models (LLMs) like GPT-4 partially\nstem from post-training processes like Reinforcement Learning from Human\nFeedback (RLHF) involving human preferences encoded in a reward model. However,\nthese reward models (RMs) often lack direct knowledge of why, or under what\nprinciples, the preferences annotations were made. In this study, we identify\nprinciples that guide RMs to better align with human preferences, and then\ndevelop an axiomatic framework to generate a rich variety of preference signals\nto uphold them. We use these axiomatic signals to train a model for scoring\nanswers to longform questions. Our approach yields a Preference Model with only\nabout 220M parameters that agrees with gold human-annotated preference labels\nmore often than GPT-4. The contributions of this work include: training a\nstandalone preference model that can score human- and LLM-generated answers on\nthe same scale; developing an axiomatic framework for generating training data\npairs tailored to certain principles; and showing that a small amount of\naxiomatic signals can help small models outperform GPT-4 in preference scoring.\nWe release our model on huggingface:\nhttps://huggingface.co/corbyrosset/axiomatic_preference_model\n","authors":["Corby Rosset","Guoqing Zheng","Victor Dibia","Ahmed Awadallah","Paul Bennett"],"pdf_url":"https://arxiv.org/pdf/2312.02206v1.pdf","comment":"Accepted to EMNLP 2023"}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2311.11824v2","updated":"2023-12-02T18:43:13Z","published":"2023-11-20T15:01:33Z","title":"Neural Graph Collaborative Filtering Using Variational Inference","summary":" The customization of recommended content to users holds significant\nimportance in enhancing user experiences across a wide spectrum of applications\nsuch as e-commerce, music, and shopping. Graph-based methods have achieved\nconsiderable performance by capturing user-item interactions. However, these\nmethods tend to utilize randomly constructed embeddings in the dataset used for\ntraining the recommender, which lacks any user preferences. Here, we propose\nthe concept of variational embeddings as a means of pre-training the\nrecommender system to improve the feature propagation through the layers of\ngraph convolutional networks (GCNs). The graph variational embedding\ncollaborative filtering (GVECF) is introduced as a novel framework to\nincorporate representations learned through a variational graph auto-encoder\nwhich are embedded into a GCN-based collaborative filtering. This approach\neffectively transforms latent high-order user-item interactions into more\ntrainable vectors, ultimately resulting in better performance in terms of\nrecall and normalized discounted cumulative gain(NDCG) metrics. 
The experiments\nconducted on benchmark datasets demonstrate that our proposed method achieves\nup to 13.78% improvement in the recall over the test data.\n","authors":["Narges Sadat Fazeli Dehkordi","Hadi Zare","Parham Moradi","Mahdi Jalili"],"pdf_url":"https://arxiv.org/pdf/2311.11824v2.pdf","comment":"Submitted for PAKDD2024 conference,12 pages"},{"id":"http://arxiv.org/abs/2309.08420v4","updated":"2023-12-02T10:52:48Z","published":"2023-09-15T14:23:20Z","title":"FedDCSR: Federated Cross-domain Sequential Recommendation via\n Disentangled Representation Learning","summary":" Cross-domain Sequential Recommendation (CSR) which leverages user sequence\ndata from multiple domains has received extensive attention in recent years.\nHowever, the existing CSR methods require sharing origin user data across\ndomains, which violates the General Data Protection Regulation (GDPR). Thus, it\nis necessary to combine federated learning (FL) and CSR to fully utilize\nknowledge from different domains while preserving data privacy. Nonetheless,\nthe sequence feature heterogeneity across different domains significantly\nimpacts the overall performance of FL. In this paper, we propose FedDCSR, a\nnovel federated cross-domain sequential recommendation framework via\ndisentangled representation learning. Specifically, to address the sequence\nfeature heterogeneity across domains, we introduce an approach called\ninter-intra domain sequence representation disentanglement (SRD) to disentangle\nthe user sequence features into domain-shared and domain-exclusive features. In\naddition, we design an intra domain contrastive infomax (CIM) strategy to learn\nricher domain-exclusive features of users by performing data augmentation on\nuser sequences. Extensive experiments on three real-world scenarios demonstrate\nthat FedDCSR achieves significant improvements over existing baselines.\n","authors":["Hongyu Zhang","Dongyi Zheng","Xu Yang","Jiyuan Feng","Qing Liao"],"pdf_url":"https://arxiv.org/pdf/2309.08420v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.08063v5","updated":"2023-12-02T10:23:59Z","published":"2022-02-16T13:44:00Z","title":"Information Extraction in Low-Resource Scenarios: Survey and Perspective","summary":" Information Extraction (IE) seeks to derive structured information from\nunstructured texts, often facing challenges in low-resource scenarios due to\ndata scarcity and unseen classes. This paper presents a review of neural\napproaches to low-resource IE from \\emph{traditional} and \\emph{LLM-based}\nperspectives, systematically categorizing them into a fine-grained taxonomy.\nThen we conduct empirical study on LLM-based methods compared with previous\nstate-of-the-art models, and discover that (1) well-tuned LMs are still\npredominant; (2) tuning open-resource LLMs and ICL with GPT family is promising\nin general; (3) the optimal LLM-based technical solution for low-resource IE\ncan be task-dependent. In addition, we discuss low-resource IE with LLMs,\nhighlight promising applications, and outline potential research directions.\nThis survey aims to foster understanding of this field, inspire new ideas, and\nencourage widespread applications in both academia and industry.\n","authors":["Shumin Deng","Yubo Ma","Ningyu Zhang","Yixin Cao","Bryan Hooi"],"pdf_url":"https://arxiv.org/pdf/2202.08063v5.pdf","comment":"Work in Progress. 
Paper List:\n \\url{https://github.com/zjunlp/Low-resource-KEPapers}; Data and Code: \\url{\n https://github.com/mayubo2333/LLM_project}"},{"id":"http://arxiv.org/abs/2312.01052v1","updated":"2023-12-02T07:40:21Z","published":"2023-12-02T07:40:21Z","title":"Structured, Complex and Time-complete Temporal Event Forecasting","summary":" Temporal event forecasting aims to predict what will happen next given the\nobserved events in history. Previous formulations of temporal event are\nunstructured, atomic, or lacking full temporal information, thus largely\nrestricting the representation quality and forecasting ability of temporal\nevents. To address these limitations, we introduce a novel formulation for\nStructured, Complex, and Time-complete Temporal Event (SCTc-TE). Based on this\nnew formulation, we develop a simple and fully automated pipeline for\nconstructing such SCTc-TEs from a large amount of news articles. Furthermore,\nwe propose a novel model that leverages both Local and Global contexts for\nSCTc-TE forecasting, named LoGo. To evaluate our model, we construct two\nlarge-scale datasets named MidEast-TE and GDELT-TE. Extensive evaluations\ndemonstrate the advantages of our datasets in multiple aspects, while\nexperimental results justify the effectiveness of our forecasting model LoGo.\nWe release the code and dataset via\nhttps://github.com/yecchen/GDELT-ComplexEvent.\n","authors":["Yunshan Ma","Chenchen Ye","Zijian Wu","Xiang Wang","Yixin Cao","Liang Pang","Tat-Seng Chua"],"pdf_url":"https://arxiv.org/pdf/2312.01052v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01007v1","updated":"2023-12-02T02:57:52Z","published":"2023-12-02T02:57:52Z","title":"A Hypergraph-Based Approach to Recommend Online Resources in a Library","summary":" When users in a digital library read or browse online resources, it generates\nan immense amount of data. If the underlying system can recommend items, such\nas books and journals, to the users, it will help them to find the related\nitems. This research analyzes a digital library's usage data to recommend items\nto its users, and it uses different clustering algorithms to design the\nrecommender system. We have used content-based clustering, including\nhierarchical, expectation maximization (EM), K-mean, FarthestFirst, and\ndensity-based clustering algorithms, and user access pattern-based clustering,\nwhich uses a hypergraph-based approach to generate the clusters. This research\nshows that the recommender system designed using the hypergraph algorithm\ngenerates the most accurate recommendation model compared to those designed\nusing the content-based clustering approaches.\n","authors":["Debashish Roy","Rajarshi Roy Chowdhury"],"pdf_url":"https://arxiv.org/pdf/2312.01007v1.pdf","comment":"12 Pages, 2 figures, and 1 table"},{"id":"http://arxiv.org/abs/2311.04625v2","updated":"2023-12-02T02:57:41Z","published":"2023-11-08T12:05:18Z","title":"A Comprehensive Summarization and Evaluation of Feature Refinement\n Modules for CTR Prediction","summary":" Click-through rate (CTR) prediction is widely used in academia and industry.\nMost CTR tasks fall into a feature embedding \\& feature interaction paradigm,\nwhere the accuracy of CTR prediction is mainly improved by designing practical\nfeature interaction structures. However, recent studies have argued that the\nfixed feature embedding learned only through the embedding layer limits the\nperformance of existing CTR models. 
Some works apply extra modules on top of\nthe embedding layer to dynamically refine feature representations in different\ninstances, making it effective and easy to integrate with existing CTR methods.\nDespite the promising results, there is a lack of a systematic review and\nsummarization of this new promising direction on the CTR task. To fill this\ngap, we comprehensively summarize and define a new module, namely\n\\textbf{feature refinement} (FR) module, that can be applied between feature\nembedding and interaction layers. We extract 14 FR modules from previous works,\nincluding instances where the FR module was proposed but not clearly defined or\nexplained. We fully assess the effectiveness and compatibility of existing FR\nmodules through comprehensive and extensive experiments with over 200 augmented\nmodels and over 4,000 runs for more than 15,000 GPU hours. The results offer\ninsightful guidelines for researchers, and all benchmarking code and\nexperimental results are open-sourced. In addition, we present a new\narchitecture of assigning independent FR modules to separate sub-networks for\nparallel CTR models, as opposed to the conventional method of inserting a\nshared FR module on top of the embedding layer. Our approach is also supported\nby comprehensive experiments demonstrating its effectiveness.\n","authors":["Fangye Wang","Hansu Gu","Dongsheng Li","Tun Lu","Peng Zhang","Li Shang","Ning Gu"],"pdf_url":"https://arxiv.org/pdf/2311.04625v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2103.16669v2","updated":"2023-12-02T13:13:38Z","published":"2021-03-30T20:28:02Z","title":"An In-depth Analysis of Passage-Level Label Transfer for Contextual\n Document Ranking","summary":" Pre-trained contextual language models such as BERT, GPT, and XLnet work\nquite well for document retrieval tasks. Such models are fine-tuned based on\nthe query-document/query-passage level relevance labels to capture the ranking\nsignals. However, the documents are longer than the passages and such document\nranking models suffer from the token limitation (512) of BERT. Researchers\nproposed ranking strategies that either truncate the documents beyond the token\nlimit or chunk the documents into units that can fit into the BERT. In the\nlater case, the relevance labels are either directly transferred from the\noriginal query-document pair or learned through some external model. In this\npaper, we conduct a detailed study of the design decisions about splitting and\nlabel transfer on retrieval effectiveness and efficiency. We find that direct\ntransfer of relevance labels from documents to passages introduces label noise\nthat strongly affects retrieval effectiveness for large training datasets. We\nalso find that query processing times are adversely affected by fine-grained\nsplitting schemes. 
As a remedy, we propose a careful passage level labelling\nscheme using weak supervision that delivers improved performance (3-14% in\nterms of nDCG score) over most of the recently proposed models for ad-hoc\nretrieval while maintaining manageable computational complexity on four diverse\ndocument retrieval datasets.\n","authors":["Koustav Rudra","Zeon Trevor Fernando","Avishek Anand"],"pdf_url":"https://arxiv.org/pdf/2103.16669v2.pdf","comment":"Paper is about the performance analysis of contextual ranking\n strategies in an ad-hoc document retrieval"}],"Multimedia":[{"id":"http://arxiv.org/abs/2208.00339v5","updated":"2023-12-02T12:11:01Z","published":"2022-07-31T02:23:24Z","title":"GraphMFT: A Graph Network based Multimodal Fusion Technique for Emotion\n Recognition in Conversation","summary":" Multimodal machine learning is an emerging area of research, which has\nreceived a great deal of scholarly attention in recent years. Up to now, there\nare few studies on multimodal Emotion Recognition in Conversation (ERC). Since\nGraph Neural Networks (GNNs) possess the powerful capacity of relational\nmodeling, they have an inherent advantage in the field of multimodal learning.\nGNNs leverage the graph constructed from multimodal data to perform intra- and\ninter-modal information interaction, which effectively facilitates the\nintegration and complementation of multimodal data. In this work, we propose a\nnovel Graph network based Multimodal Fusion Technique (GraphMFT) for emotion\nrecognition in conversation. Multimodal data can be modeled as a graph, where\neach data object is regarded as a node, and both intra- and inter-modal\ndependencies existing between data objects can be regarded as edges. GraphMFT\nutilizes multiple improved graph attention networks to capture intra-modal\ncontextual information and inter-modal complementary information. In addition,\nthe proposed GraphMFT attempts to address the challenges of existing\ngraph-based multimodal conversational emotion recognition models such as MMGCN.\nEmpirical results on two public multimodal datasets reveal that our model\noutperforms the State-Of-The-Art (SOTA) approaches with the accuracy of 67.90%\nand 61.30%.\n","authors":["Jiang Li","Xiaoping Wang","Guoqing Lv","Zhigang Zeng"],"pdf_url":"https://arxiv.org/pdf/2208.00339v5.pdf","comment":"Accepted by Neurocomputing"},{"id":"http://arxiv.org/abs/2312.01067v1","updated":"2023-12-02T08:33:34Z","published":"2023-12-02T08:33:34Z","title":"Painterly Reality: Enhancing Audience Experience with Paintings through\n Interactive Art","summary":" Perceiving paintings entails more than merely engaging the audience's eyes\nand brains; their perceptions and experiences of a painting can be intricately\nconnected with body movement. This paper proposes an interactive art approach\nentitled \"Painterly Reality\" that facilitates the perception and interaction\nwith paintings in a three-dimensional manner. Its objective is to promote\nbodily engagement with the painting (i.e., embedded body embodiment and its\nmovement and interaction) to enhance the audience's experience, while\nmaintaining its essence. Unlike two-dimensional interactions, this approach\nconstructs the Painterly Reality by capturing the audience's body embodiment in\nreal-time and embedding into a three-dimensional painterly world derived from a\ngiven painting input. 
Through their body embodiment, the audience can navigate\nthe painterly world and play with the magical realism (i.e., interactive\npainterly objects), fostering meaningful experiences via interactions. The\nPainterly Reality is subsequently projected through an Augmented Reality Mirror\nas a live painting and displayed in front of the audience. Hence, the audience\ncan gain enhanced experiences through bodily engagement while simultaneously\nviewing and appreciating the live painting. The paper implements the proposed\napproach as an interactive artwork, entitled \"Everyday Conjunctive,\" with Fong\nTse Ka's painting and installs in a local museum, which successfully enhances\naudience experience through bodily engagement.\n","authors":["Aven Le Zhou","Kang Zhang","David Yip"],"pdf_url":"https://arxiv.org/pdf/2312.01067v1.pdf","comment":"13 pages, 7 figures"},{"id":"http://arxiv.org/abs/2312.01017v1","updated":"2023-12-02T03:38:49Z","published":"2023-12-02T03:38:49Z","title":"Unveiling the Power of Audio-Visual Early Fusion Transformers with Dense\n Interactions through Masked Modeling","summary":" Humans possess a remarkable ability to integrate auditory and visual\ninformation, enabling a deeper understanding of the surrounding environment.\nThis early fusion of audio and visual cues, demonstrated through cognitive\npsychology and neuroscience research, offers promising potential for developing\nmultimodal perception models. However, training early fusion architectures\nposes significant challenges, as the increased model expressivity requires\nrobust learning frameworks to harness their enhanced capabilities. In this\npaper, we address this challenge by leveraging the masked reconstruction\nframework, previously successful in unimodal settings, to train audio-visual\nencoders with early fusion. Additionally, we propose an attention-based fusion\nmodule that captures interactions between local audio and visual\nrepresentations, enhancing the model's ability to capture fine-grained\ninteractions. While effective, this procedure can become computationally\nintractable, as the number of local representations increases. Thus, to address\nthe computational complexity, we propose an alternative procedure that\nfactorizes the local representations before representing audio-visual\ninteractions. Extensive evaluations on a variety of datasets demonstrate the\nsuperiority of our approach in audio-event classification, visual sound\nlocalization, sound separation, and audio-visual segmentation. These\ncontributions enable the efficient training of deeply integrated audio-visual\nmodels and significantly advance the usefulness of early fusion architectures.\n","authors":["Shentong Mo","Pedro Morgado"],"pdf_url":"https://arxiv.org/pdf/2312.01017v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11284v3","updated":"2023-12-02T02:57:54Z","published":"2023-11-19T09:59:09Z","title":"LucidDreamer: Towards High-Fidelity Text-to-3D Generation via Interval\n Score Matching","summary":" The recent advancements in text-to-3D generation mark a significant milestone\nin generative models, unlocking new possibilities for creating imaginative 3D\nassets across various real-world scenarios. While recent advancements in\ntext-to-3D generation have shown promise, they often fall short in rendering\ndetailed and high-quality 3D models. This problem is especially prevalent as\nmany methods base themselves on Score Distillation Sampling (SDS). 
This paper\nidentifies a notable deficiency in SDS, that it brings inconsistent and\nlow-quality updating direction for the 3D model, causing the over-smoothing\neffect. To address this, we propose a novel approach called Interval Score\nMatching (ISM). ISM employs deterministic diffusing trajectories and utilizes\ninterval-based score matching to counteract over-smoothing. Furthermore, we\nincorporate 3D Gaussian Splatting into our text-to-3D generation pipeline.\nExtensive experiments show that our model largely outperforms the\nstate-of-the-art in quality and training efficiency.\n","authors":["Yixun Liang","Xin Yang","Jiantao Lin","Haodong Li","Xiaogang Xu","Yingcong Chen"],"pdf_url":"https://arxiv.org/pdf/2311.11284v3.pdf","comment":"The first two authors contributed equally to this work. Our code will\n be available at: https://github.com/EnVision-Research/LucidDreamer"}]},"2023-12-05T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2312.02974v1","updated":"2023-12-05T18:59:16Z","published":"2023-12-05T18:59:16Z","title":"Describing Differences in Image Sets with Natural Language","summary":" How do two sets of images differ? Discerning set-level differences is crucial\nfor understanding model behaviors and analyzing datasets, yet manually sifting\nthrough thousands of images is impractical. To aid in this discovery process,\nwe explore the task of automatically describing the differences between two\n$\\textbf{sets}$ of images, which we term Set Difference Captioning. This task\ntakes in image sets $D_A$ and $D_B$, and outputs a description that is more\noften true on $D_A$ than $D_B$. We outline a two-stage approach that first\nproposes candidate difference descriptions from image sets and then re-ranks\nthe candidates by checking how well they can differentiate the two sets. We\nintroduce VisDiff, which first captions the images and prompts a language model\nto propose candidate descriptions, then re-ranks these descriptions using CLIP.\nTo evaluate VisDiff, we collect VisDiffBench, a dataset with 187 paired image\nsets with ground truth difference descriptions. We apply VisDiff to various\ndomains, such as comparing datasets (e.g., ImageNet vs. ImageNetV2), comparing\nclassification models (e.g., zero-shot CLIP vs. supervised ResNet), summarizing\nmodel failure modes (supervised ResNet), characterizing differences between\ngenerative models (e.g., StableDiffusionV1 and V2), and discovering what makes\nimages memorable. Using VisDiff, we are able to find interesting and previously\nunknown differences in datasets and models, demonstrating its utility in\nrevealing nuanced insights.\n","authors":["Lisa Dunlap","Yuhui Zhang","Xiaohan Wang","Ruiqi Zhong","Trevor Darrell","Jacob Steinhardt","Joseph E. Gonzalez","Serena Yeung-Levy"],"pdf_url":"https://arxiv.org/pdf/2312.02974v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02969v1","updated":"2023-12-05T18:57:40Z","published":"2023-12-05T18:57:40Z","title":"Rank-without-GPT: Building GPT-Independent Listwise Rerankers on\n Open-Source Large Language Models","summary":" Listwise rerankers based on large language models (LLM) are the zero-shot\nstate-of-the-art. However, current works in this direction all depend on the\nGPT models, making it a single point of failure in scientific reproducibility.\nMoreover, it raises the concern that the current research findings only hold\nfor GPT models but not LLM in general. 
In this work, we lift this pre-condition\nand build for the first time effective listwise rerankers without any form of\ndependency on GPT. Our passage retrieval experiments show that our best list se\nreranker surpasses the listwise rerankers based on GPT-3.5 by 13% and achieves\n97% effectiveness of the ones built on GPT-4. Our results also show that the\nexisting training datasets, which were expressly constructed for pointwise\nranking, are insufficient for building such listwise rerankers. Instead,\nhigh-quality listwise ranking data is required and crucial, calling for further\nwork on building human-annotated listwise data resources.\n","authors":["Xinyu Zhang","Sebastian Hofstätter","Patrick Lewis","Raphael Tang","Jimmy Lin"],"pdf_url":"https://arxiv.org/pdf/2312.02969v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02931v1","updated":"2023-12-05T18:03:13Z","published":"2023-12-05T18:03:13Z","title":"WhisBERT: Multimodal Text-Audio Language Modeling on 100M Words","summary":" Training on multiple modalities of input can augment the capabilities of a\nlanguage model. Here, we ask whether such a training regime can improve the\nquality and efficiency of these systems as well. We focus on text--audio and\nintroduce Whisbert, which is inspired by the text--image approach of FLAVA\n\\citep{singh_flava_2022}. In accordance with Babylm \\citep{warstadt2023papers}\nguidelines, we pretrain Whisbert on a dataset comprising only 100 million words\nplus their corresponding speech from the word-aligned version of the People's\nSpeech dataset \\citep{galvez_peoples_2021}. To assess the impact of\nmultimodality, we compare versions of the model that are trained on text only\nand on both audio and text simultaneously. We find that while Whisbert is able\nto perform well on multimodal masked modeling and surpasses the Babylm\nbaselines in most benchmark tasks, it struggles to optimize its complex\nobjective and outperform its text-only Whisbert baseline.\n","authors":["Lukas Wolf","Klemen Kotar","Greta Tuckute","Eghbal Hosseini","Tamar Regev","Ethan Wilcox","Alex Warstadt"],"pdf_url":"https://arxiv.org/pdf/2312.02931v1.pdf","comment":"Published at the BabyLM Challenge, a shared task co-sponsored by CMCL\n 2023 and CoNLL 2023, hosted by EMNLP 2023"},{"id":"http://arxiv.org/abs/2312.01678v2","updated":"2023-12-05T18:02:46Z","published":"2023-12-04T07:01:54Z","title":"Jellyfish: A Large Language Model for Data Preprocessing","summary":" In this paper, we present Jellyfish, an open-source LLM as a universal task\nsolver for DP. Built on the Llama 2 13B model, Jellyfish is instruction-tuned\nwith the datasets of several typical DP tasks including error detection, data\nimputation, schema matching, and entity matching, and delivers generalizability\nto other tasks. Remarkably, Jellyfish can operate on a local, single, and\nlow-priced GPU with its 13 billion parameters, ensuring data security and\nenabling further tuning. Its proficiency in understanding natural language\nallows users to manually craft instructions for DP tasks. Unlike many existing\nmethods that heavily rely on prior knowledge, Jellyfish acquires domain\nknowledge during its tuning process and integrates optional knowledge injection\nduring inference. A distinctive feature of Jellyfish is its interpreter, which\nelucidates its output decisions. To construct Jellyfish, we develop a series of\npre-tuning and DP-tuning techniques. 
Jellyfish is equipped with an instance\nserializer, which automatically translates raw data into model prompts, and a\nknowledge injector, which optionally introduces task- and dataset-specific\nknowledge to enhance DP performance. Our evaluation of Jellyfish, using a range\nof real datasets, shows its competitiveness compared to state-of-the-art\nmethods and its strong generalizability to unseen tasks. Jellyfish's\nperformance rivals that of GPT series models, and its interpreter offers\nenhanced reasoning capabilities compared to GPT-3.5. Furthermore, our\nevaluation highlights the effectiveness of the techniques employed in\nconstructing Jellyfish. Our model is available at Hugging Face:\nhttps://huggingface.co/NECOUDBFM/Jellyfish .\n","authors":["Haochen Zhang","Yuyang Dong","Chuan Xiao","Masafumi Oyamada"],"pdf_url":"https://arxiv.org/pdf/2312.01678v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02913v1","updated":"2023-12-05T17:38:02Z","published":"2023-12-05T17:38:02Z","title":"Let the LLMs Talk: Simulating Human-to-Human Conversational QA via\n Zero-Shot LLM-to-LLM Interactions","summary":" Conversational question-answering (CQA) systems aim to create interactive\nsearch systems that effectively retrieve information by interacting with users.\nTo replicate human-to-human conversations, existing work uses human annotators\nto play the roles of the questioner (student) and the answerer (teacher).\nDespite its effectiveness, challenges exist as human annotation is\ntime-consuming, inconsistent, and not scalable. To address this issue and\ninvestigate the applicability of large language models (LLMs) in CQA\nsimulation, we propose a simulation framework that employs zero-shot learner\nLLMs for simulating teacher-student interactions. Our framework involves two\nLLMs interacting on a specific topic, with the first LLM acting as a student,\ngenerating questions to explore a given search topic. The second LLM plays the\nrole of a teacher by answering questions and is equipped with additional\ninformation, including a text on the given topic. We implement both the student\nand teacher by zero-shot prompting the GPT-4 model. To assess the effectiveness\nof LLMs in simulating CQA interactions and understand the disparities between\nLLM- and human-generated conversations, we evaluate the simulated data from\nvarious perspectives. We begin by evaluating the teacher's performance through\nboth automatic and human assessment. Next, we evaluate the performance of the\nstudent, analyzing and comparing the disparities between questions generated by\nthe LLM and those generated by humans. Furthermore, we conduct extensive\nanalyses to thoroughly examine the LLM performance by benchmarking\nstate-of-the-art reading comprehension models on both datasets. Our results\nreveal that the teacher LLM generates lengthier answers that tend to be more\naccurate and complete. 
The student LLM generates more diverse questions,\ncovering more aspects of a given topic.\n","authors":["Zahra Abbasiantaeb","Yifei Yuan","Evangelos Kanoulas","Mohammad Aliannejadi"],"pdf_url":"https://arxiv.org/pdf/2312.02913v1.pdf","comment":"Accepted at WSDM 2024"},{"id":"http://arxiv.org/abs/2312.02901v1","updated":"2023-12-05T17:15:16Z","published":"2023-12-05T17:15:16Z","title":"Concept Drift Adaptation in Text Stream Mining Settings: A Comprehensive\n Review","summary":" Due to the advent and increase in the popularity of the Internet, people have\nbeen producing and disseminating textual data in several ways, such as reviews,\nsocial media posts, and news articles. As a result, numerous researchers have\nbeen working on discovering patterns in textual data, especially because social\nmedia posts function as social sensors, indicating peoples' opinions,\ninterests, etc. However, most tasks regarding natural language processing are\naddressed using traditional machine learning methods and static datasets. This\nsetting can lead to several problems, such as an outdated dataset, which may\nnot correspond to reality, and an outdated model, which has its performance\ndegrading over time. Concept drift is another aspect that emphasizes these\nissues, which corresponds to data distribution and pattern changes. In a text\nstream scenario, it is even more challenging due to its characteristics, such\nas the high speed and data arriving sequentially. In addition, models for this\ntype of scenario must adhere to the constraints mentioned above while learning\nfrom the stream by storing texts for a limited time and consuming low memory.\nIn this study, we performed a systematic literature review regarding concept\ndrift adaptation in text stream scenarios. Considering well-defined criteria,\nwe selected 40 papers to unravel aspects such as text drift categories, types\nof text drift detection, model update mechanism, the addressed stream mining\ntasks, types of text representations, and text representation update mechanism.\nIn addition, we discussed drift visualization and simulation and listed\nreal-world datasets used in the selected papers. Therefore, this paper\ncomprehensively reviews the concept drift adaptation in text stream mining\nscenarios.\n","authors":["Cristiano Mesquita Garcia","Ramon Simoes Abilio","Alessandro Lameiras Koerich","Alceu de Souza Britto Jr.","Jean Paul Barddal"],"pdf_url":"https://arxiv.org/pdf/2312.02901v1.pdf","comment":"49 pages"},{"id":"http://arxiv.org/abs/2311.16989v3","updated":"2023-12-05T16:58:46Z","published":"2023-11-28T17:44:51Z","title":"ChatGPT's One-year Anniversary: Are Open-Source Large Language Models\n Catching up?","summary":" Upon its release in late 2022, ChatGPT has brought a seismic shift in the\nentire landscape of AI, both in research and commerce. Through\ninstruction-tuning a large language model (LLM) with supervised fine-tuning and\nreinforcement learning from human feedback, it showed that a model could answer\nhuman questions and follow instructions on a broad panel of tasks. Following\nthis success, interests in LLMs have intensified, with new LLMs flourishing at\nfrequent interval across academia and industry, including many start-ups\nfocused on LLMs. While closed-source LLMs (e.g., OpenAI's GPT, Anthropic's\nClaude) generally outperform their open-source counterparts, the progress on\nthe latter has been rapid with claims of achieving parity or even better on\ncertain tasks. 
This has crucial implications not only on research but also on\nbusiness. In this work, on the first anniversary of ChatGPT, we provide an\nexhaustive overview of this success, surveying all tasks where an open-source\nLLM has claimed to be on par or better than ChatGPT.\n","authors":["Hailin Chen","Fangkai Jiao","Xingxuan Li","Chengwei Qin","Mathieu Ravaut","Ruochen Zhao","Caiming Xiong","Shafiq Joty"],"pdf_url":"https://arxiv.org/pdf/2311.16989v3.pdf","comment":"version v3"},{"id":"http://arxiv.org/abs/2312.02869v1","updated":"2023-12-05T16:36:27Z","published":"2023-12-05T16:36:27Z","title":"Can a Tabula Recta provide security in the XXI century?","summary":" In the not so unlikely scenario of total compromise of computers accessible\nto a group of users, they might be tempted to resort to human-computable\npaper-and-pencil cryptographic methods aided by a classic Tabula Recta, which\nhelps to perform addition and subtraction directly with letters. But do these\nclassic algorithms, or some new ones using the same simple tools, have any\nchance against computer-aided cryptanalysis? In this paper I discuss how some\nhuman-computable algorithms can indeed afford sufficient security in this\nsituation, drawing conclusions from computer-based statistical analysis. Three\nkinds of algorithms are discussed: those that concentrate entropy from shared\ntext sources, stream ciphers based on arithmetic of non-binary spaces, and\nhash-like algorithms that may be used to generate a password from a challenge\ntext.\n","authors":["Francisco Ruiz"],"pdf_url":"https://arxiv.org/pdf/2312.02869v1.pdf","comment":"18 pages, 3 figures"},{"id":"http://arxiv.org/abs/2310.20138v2","updated":"2023-12-05T16:14:24Z","published":"2023-10-31T03:09:36Z","title":"DEPN: Detecting and Editing Privacy Neurons in Pretrained Language\n Models","summary":" Large language models pretrained on a huge amount of data capture rich\nknowledge and information in the training data. The ability of data\nmemorization and regurgitation in pretrained language models, revealed in\nprevious studies, brings the risk of data leakage. In order to effectively\nreduce these risks, we propose a framework DEPN to Detect and Edit Privacy\nNeurons in pretrained language models, partially inspired by knowledge neurons\nand model editing. In DEPN, we introduce a novel method, termed as privacy\nneuron detector, to locate neurons associated with private information, and\nthen edit these detected privacy neurons by setting their activations to zero.\nFurthermore, we propose a privacy neuron aggregator dememorize private\ninformation in a batch processing manner. Experimental results show that our\nmethod can significantly and efficiently reduce the exposure of private data\nleakage without deteriorating the performance of the model. 
Additionally, we\nempirically demonstrate the relationship between model memorization and privacy\nneurons, from multiple perspectives, including model size, training time,\nprompts, privacy neuron distribution, illustrating the robustness of our\napproach.\n","authors":["Xinwei Wu","Junzhuo Li","Minghui Xu","Weilong Dong","Shuangzhi Wu","Chao Bian","Deyi Xiong"],"pdf_url":"https://arxiv.org/pdf/2310.20138v2.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2311.18743v3","updated":"2023-12-05T16:04:15Z","published":"2023-11-30T17:41:30Z","title":"AlignBench: Benchmarking Chinese Alignment of Large Language Models","summary":" Alignment has become a critical step for instruction-tuned Large Language\nModels (LLMs) to become helpful assistants. However, effective evaluation of\nalignment for emerging Chinese LLMs is still significantly lacking, calling for\nreal-scenario grounded, open-ended, challenging and automatic evaluations\ntailored for alignment. To fill in this gap, we introduce AlignBench, a\ncomprehensive multi-dimensional benchmark for evaluating LLMs' alignment in\nChinese. Equipped with a human-in-the-loop data curation pipeline, our\nbenchmark employs a rule-calibrated multi-dimensional LLM-as-Judge with\nChain-of-Thought to generate explanations and final ratings as evaluations,\nensuring high reliability and interpretability. Furthermore, we report\nAlignBench evaluated by CritiqueLLM, a dedicated Chinese evaluator LLM that\nrecovers 95% of GPT-4's evaluation ability. We will provide public APIs for\nevaluating AlignBench with CritiqueLLM to facilitate the evaluation of LLMs'\nChinese alignment. All evaluation codes, data, and LLM generations are\navailable at \\url{https://github.com/THUDM/AlignBench}.\n","authors":["Xiao Liu","Xuanyu Lei","Shengyuan Wang","Yue Huang","Zhuoer Feng","Bosi Wen","Jiale Cheng","Pei Ke","Yifan Xu","Weng Lam Tam","Xiaohan Zhang","Lichao Sun","Hongning Wang","Jing Zhang","Minlie Huang","Yuxiao Dong","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2311.18743v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04743v2","updated":"2023-12-05T15:05:58Z","published":"2023-06-07T19:37:55Z","title":"ScienceBenchmark: A Complex Real-World Benchmark for Evaluating Natural\n Language to SQL Systems","summary":" Natural Language to SQL systems (NL-to-SQL) have recently shown a significant\nincrease in accuracy for natural language to SQL query translation. This\nimprovement is due to the emergence of transformer-based language models, and\nthe popularity of the Spider benchmark - the de-facto standard for evaluating\nNL-to-SQL systems. The top NL-to-SQL systems reach accuracies of up to 85\\%.\nHowever, Spider mainly contains simple databases with few tables, columns, and\nentries, which does not reflect a realistic setting. Moreover, complex\nreal-world databases with domain-specific content have little to no training\ndata available in the form of NL/SQL-pairs leading to poor performance of\nexisting NL-to-SQL systems.\n In this paper, we introduce ScienceBenchmark, a new complex NL-to-SQL\nbenchmark for three real-world, highly domain-specific databases. For this new\nbenchmark, SQL experts and domain experts created high-quality NL/SQL-pairs for\neach domain. To garner more data, we extended the small amount of\nhuman-generated data with synthetic data generated using GPT-3. We show that\nour benchmark is highly challenging, as the top performing systems on Spider\nachieve a very low performance on our benchmark. 
Thus, the challenge is\nmany-fold: creating NL-to-SQL systems for highly complex domains with a small\namount of hand-made training data augmented with synthetic data. To our\nknowledge, ScienceBenchmark is the first NL-to-SQL benchmark designed with\ncomplex real-world scientific databases, containing challenging training and\ntest data carefully validated by domain experts.\n","authors":["Yi Zhang","Jan Deriu","George Katsogiannis-Meimarakis","Catherine Kosten","Georgia Koutrika","Kurt Stockinger"],"pdf_url":"https://arxiv.org/pdf/2306.04743v2.pdf","comment":"12 pages, 2 figures, 5 tables"},{"id":"http://arxiv.org/abs/2312.02820v1","updated":"2023-12-05T15:03:27Z","published":"2023-12-05T15:03:27Z","title":"Clustering Pseudo Language Family in Multilingual Translation Models\n with Fisher Information Matrix","summary":" In multilingual translation research, the comprehension and utilization of\nlanguage families are of paramount importance. Nevertheless, clustering\nlanguages based solely on their ancestral families can yield suboptimal results\ndue to variations in the datasets employed during the model's training phase.\nTo mitigate this challenge, we introduce an innovative method that leverages\nthe fisher information matrix (FIM) to cluster language families, anchored on\nthe multilingual translation model's characteristics. We hypothesize that\nlanguage pairs with similar effects on model parameters exhibit a considerable\ndegree of linguistic congruence and should thus be grouped cohesively. This\nconcept has led us to define pseudo language families. We provide an in-depth\ndiscussion regarding the inception and application of these pseudo language\nfamilies. Empirical evaluations reveal that employing these pseudo language\nfamilies enhances performance over conventional language families in adapting a\nmultilingual translation model to unfamiliar language pairs. The proposed\nmethodology may also be extended to scenarios requiring language similarity\nmeasurements. The source code and associated scripts can be accessed at\nhttps://github.com/ecoli-hit/PseudoFamily.\n","authors":["Xinyu Ma","Xuebo Liu","Min Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.02820v1.pdf","comment":"Accepted to EMNLP 2023"},{"id":"http://arxiv.org/abs/2311.15218v2","updated":"2023-12-05T14:49:36Z","published":"2023-11-26T07:19:10Z","title":"Real-Time Online Stock Forecasting Utilizing Integrated Quantitative and\n Qualitative Analysis","summary":" The application of Machine learning to finance has become a familiar\napproach, even more so in stock market forecasting. The stock market is highly\nvolatile and huge amounts of data are generated every minute globally. The\nextraction of effective intelligence from this data is of critical importance.\nHowever, a collaboration of numerical stock data with qualitative text data can\nbe a challenging task. In this work, we accomplish this and provide an\nunprecedented, publicly available dataset with technical and fundamental data,\nsentiment that we gathered from News Archives, TV news captions, Radio\nTranscripts, Tweets, Daily financial newspapers, etc. The text data entries\nused for sentiment extraction total more than 1.4 Million. The dataset consists\nof daily entries from January 2018 to December 2022 for 8 companies\nrepresenting diverse industrial sectors and the Dow Jones Industrial Average\n(DJIA) as a whole. Holistic Fundamental and Technical data is provided training\nready for Model learning and deployment. 
The data generated could be used for\nincremental online learning with real-time data points retrieved daily; since\nno stagnant data was utilized, all the data was retrieved from APIs or\nself-designed scripts. Moreover, the utilization of Spearman's rank correlation\nover real-time data, linking stock returns with sentiment analysis, has produced\nnoteworthy results for the DJIA, achieving accuracy levels surpassing 60\%. The\ndataset is made available at https://github.com/batking24/Huge-Stock-Dataset\n","authors":["Sai Akash Bathini","Dagli Cihan"],"pdf_url":"https://arxiv.org/pdf/2311.15218v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02803v1","updated":"2023-12-05T14:44:08Z","published":"2023-12-05T14:44:08Z","title":"Leveraging Domain Adaptation and Data Augmentation to Improve Qur'anic\n IR in English and Arabic","summary":" In this work, we approach the problem of Qur'anic information retrieval (IR)\nin Arabic and English. Using the latest state-of-the-art methods in neural IR,\nwe research what helps to tackle this task more efficiently. Training retrieval\nmodels requires a lot of data, which is difficult to obtain for training\nin-domain. Therefore, we commence with training on a large amount of general\ndomain data and then continue training on in-domain data. To handle the lack of\nin-domain data, we employed a data augmentation technique, which considerably\nimproved results in MRR@10 and NDCG@5 metrics, setting the state-of-the-art in\nQur'anic IR for both English and Arabic. The absence of an Islamic corpus and\ndomain-specific model for the IR task in English motivated us to address this lack\nof resources and take preliminary steps towards Islamic corpus compilation and\ndomain-specific language model (LM) pre-training, which helped to improve the\nperformance of the retrieval models that use the domain-specific LM as the\nshared backbone. We examined several language models (LMs) in Arabic to select\none that efficiently deals with the Qur'anic IR task. Besides transferring\nsuccessful experiments from English to Arabic, we conducted additional\nexperiments with the retrieval task in Arabic to mitigate the scarcity of general\ndomain datasets used to train the retrieval models. Handling the Qur'anic IR task\nby combining English and Arabic allowed us to enhance the comparison and share\nvaluable insights across models and languages.\n","authors":["Vera Pavlova"],"pdf_url":"https://arxiv.org/pdf/2312.02803v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02798v1","updated":"2023-12-05T14:35:11Z","published":"2023-12-05T14:35:11Z","title":"Weakly Supervised Detection of Hallucinations in LLM Activations","summary":" We propose an auditing method to identify whether a large language model\n(LLM) encodes patterns such as hallucinations in its internal states, which may\npropagate to downstream tasks. We introduce a weakly supervised auditing\ntechnique using a subset scanning approach to detect anomalous patterns in LLM\nactivations from pre-trained models. Importantly, our method does not need\nknowledge of the type of patterns a priori. Instead, it relies on a reference\ndataset devoid of anomalies during testing. Further, our approach enables the\nidentification of pivotal nodes responsible for encoding these patterns, which\nmay offer crucial insights for fine-tuning specific sub-networks for bias\nmitigation. 
We introduce two new scanning methods to handle LLM activations for\nanomalous sentences that may deviate from the expected distribution in either\ndirection. Our results confirm prior findings of BERT's limited internal\ncapacity for encoding hallucinations, while OPT appears capable of encoding\nhallucination information internally. Importantly, our scanning approach,\nwithout prior exposure to false statements, performs comparably to a fully\nsupervised out-of-distribution classifier.\n","authors":["Miriam Rateike","Celia Cintas","John Wamburu","Tanya Akumu","Skyler Speakman"],"pdf_url":"https://arxiv.org/pdf/2312.02798v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02783v1","updated":"2023-12-05T14:14:27Z","published":"2023-12-05T14:14:27Z","title":"Large Language Models on Graphs: A Comprehensive Survey","summary":" Large language models (LLMs), such as ChatGPT and LLaMA, are creating\nsignificant advancements in natural language processing, due to their strong\ntext encoding/decoding ability and newly found emergent capability (e.g.,\nreasoning). While LLMs are mainly designed to process pure texts, there are\nmany real-world scenarios where text data are associated with rich structure\ninformation in the form of graphs (e.g., academic networks, and e-commerce\nnetworks) or scenarios where graph data are paired with rich textual\ninformation (e.g., molecules with descriptions). Besides, although LLMs have\nshown their pure text-based reasoning ability, it is underexplored whether such\nability can be generalized to graph scenarios (i.e., graph-based reasoning). In\nthis paper, we provide a systematic review of scenarios and techniques related\nto large language models on graphs. We first summarize potential scenarios of\nadopting LLMs on graphs into three categories, namely pure graphs, text-rich\ngraphs, and text-paired graphs. We then discuss detailed techniques for\nutilizing LLMs on graphs, including LLM as Predictor, LLM as Encoder, and LLM\nas Aligner, and compare the advantages and disadvantages of different schools\nof models. Furthermore, we mention the real-world applications of such methods\nand summarize open-source codes and benchmark datasets. Finally, we conclude\nwith potential future research directions in this fast-growing field. The\nrelated source can be found at\nhttps://github.com/PeterGriffinJin/Awesome-Language-Model-on-Graphs.\n","authors":["Bowen Jin","Gang Liu","Chi Han","Meng Jiang","Heng Ji","Jiawei Han"],"pdf_url":"https://arxiv.org/pdf/2312.02783v1.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2312.02780v1","updated":"2023-12-05T14:12:15Z","published":"2023-12-05T14:12:15Z","title":"Scaling Laws for Adversarial Attacks on Language Model Activations","summary":" We explore a class of adversarial attacks targeting the activations of\nlanguage models. By manipulating a relatively small subset of model\nactivations, $a$, we demonstrate the ability to control the exact prediction of\na significant number (in some cases up to 1000) of subsequent tokens $t$. We\nempirically verify a scaling law where the maximum number of target tokens\n$t_\\mathrm{max}$ predicted depends linearly on the number of tokens $a$ whose\nactivations the attacker controls as $t_\\mathrm{max} = \\kappa a$. 
We find that\nthe number of bits of control in the input space needed to control a single bit\nin the output space (what we call attack resistance $\\chi$) is remarkably\nconstant between $\\approx 16$ and $\\approx 25$ over 2 orders of magnitude of\nmodel sizes for different language models. Compared to attacks on tokens,\nattacks on activations are predictably much stronger, however, we identify a\nsurprising regularity where one bit of input steered either via activations or\nvia tokens is able to exert control over a similar amount of output bits. This\ngives support for the hypothesis that adversarial attacks are a consequence of\ndimensionality mismatch between the input and output spaces. A practical\nimplication of the ease of attacking language model activations instead of\ntokens is for multi-modal and selected retrieval models, where additional data\nsources are added as activations directly, sidestepping the tokenized input.\nThis opens up a new, broad attack surface. By using language models as a\ncontrollable test-bed to study adversarial attacks, we were able to experiment\nwith input-output dimensions that are inaccessible in computer vision,\nespecially where the output dimension dominates.\n","authors":["Stanislav Fort"],"pdf_url":"https://arxiv.org/pdf/2312.02780v1.pdf","comment":"15 pages, 9 figures"},{"id":"http://arxiv.org/abs/2312.02748v1","updated":"2023-12-05T13:23:15Z","published":"2023-12-05T13:23:15Z","title":"Compositional Generalization for Data-to-Text Generation","summary":" Data-to-text generation involves transforming structured data, often\nrepresented as predicate-argument tuples, into coherent textual descriptions.\nDespite recent advances, systems still struggle when confronted with unseen\ncombinations of predicates, producing unfaithful descriptions (e.g.\nhallucinations or omissions). We refer to this issue as compositional\ngeneralisation, and it encouraged us to create a benchmark for assessing the\nperformance of different approaches on this specific problem. Furthermore, we\npropose a novel model that addresses compositional generalization by clustering\npredicates into groups. Our model generates text in a sentence-by-sentence\nmanner, relying on one cluster of predicates at a time. This approach\nsignificantly outperforms T5~baselines across all evaluation metrics.Notably,\nit achieved a 31% improvement over T5 in terms of a metric focused on\nmaintaining faithfulness to the input.\n","authors":["Xinnuo Xu","Ivan Titov","Mirella Lapata"],"pdf_url":"https://arxiv.org/pdf/2312.02748v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07989v3","updated":"2023-12-05T13:09:06Z","published":"2023-11-14T08:34:26Z","title":"Unifying the Perspectives of NLP and Software Engineering: A Survey on\n Language Models for Code","summary":" In this work we systematically review the recent advancements in code\nprocessing with language models, covering 50+ models, 30+ evaluation tasks,\n170+ datasets, and 700 related works. We break down code processing models into\ngeneral language models represented by the GPT family and specialized models\nthat are specifically pretrained on code, often with tailored objectives. We\ndiscuss the relations and differences between these models, and highlight the\nhistorical transition of code modeling from statistical models and RNNs to\npretrained Transformers and LLMs, which is exactly the same course that had\nbeen taken by NLP. 
We also discuss code-specific features such as AST, CFG, and\nunit tests, along with their application in training code language models, and\nidentify key challenges and potential future directions in this domain. We keep\nthe survey open and updated on GitHub at\nhttps://github.com/codefuse-ai/Awesome-Code-LLM.\n","authors":["Ziyin Zhang","Chaoyu Chen","Bingchang Liu","Cong Liao","Zi Gong","Hang Yu","Jianguo Li","Rui Wang"],"pdf_url":"https://arxiv.org/pdf/2311.07989v3.pdf","comment":"Repo is available at https://github.com/codefuse-ai/Awesome-Code-LLM.\n 8 figures, 9 tables, and 694 references"},{"id":"http://arxiv.org/abs/2312.02730v1","updated":"2023-12-05T12:48:04Z","published":"2023-12-05T12:48:04Z","title":"Towards Measuring Representational Similarity of Large Language Models","summary":" Understanding the similarity of the numerous released large language models\n(LLMs) has many uses, e.g., simplifying model selection, detecting illegal\nmodel reuse, and advancing our understanding of what makes LLMs perform well.\nIn this work, we measure the similarity of representations of a set of LLMs\nwith 7B parameters. Our results suggest that some LLMs are substantially\ndifferent from others. We identify challenges of using representational\nsimilarity measures that suggest the need of careful study of similarity scores\nto avoid false conclusions.\n","authors":["Max Klabunde","Mehdi Ben Amor","Michael Granitzer","Florian Lemmerich"],"pdf_url":"https://arxiv.org/pdf/2312.02730v1.pdf","comment":"Extended abstract in UniReps Workshop @ NeurIPS 2023"},{"id":"http://arxiv.org/abs/2306.11300v4","updated":"2023-12-05T12:23:36Z","published":"2023-06-20T05:30:59Z","title":"RS5M: A Large Scale Vision-Language Dataset for Remote Sensing\n Vision-Language Foundation Model","summary":" Pre-trained Vision-Language Models (VLMs) utilizing extensive image-text\npaired data have demonstrated unprecedented image-text association\ncapabilities, achieving remarkable results across various downstream tasks. A\ncritical challenge is how to make use of existing large-scale pre-trained VLMs,\nwhich are trained on common objects, to perform the domain-specific transfer\nfor accomplishing domain-related downstream tasks. In this paper, we propose a\nnew framework that includes the Domain pre-trained Vision-Language Model (DVLM),\nbridging the gap between the General Vision-Language Model (GVLM) and\ndomain-specific downstream tasks. Moreover, we present an image-text paired\ndataset in the field of remote\nsensing (RS), RS5M, which has 5 million RS images with English descriptions.\nThe dataset is obtained from filtering publicly available image-text paired\ndatasets and captioning label-only RS datasets with pre-trained VLM. These\nconstitute the first large-scale RS image-text paired dataset. Additionally, we\nfine-tuned the CLIP model and tried several Parameter-Efficient Fine-Tuning\nmethods on RS5M to implement the DVLM. Experimental results show that our\nproposed dataset is highly effective for various tasks, and our model GeoRSCLIP\nimproves upon the baseline or previous state-of-the-art model by $3\\%\\sim20\\%$\nin Zero-shot Classification (ZSC), $3\\%\\sim6\\%$ in Remote Sensing Cross-Modal\nText-Image Retrieval (RSCTIR) and $4\\%\\sim5\\%$ in Semantic Localization (SeLo)\ntasks. 
Dataset and models have been released in:\n\\url{https://github.com/om-ai-lab/RS5M}.\n","authors":["Zilun Zhang","Tiancheng Zhao","Yulong Guo","Jianwei Yin"],"pdf_url":"https://arxiv.org/pdf/2306.11300v4.pdf","comment":"RS5M dataset v5"},{"id":"http://arxiv.org/abs/2312.02706v1","updated":"2023-12-05T12:07:30Z","published":"2023-12-05T12:07:30Z","title":"Large Knowledge Model: Perspectives and Challenges","summary":" Humankind's understanding of the world is fundamentally linked to our\nperception and cognition, with \\emph{human languages} serving as one of the\nmajor carriers of \\emph{world knowledge}. In this vein, \\emph{Large Language\nModels} (LLMs) like ChatGPT epitomize the pre-training of extensive,\nsequence-based world knowledge into neural networks, facilitating the\nprocessing and manipulation of this knowledge in a parametric space. This\narticle explores large models through the lens of ``knowledge''. We initially\ninvestigate the role of symbolic knowledge such as Knowledge Graphs (KGs) in\nenhancing LLMs, covering aspects like knowledge-augmented language model,\nstructure-inducing pre-training, knowledgeable prompts, structured CoT,\nknowledge editing, semantic tools for LLM and knowledgeable AI agents.\nSubsequently, we examine how LLMs can amplify traditional symbolic knowledge\nbases, encompassing aspects like using LLM as KG builder and controller,\nstructured knowledge pretraining, LLM-enhanced symbolic reasoning, and the\namalgamation of perception with cognition. Considering the intricate nature of\nhuman knowledge, we advocate for the creation of \\emph{Large Knowledge Models}\n(LKM), specifically engineered to manage diversified spectrum of knowledge\nstructures. This ambitious undertaking could entail several key challenges,\nsuch as disentangling knowledge representation from language models,\nrestructuring pre-training with structured knowledge, and building large\ncommonsense models, among others. We finally propose a five-``A'' principle to\ndistinguish the concept of LKM.\n","authors":["Huajun Chen"],"pdf_url":"https://arxiv.org/pdf/2312.02706v1.pdf","comment":"This is an early draft subject to revision in a near future"},{"id":"http://arxiv.org/abs/2308.16549v2","updated":"2023-12-05T11:43:44Z","published":"2023-08-31T08:40:41Z","title":"Thesis Distillation: Investigating The Impact of Bias in NLP Models on\n Hate Speech Detection","summary":" This paper is a summary of the work done in my PhD thesis. Where I\ninvestigate the impact of bias in NLP models on the task of hate speech\ndetection from three perspectives: explainability, offensive stereotyping bias,\nand fairness. Then, I discuss the main takeaways from my thesis and how they\ncan benefit the broader NLP community. Finally, I discuss important future\nresearch directions. 
The findings of my thesis suggest that the bias in NLP\nmodels impacts the task of hate speech detection from all three perspectives,\nand that unless we start incorporating social sciences into the study of bias in NLP\nmodels, we will not effectively overcome the current limitations of measuring\nand mitigating bias in NLP models.\n","authors":["Fatma Elsafoury"],"pdf_url":"https://arxiv.org/pdf/2308.16549v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13307v2","updated":"2023-12-05T10:25:23Z","published":"2023-11-22T10:55:36Z","title":"Rethinking Radiology Report Generation via Causal Reasoning and\n Counterfactual Augmentation","summary":" Radiology Report Generation (RRG) draws attention as an interaction between\nthe vision and language fields. Previous works inherited the ideology of\nvision-to-language generation tasks, aiming to generate paragraphs with high\nconsistency as reports. However, one unique characteristic of RRG, the\nindependence between diseases, was neglected, leading to the injection of\ndisease co-occurrence as a confounder that affects the results through a backdoor\npath. Unfortunately, this confounder worsens the report generation process\nbecause of the biased RRG data distribution. In this paper, to rethink\nthis issue thoroughly, we reason about its causes and effects from a novel\nperspective of statistics and causality, where the Joint Vision Coupling and\nthe Conditional Sentence Coherence Coupling are two aspects prone to implicitly\ndecrease the accuracy of reports. Then, a counterfactual augmentation strategy\nthat contains the Counterfactual Sample Synthesis and the Counterfactual Report\nReconstruction sub-methods is proposed to break these two aspects of spurious\neffects. Experimental results and further analyses on two widely used datasets\njustify our reasoning and proposed methods.\n","authors":["Xiao Song","Jiafan Liu","Yun Li","Wenbin Lei","Ruxin Wang"],"pdf_url":"https://arxiv.org/pdf/2311.13307v2.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2310.20204v2","updated":"2023-12-05T10:20:11Z","published":"2023-10-31T06:04:18Z","title":"General-Purpose Retrieval-Enhanced Medical Prediction Model Using\n Near-Infinite History","summary":" Developing clinical prediction models (e.g., mortality prediction) based on\nelectronic health records (EHRs) typically relies on expert opinion for feature\nselection and adjusting observation window size. This burdens experts and\ncreates a bottleneck in the development process. We propose Retrieval-Enhanced\nMedical prediction model (REMed) to address such challenges. REMed can\nessentially evaluate an unlimited number of clinical events, select the\nrelevant ones, and make predictions. This approach effectively eliminates the\nneed for manual feature selection and enables an unrestricted observation\nwindow. We verified these properties through experiments on 27 clinical tasks\nand two independent cohorts from publicly available EHR datasets, where REMed\noutperformed other contemporary architectures that aim to handle as many events\nas possible. Notably, we found that the preferences of REMed align closely with\nthose of medical experts. 
We expect our approach to significantly expedite the\ndevelopment of EHR prediction models by minimizing clinicians' need for manual\ninvolvement.\n","authors":["Junu Kim","Chaeeun Shim","Bosco Seong Kyu Yang","Chami Im","Sung Yoon Lim","Han-Gil Jeong","Edward Choi"],"pdf_url":"https://arxiv.org/pdf/2310.20204v2.pdf","comment":"The source codes corresponding to this paper are available at:\n https://github.com/starmpcc/REMed"},{"id":"http://arxiv.org/abs/2305.17819v2","updated":"2023-12-05T09:51:55Z","published":"2023-05-28T22:46:21Z","title":"Large Language Models, scientific knowledge and factuality: A systematic\n analysis in antibiotic discovery","summary":" Inferring over and extracting information from Large Language Models (LLMs)\ntrained on a large corpus of scientific literature can potentially drive a new\nera in biomedical research, reducing the barriers for accessing existing\nmedical evidence. This work examines the potential of LLMs for dialoguing with\nbiomedical background knowledge, using the context of antibiotic discovery. The\nsystematic analysis is applied to ten state-of-the-art models, from models\nspecialised on biomedical scientific corpora to general models such as ChatGPT,\nGPT-4 and Llama 2 in two prompting-based tasks: chemical compound definition\ngeneration and chemical compound-fungus relation determination. The work\nprovides a systematic assessment on the ability of LLMs to encode and express\nthese relations, verifying for fluency, prompt-alignment, semantic coherence,\nfactual knowledge and specificity of generated responses. Results show that\nwhile recent models have improved in fluency, factual accuracy is still low and\nmodels are biased towards over-represented entities. The ability of LLMs to\nserve as biomedical knowledge bases is questioned, and the need for additional\nsystematic evaluation frameworks is highlighted. The best performing GPT-4\nproduced a factual definition for 70% of chemical compounds and 43.6% factual\nrelations to fungi, whereas the best open source model BioGPT-large 30% of the\ncompounds and 30% of the relations for the best-performing prompt. The results\nshow that while LLMs are currently not fit for purpose to be used as biomedical\nfactual knowledge bases, there is a promising emerging property in the\ndirection of factuality as the models become domain specialised, scale-up in\nsize and level of human feedback.\n","authors":["Magdalena Wysocka","Oskar Wysocki","Maxime Delmas","Vincent Mutel","Andre Freitas"],"pdf_url":"https://arxiv.org/pdf/2305.17819v2.pdf","comment":"28 pages, 3 figures"},{"id":"http://arxiv.org/abs/2312.02614v1","updated":"2023-12-05T09:44:45Z","published":"2023-12-05T09:44:45Z","title":"Prompt Optimization via Adversarial In-Context Learning","summary":" We propose a new method, Adversarial In-Context Learning (adv-ICL), to\noptimize prompt for in-context learning (ICL) by employing one LLM as a\ngenerator, another as a discriminator, and a third as a prompt modifier. As in\ntraditional adversarial learning, adv-ICL is implemented as a two-player game\nbetween the generator and discriminator, where the generator tries to generate\nrealistic enough output to fool the discriminator. In each round, given an\ninput prefixed by task instructions and several exemplars, the generator\nproduces an output. The discriminator is then tasked with classifying the\ngenerator input-output pair as model-generated or real data. 
Based on the\ndiscriminator loss, the prompt modifier proposes possible edits to the\ngenerator and discriminator prompts, and the edits that most improve the\nadversarial loss are selected. We show that adv-ICL results in significant\nimprovements over state-of-the-art prompt optimization techniques for both open\nand closed-source models on 11 generation and classification tasks including\nsummarization, arithmetic reasoning, machine translation, data-to-text\ngeneration, and the MMLU and big-bench hard benchmarks. In addition, because\nour method uses pre-trained models and updates only prompts rather than model\nparameters, it is computationally efficient, easy to extend to any LLM and\ntask, and effective in low-resource settings.\n","authors":["Xuan Long Do","Yiran Zhao","Hannah Brown","Yuxi Xie","James Xu Zhao","Nancy F. Chen","Kenji Kawaguchi","Michael Qizhe Xie","Junxian He"],"pdf_url":"https://arxiv.org/pdf/2312.02614v1.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2312.02598v1","updated":"2023-12-05T09:16:03Z","published":"2023-12-05T09:16:03Z","title":"Impact of Tokenization on LLaMa Russian Adaptation","summary":" Latest instruction-tuned large language models (LLM) show great results on\nvarious tasks, however, they often face performance degradation for non-English\ninput. There is evidence that the reason lies in inefficient tokenization\ncaused by low language representation in pre-training data which hinders the\ncomprehension of non-English instructions, limiting the potential of target\nlanguage instruction-tuning. In this work we investigate the possibility of\naddressing the issue with vocabulary substitution in the context of LLaMa\nRussian language adaptation. We explore three variants of vocabulary adaptation\nand test their performance on Saiga instruction-tuning and fine-tuning on\nRussian Super Glue benchmark. The results of automatic evaluation show that\nvocabulary substitution not only improves the model's quality in Russian but\nalso accelerates fine-tuning (35%) and inference (up to 60%) while reducing\nmemory consumption. Additional human evaluation of the instruction-tuned models\ndemonstrates that models with Russian-adapted vocabulary generate answers with\nhigher user preference than the original Saiga-LLaMa model.\n","authors":["Mikhail Tikhomirov","Daniil Chernyshev"],"pdf_url":"https://arxiv.org/pdf/2312.02598v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02590v1","updated":"2023-12-05T09:04:22Z","published":"2023-12-05T09:04:22Z","title":"Text Intimacy Analysis using Ensembles of Multilingual Transformers","summary":" Intimacy estimation of a given text has recently gained importance due to the\nincrease in direct interaction of NLP systems with humans. Intimacy is an\nimportant aspect of natural language and has a substantial impact on our\neveryday communication. Thus the level of intimacy can provide us with deeper\ninsights and richer semantics of conversations. In this paper, we present our\nwork on the SemEval shared task 9 on predicting the level of intimacy for the\ngiven text. The dataset consists of tweets in ten languages, out of which only\nsix are available in the training dataset. We conduct several experiments and\nshow that an ensemble of multilingual models along with a language-specific\nmonolingual model has the best performance. We also evaluate other data\naugmentation methods such as translation and present the results. 
Lastly, we\nstudy the results thoroughly and present some noteworthy insights into this\nproblem.\n","authors":["Tanmay Chavan","Ved Patwardhan"],"pdf_url":"https://arxiv.org/pdf/2312.02590v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01529v2","updated":"2023-12-05T09:01:07Z","published":"2023-12-03T23:03:22Z","title":"T3D: Towards 3D Medical Image Understanding through Vision-Language\n Pre-training","summary":" Expert annotation of 3D medical image for downstream analysis is\nresource-intensive, posing challenges in clinical applications. Visual\nself-supervised learning (vSSL), though effective for learning visual\ninvariance, neglects the incorporation of domain knowledge from medicine. To\nincorporate medical knowledge into visual representation learning,\nvision-language pre-training (VLP) has shown promising results in 2D image.\nHowever, existing VLP approaches become generally impractical when applied to\nhigh-resolution 3D medical images due to GPU hardware constraints and the\npotential loss of critical details caused by downsampling, which is the\nintuitive solution to hardware constraints. To address the above limitations,\nwe introduce T3D, the first VLP framework designed for high-resolution 3D\nmedical images. T3D incorporates two text-informed pretext tasks:\n(\\lowerromannumeral{1}) text-informed contrastive learning;\n(\\lowerromannumeral{2}) text-informed image restoration. These tasks focus on\nlearning 3D visual representations from high-resolution 3D medical images and\nintegrating clinical knowledge from radiology reports, without distorting\ninformation through forced alignment of downsampled volumes with detailed\nanatomical text. Trained on a newly curated large-scale dataset of 3D medical\nimages and radiology reports, T3D significantly outperforms current vSSL\nmethods in tasks like organ and tumor segmentation, as well as disease\nclassification. This underlines T3D's potential in representation learning for\n3D medical image analysis. All data and code will be available upon acceptance.\n","authors":["Che Liu","Cheng Ouyang","Yinda Chen","Cesar César Quilodrán-Casas","Lei Ma","Jie Fu","Yike Guo","Anand Shah","Wenjia Bai","Rossella Arcucci"],"pdf_url":"https://arxiv.org/pdf/2312.01529v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.10313v4","updated":"2023-12-05T08:59:33Z","published":"2023-09-19T04:51:13Z","title":"Investigating the Catastrophic Forgetting in Multimodal Large Language\n Models","summary":" Following the success of GPT4, there has been a surge in interest in\nmultimodal large language model (MLLM) research. This line of research focuses\non developing general-purpose LLMs through fine-tuning pre-trained LLMs and\nvision models. However, catastrophic forgetting, a notorious phenomenon where\nthe fine-tuned model fails to retain similar performance compared to the\npre-trained model, still remains an inherent problem in multimodal LLMs (MLLM).\nIn this paper, we introduce EMT: Evaluating MulTimodality for evaluating the\ncatastrophic forgetting in MLLMs, by treating each MLLM as an image classifier.\nWe first apply EMT to evaluate several open-source fine-tuned MLLMs and we\ndiscover that almost all evaluated MLLMs fail to retain the same performance\nlevels as their vision encoders on standard image classification tasks.\nMoreover, we continue fine-tuning LLaVA, an MLLM and utilize EMT to assess\nperformance throughout the fine-tuning. 
Interestingly, our results suggest that\nearly-stage fine-tuning on an image dataset improves performance across other\nimage datasets, by enhancing the alignment of text and visual features.\nHowever, as fine-tuning proceeds, the MLLMs begin to hallucinate, resulting in\na significant loss of generalizability, even when the image encoder remains\nfrozen. Our results suggest that MLLMs have yet to demonstrate performance on\npar with their vision models on standard image classification tasks and the\ncurrent MLLM fine-tuning procedure still has room for improvement.\n","authors":["Yuexiang Zhai","Shengbang Tong","Xiao Li","Mu Cai","Qing Qu","Yong Jae Lee","Yi Ma"],"pdf_url":"https://arxiv.org/pdf/2309.10313v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02578v1","updated":"2023-12-05T08:50:34Z","published":"2023-12-05T08:50:34Z","title":"Empathy and Distress Detection using Ensembles of Transformer Models","summary":" This paper presents our approach for the WASSA 2023 Empathy, Emotion and\nPersonality Shared Task. Empathy and distress are human feelings that are\nimplicitly expressed in natural discourses. Empathy and distress detection are\ncrucial challenges in Natural Language Processing that can aid our\nunderstanding of conversations. The provided dataset consists of several\nlong-text examples in the English language, with each example associated with a\nnumeric score for empathy and distress. We experiment with several BERT-based\nmodels as a part of our approach. We also try various ensemble methods. Our\nfinal submission has a Pearson's r score of 0.346, placing us third in the\nempathy and distress detection subtask.\n","authors":["Tanmay Chavan","Kshitij Deshpande","Sheetal Sonawane"],"pdf_url":"https://arxiv.org/pdf/2312.02578v1.pdf","comment":"Accepted at the WASSA 2023 workshop at ACL 2023"},{"id":"http://arxiv.org/abs/2311.04923v2","updated":"2023-12-05T08:44:12Z","published":"2023-11-03T08:59:51Z","title":"Is one brick enough to break the wall of spoken dialogue state tracking?","summary":" In Task-Oriented Dialogue (TOD) systems, correctly updating the system's\nunderstanding of the user's needs (a.k.a dialogue state tracking) is key to a\nsmooth interaction. Traditionally, TOD systems perform this update in three\nsteps: transcription of the user's utterance, semantic extraction of the key\nconcepts, and contextualization with the previously identified concepts. Such\ncascade approaches suffer from cascading errors and separate optimization.\nEnd-to-End approaches have been proved helpful up to the semantic extraction\nstep. This paper goes one step further paving the path towards completely\nneural spoken dialogue state tracking by comparing three approaches: (1) a\nstate of the art cascade approach, (2) a locally E2E approach with rule-based\ncontextualization and (3) a completely neural approach.\n","authors":["Lucas Druart","Valentin Vielzeuf","Yannick Estève"],"pdf_url":"https://arxiv.org/pdf/2311.04923v2.pdf","comment":"Bug found in previous version, ongoing work"},{"id":"http://arxiv.org/abs/2309.12307v2","updated":"2023-12-05T08:12:50Z","published":"2023-09-21T17:59:11Z","title":"LongLoRA: Efficient Fine-tuning of Long-Context Large Language Models","summary":" We present LongLoRA, an efficient fine-tuning approach that extends the\ncontext sizes of pre-trained large language models (LLMs), with limited\ncomputation cost. Typically, training LLMs with long context sizes is\ncomputationally expensive, requiring extensive training hours and GPU\nresources. 
For example, training on the context length of 8192 needs 16x\ncomputational costs in self-attention layers as that of 2048. In this paper, we\nspeed up the context extension of LLMs in two aspects. On the one hand,\nalthough dense global attention is needed during inference, fine-tuning the\nmodel can be effectively and efficiently done by sparse local attention. The\nproposed shifted sparse attention (S$^2$-Attn) effectively enables context\nextension, leading to non-trivial computation saving with similar performance\nto fine-tuning with vanilla attention. Particularly, it can be implemented with\nonly two lines of code in training, while being optional in inference. On the\nother hand, we revisit the parameter-efficient fine-tuning regime for context\nexpansion. Notably, we find that LoRA for context extension works well under\nthe premise of trainable embedding and normalization. LongLoRA combines this\nimproved LoRA with S$^2$-Attn. LongLoRA demonstrates strong empirical results\non various tasks on Llama2 models from 7B/13B to 70B. LongLoRA adopts Llama2 7B\nfrom 4k context to 100k, or Llama2 70B to 32k on a single 8x A100 machine.\nLongLoRA extends models' context while retaining their original architectures,\nand is compatible with most existing techniques, like Flash-Attention2. In\naddition, we further conduct supervised fine-tuning with LongLoRA and our long\ninstruction-following LongAlpaca dataset.\n","authors":["Yukang Chen","Shengju Qian","Haotian Tang","Xin Lai","Zhijian Liu","Song Han","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2309.12307v2.pdf","comment":"Code, models, dataset, and demo are available at\n https://github.com/dvlab-research/LongLoRA"},{"id":"http://arxiv.org/abs/2312.02554v1","updated":"2023-12-05T07:52:12Z","published":"2023-12-05T07:52:12Z","title":"ULMA: Unified Language Model Alignment with Demonstration and Point-wise\n Human Preference","summary":" Language model alignment is a cutting-edge technique in large language model\ntraining to align the model output to user's intent, e.g., being helpful and\nharmless. Recent alignment framework consists of two steps: supervised\nfine-tuning with demonstration data and preference learning with human\npreference data. Previous preference learning methods, such as RLHF and DPO,\nmainly focus on pair-wise preference data. However, in many real-world\nscenarios where human feedbacks are intrinsically point-wise, these methods\nwill suffer from information loss or even fail. To fill this gap, in this\npaper, we first develop a preference learning method called point-wise DPO to\ntackle point-wise preference data. Further revelation on the connection between\nsupervised fine-tuning and point-wise preference learning enables us to develop\na unified framework for both human demonstration and point-wise preference\ndata, which sheds new light on the construction of preference dataset.\nExtensive experiments on point-wise datasets with binary or continuous labels\ndemonstrate the superior performance and efficiency of our proposed methods. 
A\nnew dataset with high-quality demonstration samples on harmlessness is\nconstructed and made publicly available.\n","authors":["Tianchi Cai","Xierui Song","Jiyan Jiang","Fei Teng","Jinjie Gu","Guannan Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.02554v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02549v1","updated":"2023-12-05T07:37:21Z","published":"2023-12-05T07:37:21Z","title":"DemaFormer: Damped Exponential Moving Average Transformer with\n Energy-Based Modeling for Temporal Language Grounding","summary":" Temporal Language Grounding seeks to localize video moments that semantically\ncorrespond to a natural language query. Recent advances employ the attention\nmechanism to learn the relations between video moments and the text query.\nHowever, naive attention might not be able to appropriately capture such\nrelations, resulting in ineffective distributions where target video moments\nare difficult to separate from the remaining ones. To resolve the issue, we\npropose an energy-based model framework to explicitly learn moment-query\ndistributions. Moreover, we propose DemaFormer, a novel Transformer-based\narchitecture that utilizes exponential moving average with a learnable damping\nfactor to effectively encode moment-query inputs. Comprehensive experiments on\nfour public temporal language grounding datasets showcase the superiority of\nour methods over the state-of-the-art baselines.\n","authors":["Thong Nguyen","Xiaobao Wu","Xinshuai Dong","Cong-Duy Nguyen","See-Kiong Ng","Luu Anh Tuan"],"pdf_url":"https://arxiv.org/pdf/2312.02549v1.pdf","comment":"Accepted at EMNLP 2023 (Findings)"},{"id":"http://arxiv.org/abs/2312.02532v1","updated":"2023-12-05T06:28:45Z","published":"2023-12-05T06:28:45Z","title":"DRAFT: Dense Retrieval Augmented Few-shot Topic classifier Framework","summary":" With the growing volume of diverse information, the demand for classifying\narbitrary topics has become increasingly critical. To address this challenge,\nwe introduce DRAFT, a simple framework designed to train a classifier for\nfew-shot topic classification. DRAFT uses a few examples of a specific topic as\nqueries to construct Customized dataset with a dense retriever model.\nMulti-query retrieval (MQR) algorithm, which effectively handles multiple\nqueries related to a specific topic, is applied to construct the Customized\ndataset. Subsequently, we fine-tune a classifier using the Customized dataset\nto identify the topic. To demonstrate the efficacy of our proposed approach, we\nconduct evaluations on both widely used classification benchmark datasets and\nmanually constructed datasets with 291 diverse topics, which simulate diverse\ncontents encountered in real-world applications. DRAFT shows competitive or\nsuperior performance compared to baselines that use in-context learning, such\nas GPT-3 175B and InstructGPT 175B, on few-shot topic classification tasks\ndespite having 177 times fewer parameters, demonstrating its effectiveness.\n","authors":["Keonwoo Kim","Younggun Lee"],"pdf_url":"https://arxiv.org/pdf/2312.02532v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.16894v4","updated":"2023-12-05T05:34:18Z","published":"2023-03-29T17:59:10Z","title":"ViewRefer: Grasp the Multi-view Knowledge for 3D Visual Grounding with\n GPT and Prototype Guidance","summary":" Understanding 3D scenes from multi-view inputs has been proven to alleviate\nthe view discrepancy issue in 3D visual grounding. 
However, existing methods\nnormally neglect the view cues embedded in the text modality and fail to weigh\nthe relative importance of different views. In this paper, we propose\nViewRefer, a multi-view framework for 3D visual grounding exploring how to\ngrasp the view knowledge from both text and 3D modalities. For the text branch,\nViewRefer leverages the diverse linguistic knowledge of large-scale language\nmodels, e.g., GPT, to expand a single grounding text to multiple\ngeometry-consistent descriptions. Meanwhile, in the 3D modality, a transformer\nfusion module with inter-view attention is introduced to boost the interaction\nof objects across views. On top of that, we further present a set of learnable\nmulti-view prototypes, which memorize scene-agnostic knowledge for different\nviews, and enhance the framework from two perspectives: a view-guided attention\nmodule for more robust text features, and a view-guided scoring strategy during\nthe final prediction. With our designed paradigm, ViewRefer achieves superior\nperformance on three benchmarks and surpasses the second-best by +2.8%, +1.5%,\nand +1.35% on Sr3D, Nr3D, and ScanRefer. Code is released at\nhttps://github.com/Ivan-Tang-3D/ViewRefer3D.\n","authors":["Zoey Guo","Yiwen Tang","Ray Zhang","Dong Wang","Zhigang Wang","Bin Zhao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2303.16894v4.pdf","comment":"Accepted by ICCV 2023. Code is released at\n https://github.com/Ivan-Tang-3D/ViewRefer3D"},{"id":"http://arxiv.org/abs/2310.01415v3","updated":"2023-12-05T05:26:29Z","published":"2023-10-02T17:59:57Z","title":"GPT-Driver: Learning to Drive with GPT","summary":" We present a simple yet effective approach that can transform the OpenAI\nGPT-3.5 model into a reliable motion planner for autonomous vehicles. Motion\nplanning is a core challenge in autonomous driving, aiming to plan a driving\ntrajectory that is safe and comfortable. Existing motion planners predominantly\nleverage heuristic methods to forecast driving trajectories, yet these\napproaches demonstrate insufficient generalization capabilities in the face of\nnovel and unseen driving scenarios. In this paper, we propose a novel approach\nto motion planning that capitalizes on the strong reasoning capabilities and\ngeneralization potential inherent to Large Language Models (LLMs). The\nfundamental insight of our approach is the reformulation of motion planning as\na language modeling problem, a perspective not previously explored.\nSpecifically, we represent the planner inputs and outputs as language tokens,\nand leverage the LLM to generate driving trajectories through a language\ndescription of coordinate positions. Furthermore, we propose a novel\nprompting-reasoning-finetuning strategy to stimulate the numerical reasoning\npotential of the LLM. With this strategy, the LLM can describe highly precise\ntrajectory coordinates and also its internal decision-making process in natural\nlanguage. We evaluate our approach on the large-scale nuScenes dataset, and\nextensive experiments substantiate the effectiveness, generalization ability,\nand interpretability of our GPT-based motion planner. 
Code is now available at\nhttps://github.com/PointsCoder/GPT-Driver.\n","authors":["Jiageng Mao","Yuxi Qian","Junjie Ye","Hang Zhao","Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2310.01415v3.pdf","comment":"NeurIPS 2023 Foundation Models for Decision Making Workshop"},{"id":"http://arxiv.org/abs/2310.04406v2","updated":"2023-12-05T05:25:55Z","published":"2023-10-06T17:55:11Z","title":"Language Agent Tree Search Unifies Reasoning Acting and Planning in\n Language Models","summary":" While large language models (LLMs) have demonstrated impressive performance\non a range of decision-making tasks, they rely on simple acting processes and\nfall short of broad deployment as autonomous agents. We introduce LATS\n(Language Agent Tree Search), a general framework that synergizes the\ncapabilities of LLMs in planning, acting, and reasoning. Drawing inspiration\nfrom Monte Carlo tree search in model-based reinforcement learning, LATS\nemploys LLMs as agents, value functions, and optimizers, repurposing their\nlatent strengths for enhanced decision-making. What is crucial in this method\nis the use of an environment for external feedback, which offers a more\ndeliberate and adaptive problem-solving mechanism that moves beyond the\nlimitations of existing techniques. Our experimental evaluation across diverse\ndomains, such as programming, HotPotQA, and WebShop, illustrates the\napplicability of LATS for both reasoning and acting. In particular, LATS\nachieves 94.4% for programming on HumanEval with GPT-4 and an average score of\n75.9 for web browsing on WebShop with GPT-3.5, demonstrating the effectiveness\nand generality of our method.\n","authors":["Andy Zhou","Kai Yan","Michal Shlapentokh-Rothman","Haohan Wang","Yu-Xiong Wang"],"pdf_url":"https://arxiv.org/pdf/2310.04406v2.pdf","comment":"Website and code can be found at\n https://andyz245.github.io/LanguageAgentTreeSearch"},{"id":"http://arxiv.org/abs/2312.02496v1","updated":"2023-12-05T04:55:54Z","published":"2023-12-05T04:55:54Z","title":"MKA: A Scalable Medical Knowledge Assisted Mechanism for Generative\n Models on Medical Conversation Tasks","summary":" Using natural language processing (NLP) technologies to develop medical\nchatbots makes the diagnosis of the patient more convenient and efficient,\nwhich is a typical application in healthcare AI. Because of its importance,\nlots of research have been come out. Recently, the neural generative models\nhave shown their impressive ability as the core of chatbot, while it cannot\nscale well when directly applied to medical conversation due to the lack of\nmedical-specific knowledge. To address the limitation, a scalable Medical\nKnowledge Assisted mechanism, MKA, is proposed in this paper. The mechanism\naims to assist general neural generative models to achieve better performance\non the medical conversation task. The medical-specific knowledge graph is\ndesigned within the mechanism, which contains 6 types of medical-related\ninformation, including department, drug, check, symptom, disease, food.\nBesides, the specific token concatenation policy is defined to effectively\ninject medical information into the input data. Evaluation of our method is\ncarried out on two typical medical datasets, MedDG and MedDialog-CN. The\nevaluation results demonstrate that models combined with our mechanism\noutperform original methods in multiple automatic evaluation metrics. Besides,\nMKA-Bert-GPT achieves state-of-the-art performance. 
The open-sourced code is\npublicly available at:\nhttps://github.com/LIANGKE23/Knowledge_Assisted_Medical_Dialogue_Generation_Mechanism\n","authors":["Ke Liang","Sifan Wu","Jiayi Gu"],"pdf_url":"https://arxiv.org/pdf/2312.02496v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02143v2","updated":"2023-12-05T03:44:19Z","published":"2023-12-04T18:58:57Z","title":"Competition-Level Problems are Effective LLM Evaluators","summary":" Large language models (LLMs) have demonstrated impressive reasoning\ncapabilities, yet there has recently been ongoing debate about these abilities\nand the potential data contamination problem. This paper aims to evaluate the\nreasoning capacities of LLMs, specifically in solving recent competition-level\nprogramming problems in Codeforces, which are expert-crafted and unique,\nrequiring deep understanding and robust reasoning skills. We first provide a\ncomprehensive evaluation of GPT-4's perceived zero-shot performance on this\ntask, considering various aspects such as problems' release time, difficulties,\nand types of errors encountered. Surprisingly, the perceived performance of\nGPT-4 has experienced a cliff-like decline on problems released after September\n2021, consistently across all difficulties and types of problems, which indicates\npotential data contamination, as well as the challenges for any existing LLM to\nsolve unseen complex reasoning problems. We further explore various approaches\nsuch as fine-tuning, Chain-of-Thought prompting and problem description\nsimplification; unfortunately, none of them is able to consistently mitigate these\nchallenges. Through our work, we emphasize the importance of this excellent data\nsource for assessing the genuine reasoning capabilities of LLMs, and foster the\ndevelopment of LLMs with stronger reasoning abilities and better generalization\nin the future.\n","authors":["Yiming Huang","Zhenghao Lin","Xiao Liu","Yeyun Gong","Shuai Lu","Fangyu Lei","Yaobo Liang","Yelong Shen","Chen Lin","Nan Duan","Weizhu Chen"],"pdf_url":"https://arxiv.org/pdf/2312.02143v2.pdf","comment":"In progress"},{"id":"http://arxiv.org/abs/2312.02441v1","updated":"2023-12-05T02:44:07Z","published":"2023-12-05T02:44:07Z","title":"MedDM:LLM-executable clinical guidance tree for clinical decision-making","summary":" There is increasing emphasis on the importance of LLMs participating\nin clinical diagnosis decision-making. However, current medical LLMs suffer from\nlow specialization: they cannot provide specific medical advice and behave more\nlike a medical Q\&A system. In addition, there is no suitable clinical guidance\ntree dataset that can be used directly with LLMs. To address this issue, we first\npropose the LLM-executable clinical guidance tree (CGT), which can be directly used\nby large language models, and construct a medical diagnostic decision-making dataset\n(MedDM) from flowcharts in clinical practice guidelines. 
We propose an\napproach to screen flowcharts from medical literature, followed by their\nidentification and conversion into standardized diagnostic decision trees. We\nconstructed a knowledge base with 1,202 decision trees, drawn from 5,000 pieces of\nmedical literature and covering 12 hospital departments, including internal\nmedicine, surgery, and psychiatry, and over 500 diseases. Moreover, we propose a\nmethod for reasoning on LLM-executable CGT and a Patient-LLM multi-turn\ndialogue framework.\n","authors":["Binbin Li","Tianxin Meng","Xiaoming Shi","Jie Zhai","Tong Ruan"],"pdf_url":"https://arxiv.org/pdf/2312.02441v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02439v1","updated":"2023-12-05T02:41:57Z","published":"2023-12-05T02:41:57Z","title":"Let's Think Outside the Box: Exploring Leap-of-Thought in Large Language\n Models with Creative Humor Generation","summary":" Chain-of-Thought (CoT) guides large language models (LLMs) to reason\nstep-by-step, and can motivate their logical reasoning ability. While effective\nfor logical tasks, CoT is not conducive to creative problem-solving, which often\nrequires out-of-the-box thinking and is crucial for innovation advancements. In\nthis paper, we explore the Leap-of-Thought (LoT) abilities within LLMs -- a\nnon-sequential, creative paradigm involving strong associations and knowledge\nleaps. To this end, we study LLMs on the popular Oogiri game which needs\nparticipants to have good creativity and strong associative thinking for\nresponding unexpectedly and humorously to the given image, text, or both, and\nthus is suitable for LoT study. Then to investigate LLMs' LoT ability in the\nOogiri game, we first build a multimodal and multilingual Oogiri-GO dataset\nwhich contains over 130,000 samples from the Oogiri game, and observe the\ninsufficient LoT ability or failures of most existing LLMs on the Oogiri game.\nAccordingly, we introduce a creative Leap-of-Thought (CLoT) paradigm to improve\nLLM's LoT ability. CLoT first formulates the Oogiri-GO dataset into\nLoT-oriented instruction tuning data to train pretrained LLM for achieving\ncertain LoT humor generation and discrimination abilities. Then CLoT designs an\nexplorative self-refinement that encourages the LLM to generate more creative\nLoT data via exploring parallels between seemingly unrelated concepts and\nselects high-quality data to train itself for self-refinement. CLoT not only\nexcels in humor generation in the Oogiri game but also boosts creative\nabilities in various tasks like cloud guessing game and divergent association\ntask. These findings advance our understanding and offer a pathway to improve\nLLMs' creative capacities for innovative applications across domains. The\ndataset, code, and models will be released online.\nhttps://github.com/sail-sg/CLoT.\n","authors":["Shanshan Zhong","Zhongzhan Huang","Shanghua Gao","Wushao Wen","Liang Lin","Marinka Zitnik","Pan Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.02439v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2312.02436v1","updated":"2023-12-05T02:32:08Z","published":"2023-12-05T02:32:08Z","title":"MUFFIN: Curating Multi-Faceted Instructions for Improving\n Instruction-Following","summary":" In the realm of large language models (LLMs), enhancing instruction-following\ncapability often involves curating expansive training data. This is achieved\nthrough two primary schemes: i) Scaling-Inputs: Amplifying (input, output)\npairs per task instruction, aiming for better instruction adherence. 
ii)\nScaling Input-Free Tasks: Enlarging tasks, each composed of an (instruction,\noutput) pair (without requiring a separate input anymore). However, LLMs under\nScaling-Inputs tend to be overly sensitive to inputs, leading to\nmisinterpretation or non-compliance with instructions. Conversely, Scaling\nInput-Free Tasks demands a substantial number of tasks but is less effective in\ninstruction following when dealing with instances in Scaling-Inputs. This work\nintroduces MUFFIN, a new scheme of instruction-following dataset curation.\nSpecifically, we automatically Scale Tasks per Input by diversifying these\ntasks with various input facets. Experimental results across four zero-shot\nbenchmarks, spanning both Scaling-Inputs and Scaling Input-Free Tasks schemes,\nreveal that LLMs, at various scales, trained on MUFFIN generally demonstrate\nsuperior instruction-following capabilities compared to those trained on the\ntwo aforementioned schemes.\n","authors":["Renze Lou","Kai Zhang","Jian Xie","Yuxuan Sun","Janice Ahn","Hanzi Xu","Yu Su","Wenpeng Yin"],"pdf_url":"https://arxiv.org/pdf/2312.02436v1.pdf","comment":"Website: https://renzelou.github.io/Muffin/"},{"id":"http://arxiv.org/abs/2310.07135v2","updated":"2023-12-05T02:18:40Z","published":"2023-10-11T02:16:12Z","title":"Comparing Styles across Languages","summary":" Understanding how styles differ across languages is advantageous for training\nboth humans and computers to generate culturally appropriate text. We introduce\nan explanation framework to extract stylistic differences from multilingual LMs\nand compare styles across languages. Our framework (1) generates comprehensive\nstyle lexica in any language and (2) consolidates feature importances from LMs\ninto comparable lexical categories. We apply this framework to compare\npoliteness, creating the first holistic multilingual politeness dataset and\nexploring how politeness varies across four languages. Our approach enables an\neffective evaluation of how distinct linguistic categories contribute to\nstylistic variations and provides interpretable insights into how people\ncommunicate differently around the world.\n","authors":["Shreya Havaldar","Matthew Pressimone","Eric Wong","Lyle Ungar"],"pdf_url":"https://arxiv.org/pdf/2310.07135v2.pdf","comment":"Accepted to EMNLP 2023"},{"id":"http://arxiv.org/abs/2312.02431v1","updated":"2023-12-05T02:17:29Z","published":"2023-12-05T02:17:29Z","title":"Visually Grounded Language Learning: a review of language games,\n datasets, tasks, and models","summary":" In recent years, several machine learning models have been proposed. They are\ntrained with a language modelling objective on large-scale text-only data. With\nsuch pretraining, they can achieve impressive results on many Natural Language\nUnderstanding and Generation tasks. However, many facets of meaning cannot be\nlearned by ``listening to the radio\" only. In the literature, many\nVision+Language (V+L) tasks have been defined with the aim of creating models\nthat can ground symbols in the visual modality. In this work, we provide a\nsystematic literature review of several tasks and models proposed in the V+L\nfield. We rely on Wittgenstein's idea of `language games' to categorise such\ntasks into 3 different families: 1) discriminative games, 2) generative games,\nand 3) interactive games. 
Our analysis of the literature provides evidence that\nfuture work should be focusing on interactive games where communication in\nNatural Language is important to resolve ambiguities about object referents and\naction plans and that physical embodiment is essential to understand the\nsemantics of situations and events. Overall, these represent key requirements\nfor developing grounded meanings in neural models.\n","authors":["Alessandro Suglia","Ioannis Konstas","Oliver Lemon"],"pdf_url":"https://arxiv.org/pdf/2312.02431v1.pdf","comment":"Preprint for JAIR before copyediting"},{"id":"http://arxiv.org/abs/2211.16934v2","updated":"2023-12-05T01:24:29Z","published":"2022-11-30T12:09:40Z","title":"VideoDubber: Machine Translation with Speech-Aware Length Control for\n Video Dubbing","summary":" Video dubbing aims to translate the original speech in a film or television\nprogram into the speech in a target language, which can be achieved with a\ncascaded system consisting of speech recognition, machine translation and\nspeech synthesis. To ensure the translated speech to be well aligned with the\ncorresponding video, the length/duration of the translated speech should be as\nclose as possible to that of the original speech, which requires strict length\ncontrol. Previous works usually control the number of words or characters\ngenerated by the machine translation model to be similar to the source\nsentence, without considering the isochronicity of speech as the speech\nduration of words/characters in different languages varies. In this paper, we\npropose a machine translation system tailored for the task of video dubbing,\nwhich directly considers the speech duration of each token in translation, to\nmatch the length of source and target speech. Specifically, we control the\nspeech length of generated sentence by guiding the prediction of each word with\nthe duration information, including the speech duration of itself as well as\nhow much duration is left for the remaining words. We design experiments on\nfour language directions (German -> English, Spanish -> English, Chinese <->\nEnglish), and the results show that the proposed method achieves better length\ncontrol ability on the generated speech than baseline methods. To make up the\nlack of real-world datasets, we also construct a real-world test set collected\nfrom films to provide comprehensive evaluations on the video dubbing task.\n","authors":["Yihan Wu","Junliang Guo","Xu Tan","Chen Zhang","Bohan Li","Ruihua Song","Lei He","Sheng Zhao","Arul Menezes","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2211.16934v2.pdf","comment":"AAAI 2023 camera version"},{"id":"http://arxiv.org/abs/2312.02418v1","updated":"2023-12-05T01:19:30Z","published":"2023-12-05T01:19:30Z","title":"Decoding Data Quality via Synthetic Corruptions: Embedding-guided\n Pruning of Code Data","summary":" Code datasets, often collected from diverse and uncontrolled sources such as\nGitHub, potentially suffer from quality issues, thereby affecting the\nperformance and training efficiency of Large Language Models (LLMs) optimized\nfor code generation. Previous studies demonstrated the benefit of using\nembedding spaces for data pruning, but they mainly focused on duplicate removal\nor increasing variety, and in other modalities, such as images. Our work\nfocuses on using embeddings to identify and remove \"low-quality\" code data.\nFirst, we explore features of \"low-quality\" code in embedding space, through\nthe use of synthetic corruptions. 
Armed with this knowledge, we devise novel\npruning metrics that operate in embedding space to identify and remove\nlow-quality entries in the Stack dataset. We demonstrate the benefits of this\nsynthetic corruption informed pruning (SCIP) approach on the well-established\nHumanEval and MBPP benchmarks, outperforming existing embedding-based methods.\nImportantly, we achieve up to a 3% performance improvement over no pruning,\nthereby showing the promise of insights from synthetic corruptions for data\npruning.\n","authors":["Yu Yang","Aaditya K. Singh","Mostafa Elhoushi","Anas Mahmoud","Kushal Tirumala","Fabian Gloeckle","Baptiste Rozière","Carole-Jean Wu","Ari S. Morcos","Newsha Ardalani"],"pdf_url":"https://arxiv.org/pdf/2312.02418v1.pdf","comment":"12 pages, 4 figures, Oral Presentation at 3rd Workshop on Efficient\n Natural Language and Speech Processing (ENLSP-III), NeurIPS 2023"},{"id":"http://arxiv.org/abs/2212.09917v2","updated":"2023-12-05T01:06:17Z","published":"2022-12-19T23:45:05Z","title":"Inverse Reinforcement Learning for Text Summarization","summary":" We introduce inverse reinforcement learning (IRL) as an effective paradigm\nfor training abstractive summarization models, imitating human summarization\nbehaviors. Our IRL model estimates the reward function using a suite of\nimportant sub-rewards for summarization and concurrently optimizes the policy\nnetwork. Experimental results across datasets in different domains\n(CNN/DailyMail and WikiHow) and various model sizes (BART-base and BART-large)\ndemonstrate the superiority of our proposed IRL model for summarization over\nMLE and RL baselines. The resulting summaries exhibit greater similarity to\nhuman-crafted gold references, outperforming MLE and RL baselines on metrics\nsuch as ROUGE, coverage, novelty, compression ratio, factuality, and human\nevaluations.\n","authors":["Yu Fu","Deyi Xiong","Yue Dong"],"pdf_url":"https://arxiv.org/pdf/2212.09917v2.pdf","comment":"8 pages, 2 figures; accepted to Findings of EMNLP 2013"},{"id":"http://arxiv.org/abs/2312.02406v1","updated":"2023-12-05T00:42:35Z","published":"2023-12-05T00:42:35Z","title":"Efficient Online Data Mixing For Language Model Pre-Training","summary":" The data used to pretrain large language models has a decisive impact on a\nmodel's downstream performance, which has led to a large body of work on data\nselection methods that aim to automatically determine the most suitable data to\nuse for pretraining. Existing data selection methods suffer from slow and\ncomputationally expensive processes, a problem amplified by the increasing size\nof models and of pretraining datasets. Data mixing, on the other hand, reduces\nthe complexity of data selection by grouping data points together and\ndetermining sampling probabilities across entire groups. However, data mixing\nproportions are typically fixed before training and therefore cannot adapt to\nchanging training dynamics. To address these limitations, we develop an\nefficient algorithm for Online Data Mixing (ODM) that combines elements from\nboth data selection and data mixing. 
Based on multi-armed bandit algorithms,\nour online approach optimizes the data mixing proportions during training.\nRemarkably, our method trains a model that reaches the final perplexity of the\nnext best method with 19\\% fewer training iterations, and improves performance\non the 5-shot MMLU benchmark by 1.9% relative accuracy, while adding negligible\nwall-clock time during pretraining.\n","authors":["Alon Albalak","Liangming Pan","Colin Raffel","William Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2312.02406v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03173v1","updated":"2023-12-05T22:29:43Z","published":"2023-12-05T22:29:43Z","title":"A Comparative Study of AI-Generated (GPT-4) and Human-crafted MCQs in\n Programming Education","summary":" There is a constant need for educators to develop and maintain effective\nup-to-date assessments. While there is a growing body of research in computing\neducation on utilizing large language models (LLMs) in generation and\nengagement with coding exercises, the use of LLMs for generating programming\nMCQs has not been extensively explored. We analyzed the capability of GPT-4 to\nproduce multiple-choice questions (MCQs) aligned with specific learning\nobjectives (LOs) from Python programming classes in higher education.\nSpecifically, we developed an LLM-powered (GPT-4) system for generation of MCQs\nfrom high-level course context and module-level LOs. We evaluated 651\nLLM-generated and 449 human-crafted MCQs aligned to 246 LOs from 6 Python\ncourses. We found that GPT-4 was capable of producing MCQs with clear language,\na single correct choice, and high-quality distractors. We also observed that\nthe generated MCQs appeared to be well-aligned with the LOs. Our findings can\nbe leveraged by educators wishing to take advantage of the state-of-the-art\ngenerative models to support MCQ authoring efforts.\n","authors":["Jacob Doughty","Zipiao Wan","Anishka Bompelli","Jubahed Qayum","Taozhi Wang","Juran Zhang","Yujia Zheng","Aidan Doyle","Pragnya Sridhar","Arav Agarwal","Christopher Bogart","Eric Keylor","Can Kultur","Jaromir Savelka","Majd Sakr"],"pdf_url":"https://arxiv.org/pdf/2312.03173v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03171v1","updated":"2023-12-05T22:28:42Z","published":"2023-12-05T22:28:42Z","title":"Combining Counting Processes and Classification Improves a Stopping Rule\n for Technology Assisted Review","summary":" Technology Assisted Review (TAR) stopping rules aim to reduce the cost of\nmanually assessing documents for relevance by minimising the number of\ndocuments that need to be examined to ensure a desired level of recall. This\npaper extends an effective stopping rule using information derived from a text\nclassifier that can be trained without the need for any additional annotation.\nExperiments on multiple data sets (CLEF e-Health, TREC Total Recall, TREC Legal\nand RCV1) showed that the proposed approach consistently improves performance\nand outperforms several alternative methods.\n","authors":["Reem Bin-Hezam","Mark Stevenson"],"pdf_url":"https://arxiv.org/pdf/2312.03171v1.pdf","comment":"Accepted at EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2212.09710v2","updated":"2023-12-05T21:22:11Z","published":"2022-12-19T18:39:43Z","title":"Continual Learning for Instruction Following from Realtime Feedback","summary":" We propose and deploy an approach to continually train an\ninstruction-following agent from feedback provided by users during\ncollaborative interactions. 
During interaction, human users instruct an agent\nusing natural language, and provide realtime binary feedback as they observe\nthe agent following their instructions. We design a contextual bandit learning\napproach, converting user feedback to immediate reward. We evaluate through\nthousands of human-agent interactions, demonstrating 15.4% absolute improvement\nin instruction execution accuracy over time. We also show our approach is\nrobust to several design variations, and that the feedback signal is roughly\nequivalent to the learning signal of supervised demonstration data.\n","authors":["Alane Suhr","Yoav Artzi"],"pdf_url":"https://arxiv.org/pdf/2212.09710v2.pdf","comment":"NeurIPS 2023 Spotlight paper"},{"id":"http://arxiv.org/abs/2305.13388v2","updated":"2023-12-05T21:20:16Z","published":"2023-05-22T18:06:32Z","title":"The neural dynamics of auditory word recognition and integration","summary":" Listeners recognize and integrate words in rapid and noisy everyday speech by\ncombining expectations about upcoming content with incremental sensory\nevidence. We present a computational model of word recognition which formalizes\nthis perceptual process in Bayesian decision theory. We fit this model to\nexplain scalp EEG signals recorded as subjects passively listened to a\nfictional story, revealing both the dynamics of the online auditory word\nrecognition process and the neural correlates of the recognition and\nintegration of words.\n The model reveals distinct neural processing of words depending on whether or\nnot they can be quickly recognized. While all words trigger a neural response\ncharacteristic of probabilistic integration -- voltage modulations predicted by\na word's surprisal in context -- these modulations are amplified for words\nwhich require more than roughly 150 ms of input to be recognized. We observe no\ndifference in the latency of these neural responses according to words'\nrecognition times. Our results are consistent with a two-part model of speech\ncomprehension, combining an eager and rapid process of word recognition with a\ntemporally independent process of word integration. However, we also developed\nalternative models of the scalp EEG signal not incorporating word recognition\ndynamics which showed similar performance improvements. We discuss potential\nfuture modeling steps which may help to separate these hypotheses.\n","authors":["Jon Gauthier","Roger Levy"],"pdf_url":"https://arxiv.org/pdf/2305.13388v2.pdf","comment":"Published at EMNLP 2023"},{"id":"http://arxiv.org/abs/2312.03140v1","updated":"2023-12-05T21:19:33Z","published":"2023-12-05T21:19:33Z","title":"FlexModel: A Framework for Interpretability of Distributed Large\n Language Models","summary":" With the growth of large language models, now incorporating billions of\nparameters, the hardware prerequisites for their training and deployment have\nseen a corresponding increase. Although existing tools facilitate model\nparallelization and distributed training, deeper model interactions, crucial\nfor interpretability and responsible AI techniques, still demand thorough\nknowledge of distributed computing. This often hinders contributions from\nresearchers with machine learning expertise but limited distributed computing\nbackground. Addressing this challenge, we present FlexModel, a software package\nproviding a streamlined interface for engaging with models distributed across\nmulti-GPU and multi-node configurations. 
The library is compatible with\nexisting model distribution libraries and encapsulates PyTorch models. It\nexposes user-registerable HookFunctions to facilitate straightforward\ninteraction with distributed model internals, bridging the gap between\ndistributed and single-device model paradigms. Primarily, FlexModel enhances\naccessibility by democratizing model interactions and promotes more inclusive\nresearch in the domain of large-scale neural networks. The package is found at\nhttps://github.com/VectorInstitute/flex_model.\n","authors":["Matthew Choi","Muhammad Adil Asif","John Willes","David Emerson"],"pdf_url":"https://arxiv.org/pdf/2312.03140v1.pdf","comment":"14 pages, 8 figures. To appear at the Socially Responsible Language\n Modelling Research (SoLaR) Workshop, 37th Conference on Neural Information\n Processing Systems (NeurIPS 2023)"},{"id":"http://arxiv.org/abs/2305.14815v4","updated":"2023-12-05T20:59:58Z","published":"2023-05-24T07:09:56Z","title":"Machine Reading Comprehension using Case-based Reasoning","summary":" We present an accurate and interpretable method for answer extraction in\nmachine reading comprehension that is reminiscent of case-based reasoning (CBR)\nfrom classical AI. Our method (CBR-MRC) builds upon the hypothesis that\ncontextualized answers to similar questions share semantic similarities with\neach other. Given a test question, CBR-MRC first retrieves a set of similar\ncases from a nonparametric memory and then predicts an answer by selecting the\nspan in the test context that is most similar to the contextualized\nrepresentations of answers in the retrieved cases. The semi-parametric nature\nof our approach allows it to attribute a prediction to the specific set of\nevidence cases, making it a desirable choice for building reliable and\ndebuggable QA systems. We show that CBR-MRC provides high accuracy comparable\nwith large reader models and outperforms baselines by 11.5 and 8.4 EM on\nNaturalQuestions and NewsQA, respectively. Further, we demonstrate the ability\nof CBR-MRC in identifying not just the correct answer tokens but also the span\nwith the most relevant supporting evidence. Lastly, we observe that contexts\nfor certain question types show higher lexical diversity than others and find\nthat CBR-MRC is robust to these variations while performance using\nfully-parametric methods drops.\n","authors":["Dung Thai","Dhruv Agarwal","Mudit Chaudhary","Wenlong Zhao","Rajarshi Das","Manzil Zaheer","Jay-Yoon Lee","Hannaneh Hajishirzi","Andrew McCallum"],"pdf_url":"https://arxiv.org/pdf/2305.14815v4.pdf","comment":"9 pages, 2 figures"},{"id":"http://arxiv.org/abs/2311.00694v2","updated":"2023-12-05T20:44:45Z","published":"2023-11-01T17:52:15Z","title":"Unleashing the Creative Mind: Language Model As Hierarchical Policy For\n Improved Exploration on Challenging Problem Solving","summary":" Large Language Models (LLMs) have achieved tremendous progress, yet they\nstill often struggle with challenging reasoning problems. Current approaches\naddress this challenge by sampling or searching detailed and low-level\nreasoning chains. However, these methods are still limited in their exploration\ncapabilities, making it challenging for correct solutions to stand out in the\nhuge solution space. In this work, we unleash LLMs' creative potential for\nexploring multiple diverse problem solving strategies by framing an LLM as a\nhierarchical policy via in-context learning. 
This policy comprises a\nvisionary leader that proposes multiple diverse high-level problem-solving\ntactics as hints, accompanied by a follower that executes detailed\nproblem-solving processes following each of the high-level instructions. The\nfollower uses each of the leader's directives as a guide and samples multiple\nreasoning chains to tackle the problem, generating a solution group for each\nleader proposal. Additionally, we propose an effective and efficient\ntournament-based approach to select among these explored solution groups to\nreach the final answer. Our approach produces meaningful and inspiring hints,\nenhances problem-solving strategy exploration, and improves the final answer\naccuracy on challenging problems in the MATH dataset. Code will be released at\nhttps://github.com/lz1oceani/LLM-As-Hierarchical-Policy.\n","authors":["Zhan Ling","Yunhao Fang","Xuanlin Li","Tongzhou Mu","Mingu Lee","Reza Pourreza","Roland Memisevic","Hao Su"],"pdf_url":"https://arxiv.org/pdf/2311.00694v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03122v1","updated":"2023-12-05T20:41:34Z","published":"2023-12-05T20:41:34Z","title":"Assertion Enhanced Few-Shot Learning: Instructive Technique for Large\n Language Models to Generate Educational Explanations","summary":" Human educators possess an intrinsic ability to anticipate and seek\neducational explanations from students, which drives them to pose\nthought-provoking questions when students cannot articulate these explanations\nindependently. We aim to imbue Intelligent Tutoring Systems with this ability\nusing the few-shot learning capability of Large Language Models. Our work proposes\na novel prompting technique, Assertion Enhanced Few-Shot Learning, to\nfacilitate the generation of accurate, detail-oriented educational\nexplanations. Our central hypothesis is that, in the educational domain, few-shot\ndemonstrations are a necessary but not sufficient condition for quality\nexplanation generation. We conducted a study involving 12 in-service teachers,\ncomparing our approach to Traditional Few-Shot Learning. The results show that\nAssertion Enhanced Few-Shot Learning improves explanation accuracy by 15% and\nyields higher-quality explanations, as evaluated by teachers. We also conduct a\nqualitative ablation study to factor the impact of assertions to provide\neducator-friendly prompting guidelines for generating explanations in their\ndomain of interest.\n","authors":["Tasmia Shahriar","Noboru Matsuda","Kelly Ramos"],"pdf_url":"https://arxiv.org/pdf/2312.03122v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09428v2","updated":"2023-12-05T20:36:23Z","published":"2023-11-15T22:57:13Z","title":"Beyond Detection: Unveiling Fairness Vulnerabilities in Abusive Language\n Models","summary":" This work investigates the potential of undermining both fairness and\ndetection performance in abusive language detection. In a dynamic and complex\ndigital world, it is crucial to investigate the vulnerabilities of these\ndetection models to adversarial fairness attacks to improve their fairness\nrobustness. We propose a simple yet effective framework FABLE that leverages\nbackdoor attacks as they allow targeted control over the fairness and detection\nperformance. FABLE explores three types of trigger designs (i.e., rare,\nartificial, and natural triggers) and novel sampling strategies. 
Specifically,\nthe adversary can inject triggers into samples in the minority group with the\nfavored outcome (i.e., \"non-abusive\") and flip their labels to the unfavored\noutcome, i.e., \"abusive\". Experiments on benchmark datasets demonstrate the\neffectiveness of FABLE attacking fairness and utility in abusive language\ndetection.\n","authors":["Yueqing Liang","Lu Cheng","Ali Payani","Kai Shu"],"pdf_url":"https://arxiv.org/pdf/2311.09428v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2312.03095v1","updated":"2023-12-05T19:26:28Z","published":"2023-12-05T19:26:28Z","title":"Understanding Environmental Posts: Sentiment and Emotion Analysis of\n Social Media Data","summary":" Social media is now the predominant source of information due to the\navailability of immediate public response. As a result, social media data has\nbecome a valuable resource for comprehending public sentiments. Studies have\nshown that it can amplify ideas and influence public sentiments. This study\nanalyzes the public perception of climate change and the environment over a\ndecade from 2014 to 2023. Using the Pointwise Mutual Information (PMI)\nalgorithm, we identify sentiment and explore prevailing emotions expressed\nwithin environmental tweets across various social media platforms, namely\nTwitter, Reddit, and YouTube. Accuracy on a human-annotated dataset was 0.65,\nhigher than the VADER score but lower than that of an expert rater (0.90). Our\nfindings suggest that negative environmental tweets are far more common than\npositive or neutral ones. Climate change, air quality, emissions, plastic, and\nrecycling are the most discussed topics on all social media platforms,\nhighlighting the huge global concern they raise. The most common emotions in environmental\ntweets are fear, trust, and anticipation, demonstrating the wide\nand complex nature of public reactions. By identifying patterns and trends in opinions related to\nthe environment, we hope to provide insights that can help raise awareness\nregarding environmental issues, inform the development of interventions, and\nadapt further actions to meet environmental challenges.\n","authors":["Daniyar Amangeldi","Aida Usmanova","Pakizar Shamoi"],"pdf_url":"https://arxiv.org/pdf/2312.03095v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2303.09128v2","updated":"2023-12-05T19:25:52Z","published":"2023-03-16T07:45:46Z","title":"Exploring Distributional Shifts in Large Language Models for Code\n Analysis","summary":" We systematically study how three large language models with code\ncapabilities - CodeT5, Codex, and ChatGPT - generalize to out-of-domain data.\nWe consider two fundamental applications - code summarization, and code\ngeneration. We split data into domains following its natural boundaries - by an\norganization, by a project, and by a module within the software project. We\nestablish that samples from each new domain present all the models with a\nsignificant challenge of distribution shift. We study how established methods\nadapt models to better generalize to new domains. Our experiments show that\nwhile multitask learning alone is a reasonable baseline, combining it with\nfew-shot finetuning on examples retrieved from training data can achieve very\nstrong performance. Moreover, this solution can outperform direct finetuning\nfor very low-data scenarios. 
Finally, we consider variations of this approach\nto create a more broadly applicable method to adapt to multiple domains at\nonce. We find that for code generation, a model adapted to multiple domains\nsimultaneously performs on par with those adapted to a single domain\n","authors":["Shushan Arakelyan","Rocktim Jyoti Das","Yi Mao","Xiang Ren"],"pdf_url":"https://arxiv.org/pdf/2303.09128v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03093v1","updated":"2023-12-05T19:25:38Z","published":"2023-12-05T19:25:38Z","title":"RESIN-EDITOR: A Schema-guided Hierarchical Event Graph Visualizer and\n Editor","summary":" In this paper, we present RESIN-EDITOR, an interactive event graph visualizer\nand editor designed for analyzing complex events. Our RESIN-EDITOR system\nallows users to render and freely edit hierarchical event graphs extracted from\nmultimedia and multi-document news clusters with guidance from human-curated\nevent schemas. RESIN-EDITOR's unique features include hierarchical graph\nvisualization, comprehensive source tracing, and interactive user editing,\nwhich is more powerful and versatile than existing Information Extraction (IE)\nvisualization tools. In our evaluation of RESIN-EDITOR, we demonstrate ways in\nwhich our tool is effective in understanding complex events and enhancing\nsystem performance. The source code, a video demonstration, and a live website\nfor RESIN-EDITOR have been made publicly available.\n","authors":["Khanh Duy Nguyen","Zixuan Zhang","Reece Suchocki","Sha Li","Martha Palmer","Susan Brown","Jiawei Han","Heng Ji"],"pdf_url":"https://arxiv.org/pdf/2312.03093v1.pdf","comment":"The first two authors contribute equally to this paper"},{"id":"http://arxiv.org/abs/2312.03088v1","updated":"2023-12-05T19:04:50Z","published":"2023-12-05T19:04:50Z","title":"LLMs for Multi-Modal Knowledge Extraction and Analysis in\n Intelligence/Safety-Critical Applications","summary":" Large Language Models have seen rapid progress in capability in recent years;\nthis progress has been accelerating and their capabilities, measured by various\nbenchmarks, are beginning to approach those of humans. There is a strong demand\nto use such models in a wide variety of applications but, due to unresolved\nvulnerabilities and limitations, great care needs to be used before applying\nthem to intelligence and safety-critical applications. This paper reviews\nrecent literature related to LLM assessment and vulnerabilities to synthesize\nthe current research landscape and to help understand what advances are most\ncritical to enable use of of these technologies in intelligence and\nsafety-critical applications. The vulnerabilities are broken down into ten\nhigh-level categories and overlaid onto a high-level life cycle of an LLM. Some\ngeneral categories of mitigations are reviewed.\n","authors":["Brett Israelsen","Soumalya Sarkar"],"pdf_url":"https://arxiv.org/pdf/2312.03088v1.pdf","comment":"initial draft"},{"id":"http://arxiv.org/abs/2312.03077v1","updated":"2023-12-05T19:00:18Z","published":"2023-12-05T19:00:18Z","title":"Clinical Notes Reveal Physician Fatigue","summary":" Physicians write notes about patients. In doing so, they reveal much about\nthemselves. Using data from 129,228 emergency room visits, we train a model to\nidentify notes written by fatigued physicians -- those who worked 5 or more of\nthe prior 7 days. 
In a hold-out set, the model accurately identifies notes\nwritten by these high-workload physicians, and also flags notes written in\nother high-fatigue settings: on overnight shifts, and after high patient\nvolumes. Model predictions also correlate with worse decision-making on at\nleast one important metric: yield of testing for heart attack is 18% lower with\neach standard deviation increase in model-predicted fatigue. Finally, the model\nindicates that notes written about Black and Hispanic patients have 12% and 21%\nhigher predicted fatigue than Whites -- larger than overnight vs. daytime\ndifferences. These results have an important implication for large language\nmodels (LLMs). Our model indicates that fatigued doctors write more predictable\nnotes. Perhaps unsurprisingly, because word prediction is the core of how LLMs\nwork, we find that LLM-written notes have 17% higher predicted fatigue than\nreal physicians' notes. This indicates that LLMs may introduce distortions in\ngenerated text that are not yet fully understood.\n","authors":["Chao-Chun Hsu","Ziad Obermeyer","Chenhao Tan"],"pdf_url":"https://arxiv.org/pdf/2312.03077v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03052v1","updated":"2023-12-05T18:58:37Z","published":"2023-12-05T18:58:37Z","title":"Visual Program Distillation: Distilling Tools and Programmatic Reasoning\n into Vision-Language Models","summary":" Solving complex visual tasks such as \"Who invented the musical instrument on\nthe right?\" involves a composition of skills: understanding space, recognizing\ninstruments, and also retrieving prior knowledge. Recent work shows promise by\ndecomposing such tasks using a large language model (LLM) into an executable\nprogram that invokes specialized vision models. However, generated programs are\nerror-prone: they omit necessary steps, include spurious ones, and are unable\nto recover when the specialized models give incorrect outputs. Moreover, they\nrequire loading multiple models, incurring high latency and computation costs.\nWe propose Visual Program Distillation (VPD), an instruction tuning framework\nthat produces a vision-language model (VLM) capable of solving complex visual\ntasks with a single forward pass. VPD distills the reasoning ability of LLMs by\nusing them to sample multiple candidate programs, which are then executed and\nverified to identify a correct one. It translates each correct program into a\nlanguage description of the reasoning steps, which are then distilled into a\nVLM. Extensive experiments show that VPD improves the VLM's ability to count,\nunderstand spatial relations, and reason compositionally. Our VPD-trained\nPaLI-X outperforms all prior VLMs, achieving state-of-the-art performance\nacross complex vision tasks, including MMBench, OK-VQA, A-OKVQA, TallyQA, POPE,\nand Hateful Memes. An evaluation with human annotators also confirms that VPD\nimproves model response factuality and consistency. 
Finally, experiments on\ncontent moderation demonstrate that VPD is also helpful for adaptation to\nreal-world applications with limited data.\n","authors":["Yushi Hu","Otilia Stretcu","Chun-Ta Lu","Krishnamurthy Viswanathan","Kenji Hata","Enming Luo","Ranjay Krishna","Ariel Fuxman"],"pdf_url":"https://arxiv.org/pdf/2312.03052v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03042v1","updated":"2023-12-05T16:02:20Z","published":"2023-12-05T16:02:20Z","title":"Inherent limitations of LLMs regarding spatial information","summary":" Despite the significant advancements in natural language processing\ncapabilities demonstrated by large language models such as ChatGPT, their\nproficiency in comprehending and processing spatial information, especially\nwithin the domains of 2D and 3D route planning, remains notably underdeveloped.\nThis paper investigates the inherent limitations of ChatGPT and similar models\nin spatial reasoning and navigation-related tasks, an area critical for\napplications ranging from autonomous vehicle guidance to assistive technologies\nfor the visually impaired. In this paper, we introduce a novel evaluation\nframework complemented by a baseline dataset, meticulously crafted for this\nstudy. This dataset is structured around three key tasks: plotting spatial\npoints, planning routes in two-dimensional (2D) spaces, and devising pathways\nin three-dimensional (3D) environments. We specifically developed this dataset\nto assess the spatial reasoning abilities of ChatGPT. Our evaluation reveals\nkey insights into the model's capabilities and limitations in spatial\nunderstanding.\n","authors":["He Yan","Xinyao Hu","Xiangpeng Wan","Chengyu Huang","Kai Zou","Shiqi Xu"],"pdf_url":"https://arxiv.org/pdf/2312.03042v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03025v1","updated":"2023-12-05T08:11:34Z","published":"2023-12-05T08:11:34Z","title":"Training on Synthetic Data Beats Real Data in Multimodal Relation\n Extraction","summary":" The task of multimodal relation extraction has attracted significant research\nattention, but progress is constrained by the scarcity of available training\ndata. One natural thought is to extend existing datasets with cross-modal\ngenerative models. In this paper, we consider a novel problem setting, where\nonly unimodal data, either text or image, are available during training. We aim\nto train a multimodal classifier from synthetic data that perform well on real\nmultimodal test data. However, training with synthetic data suffers from two\nobstacles: lack of data diversity and label information loss. To alleviate the\nissues, we propose Mutual Information-aware Multimodal Iterated Relational dAta\nGEneration (MI2RAGE), which applies Chained Cross-modal Generation (CCG) to\npromote diversity in the generated data and exploits a teacher network to\nselect valuable training samples with high mutual information with the\nground-truth labels. Comparing our method to direct training on synthetic data,\nwe observed a significant improvement of 24.06% F1 with synthetic text and\n26.42% F1 with synthetic images. Notably, our best model trained on completely\nsynthetic images outperforms prior state-of-the-art models trained on real\nmultimodal data by a margin of 3.76% in F1. 
Our codebase will be made available\nupon acceptance.\n","authors":["Zilin Du","Haoxin Li","Xu Guo","Boyang Li"],"pdf_url":"https://arxiv.org/pdf/2312.03025v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03022v1","updated":"2023-12-05T07:27:08Z","published":"2023-12-05T07:27:08Z","title":"Beyond Isolation: Multi-Agent Synergy for Improving Knowledge Graph\n Construction","summary":" Knowledge graph construction (KGC) is a multifaceted undertaking involving\nthe extraction of entities, relations, and events. Traditionally, large\nlanguage models (LLMs) have been viewed as solitary task-solving agents in this\ncomplex landscape. However, this paper challenges this paradigm by introducing\na novel framework, CooperKGC. Departing from the conventional approach,\nCooperKGC establishes a collaborative processing network, assembling a KGC\ncollaboration team capable of concurrently addressing entity, relation, and\nevent extraction tasks. Our experiments unequivocally demonstrate that\nfostering collaboration and information interaction among diverse agents within\nCooperKGC yields superior results compared to individual cognitive processes\noperating in isolation. Importantly, our findings reveal that the collaboration\nfacilitated by CooperKGC enhances knowledge selection, correction, and\naggregation capabilities across multiple rounds of interactions.\n","authors":["Hongbin Ye","Honghao Gui","Aijia Zhang","Tong Liu","Wei Hua","Weiqiang Jia"],"pdf_url":"https://arxiv.org/pdf/2312.03022v1.pdf","comment":"work in progress; 12 pages"},{"id":"http://arxiv.org/abs/2312.03016v1","updated":"2023-12-05T01:47:38Z","published":"2023-12-05T01:47:38Z","title":"Protein Language Model-Powered 3D Ligand Binding Site Prediction from\n Protein Sequence","summary":" Prediction of ligand binding sites of proteins is a fundamental and important\ntask for understanding the function of proteins and screening potential drugs.\nMost existing methods require experimentally determined protein holo-structures\nas input. However, such structures can be unavailable on novel or less-studied\nproteins. To tackle this limitation, we propose LaMPSite, which only takes\nprotein sequences and ligand molecular graphs as input for ligand binding site\npredictions. The protein sequences are used to retrieve residue-level\nembeddings and contact maps from the pre-trained ESM-2 protein language model.\nThe ligand molecular graphs are fed into a graph neural network to compute\natom-level embeddings. Then we compute and update the protein-ligand\ninteraction embedding based on the protein residue-level embeddings and ligand\natom-level embeddings, and the geometric constraints in the inferred protein\ncontact map and ligand distance map. A final pooling on protein-ligand\ninteraction embedding would indicate which residues belong to the binding\nsites. Without any 3D coordinate information of proteins, our proposed model\nachieves competitive performance compared to baseline methods that require 3D\nprotein structures when predicting binding sites. 
Given that less than 50% of\nproteins have reliable structure information in the current stage, LaMPSite\nwill provide new opportunities for drug discovery.\n","authors":["Shuo Zhang","Lei Xie"],"pdf_url":"https://arxiv.org/pdf/2312.03016v1.pdf","comment":"Accepted by the AI for Science (AI4Science) Workshop and the New\n Frontiers of AI for Drug Discovery and Development (AI4D3) Workshop at\n NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.03769v1","updated":"2023-12-05T21:41:52Z","published":"2023-12-05T21:41:52Z","title":"GPT vs Human for Scientific Reviews: A Dual Source Review on\n Applications of ChatGPT in Science","summary":" The new polymath Large Language Models (LLMs) can speed-up greatly scientific\nreviews, possibly using more unbiased quantitative metrics, facilitating\ncross-disciplinary connections, and identifying emerging trends and research\ngaps by analyzing large volumes of data. However, at the present time, they\nlack the required deep understanding of complex methodologies, they have\ndifficulty in evaluating innovative claims, and they are unable to assess\nethical issues and conflicts of interest. Herein, we consider 13 GPT-related\npapers across different scientific domains, reviewed by a human reviewer and\nSciSpace, a large language model, with the reviews evaluated by three distinct\ntypes of evaluators, namely GPT-3.5, a crowd panel, and GPT-4. We found that\n50% of SciSpace's responses to objective questions align with those of a human\nreviewer, with GPT-4 (informed evaluator) often rating the human reviewer\nhigher in accuracy, and SciSpace higher in structure, clarity, and\ncompleteness. In subjective questions, the uninformed evaluators (GPT-3.5 and\ncrowd panel) showed varying preferences between SciSpace and human responses,\nwith the crowd panel showing a preference for the human responses. However,\nGPT-4 rated them equally in accuracy and structure but favored SciSpace for\ncompleteness.\n","authors":["Chenxi Wu","Alan John Varghese","Vivek Oommen","George Em Karniadakis"],"pdf_url":"https://arxiv.org/pdf/2312.03769v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03766v1","updated":"2023-12-05T20:07:34Z","published":"2023-12-05T20:07:34Z","title":"Mismatch Quest: Visual and Textual Feedback for Image-Text Misalignment","summary":" While existing image-text alignment models reach high quality binary\nassessments, they fall short of pinpointing the exact source of misalignment.\nIn this paper, we present a method to provide detailed textual and visual\nexplanation of detected misalignments between text-image pairs. We leverage\nlarge language models and visual grounding models to automatically construct a\ntraining set that holds plausible misaligned captions for a given image and\ncorresponding textual explanations and visual indicators. We also publish a new\nhuman curated test set comprising ground-truth textual and visual misalignment\nannotations. Empirical results show that fine-tuning vision language models on\nour training set enables them to articulate misalignments and visually indicate\nthem within images, outperforming strong baselines both on the binary alignment\nclassification and the explanation generation tasks. 
Our method code and human\ncurated test set are available at: https://mismatch-quest.github.io/\n","authors":["Brian Gordon","Yonatan Bitton","Yonatan Shafir","Roopal Garg","Xi Chen","Dani Lischinski","Daniel Cohen-Or","Idan Szpektor"],"pdf_url":"https://arxiv.org/pdf/2312.03766v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2312.02981v1","updated":"2023-12-05T18:59:58Z","published":"2023-12-05T18:59:58Z","title":"ReconFusion: 3D Reconstruction with Diffusion Priors","summary":" 3D reconstruction methods such as Neural Radiance Fields (NeRFs) excel at\nrendering photorealistic novel views of complex scenes. However, recovering a\nhigh-quality NeRF typically requires tens to hundreds of input images,\nresulting in a time-consuming capture process. We present ReconFusion to\nreconstruct real-world scenes using only a few photos. Our approach leverages a\ndiffusion prior for novel view synthesis, trained on synthetic and multiview\ndatasets, which regularizes a NeRF-based 3D reconstruction pipeline at novel\ncamera poses beyond those captured by the set of input images. Our method\nsynthesizes realistic geometry and texture in underconstrained regions while\npreserving the appearance of observed regions. We perform an extensive\nevaluation across various real-world datasets, including forward-facing and\n360-degree scenes, demonstrating significant performance improvements over\nprevious few-view NeRF reconstruction approaches.\n","authors":["Rundi Wu","Ben Mildenhall","Philipp Henzler","Keunhong Park","Ruiqi Gao","Daniel Watson","Pratul P. Srinivasan","Dor Verbin","Jonathan T. Barron","Ben Poole","Aleksander Holynski"],"pdf_url":"https://arxiv.org/pdf/2312.02981v1.pdf","comment":"Project page: https://reconfusion.github.io/"},{"id":"http://arxiv.org/abs/2312.02980v1","updated":"2023-12-05T18:59:55Z","published":"2023-12-05T18:59:55Z","title":"GPT4Point: A Unified Framework for Point-Language Understanding and\n Generation","summary":" Multimodal Large Language Models (MLLMs) have excelled in 2D image-text\ncomprehension and image generation, but their understanding of the 3D world is\nnotably deficient, limiting progress in 3D language understanding and\ngeneration. To solve this problem, we introduce GPT4Point, an innovative\ngroundbreaking point-language multimodal model designed specifically for\nunified 3D object understanding and generation within the MLLM framework.\nGPT4Point as a powerful 3D MLLM seamlessly can execute a variety of point-text\nreference tasks such as point-cloud captioning and Q&A. Additionally, GPT4Point\nis equipped with advanced capabilities for controllable 3D generation, it can\nget high-quality results through a low-quality point-text feature maintaining\nthe geometric shapes and colors. To support the expansive needs of 3D\nobject-text pairs, we develop Pyramid-XL, a point-language dataset annotation\nengine. It constructs a large-scale database over 1M objects of varied text\ngranularity levels from the Objaverse-XL dataset, essential for training\nGPT4Point. A comprehensive benchmark has been proposed to evaluate 3D\npoint-language understanding capabilities. 
In extensive evaluations, GPT4Point\nhas demonstrated superior performance in understanding and generation.\n","authors":["Zhangyang Qi","Ye Fang","Zeyi Sun","Xiaoyang Wu","Tong Wu","Jiaqi Wang","Dahua Lin","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2312.02980v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02976v1","updated":"2023-12-05T18:59:45Z","published":"2023-12-05T18:59:45Z","title":"Imitating Shortest Paths in Simulation Enables Effective Navigation and\n Manipulation in the Real World","summary":" Reinforcement learning (RL) with dense rewards and imitation learning (IL)\nwith human-generated trajectories are the most widely used approaches for\ntraining modern embodied agents. RL requires extensive reward shaping and\nauxiliary losses and is often too slow and ineffective for long-horizon tasks.\nWhile IL with human supervision is effective, collecting human trajectories at\nscale is extremely expensive. In this work, we show that imitating\nshortest-path planners in simulation produces agents that, given a language\ninstruction, can proficiently navigate, explore, and manipulate objects in both\nsimulation and in the real world using only RGB sensors (no depth map or GPS\ncoordinates). This surprising result is enabled by our end-to-end,\ntransformer-based, SPOC architecture, powerful visual encoders paired with\nextensive image augmentation, and the dramatic scale and diversity of our\ntraining data: millions of frames of shortest-path-expert trajectories\ncollected inside approximately 200,000 procedurally generated houses containing\n40,000 unique 3D assets. Our models, data, training code, and newly proposed\n10-task benchmarking suite CHORES will be open-sourced.\n","authors":["Kiana Ehsani","Tanmay Gupta","Rose Hendrix","Jordi Salvador","Luca Weihs","Kuo-Hao Zeng","Kunal Pratap Singh","Yejin Kim","Winson Han","Alvaro Herrasti","Ranjay Krishna","Dustin Schwenk","Eli VanderBilt","Aniruddha Kembhavi"],"pdf_url":"https://arxiv.org/pdf/2312.02976v1.pdf","comment":"First six authors contributed equally. Project page:\n https://spoc-robot.github.io/"},{"id":"http://arxiv.org/abs/2312.02975v1","updated":"2023-12-05T18:59:23Z","published":"2023-12-05T18:59:23Z","title":"Dexterous Functional Grasping","summary":" While there have been significant strides in dexterous manipulation, most of\nit is limited to benchmark tasks like in-hand reorientation which are of\nlimited utility in the real world. The main benefit of dexterous hands over\ntwo-fingered ones is their ability to pickup tools and other objects (including\nthin ones) and grasp them firmly to apply force. However, this task requires\nboth a complex understanding of functional affordances as well as precise\nlow-level control. While prior work obtains affordances from human data this\napproach doesn't scale to low-level control. Similarly, simulation training\ncannot give the robot an understanding of real-world semantics. In this paper,\nwe aim to combine the best of both worlds to accomplish functional grasping for\nin-the-wild objects. We use a modular approach. First, affordances are obtained\nby matching corresponding regions of different objects and then a low-level\npolicy trained in sim is run to grasp it. We propose a novel application of\neigengrasps to reduce the search space of RL using a small amount of human data\nand find that it leads to more stable and physically realistic motion. 
We find\nthat eigengrasp action space beats baselines in simulation and outperforms\nhardcoded grasping in real and matches or outperforms a trained human\nteleoperator. Results visualizations and videos at https://dexfunc.github.io/\n","authors":["Ananye Agarwal","Shagun Uppal","Kenneth Shaw","Deepak Pathak"],"pdf_url":"https://arxiv.org/pdf/2312.02975v1.pdf","comment":"In CoRL 2023. Website at https://dexfunc.github.io/"},{"id":"http://arxiv.org/abs/2312.02974v1","updated":"2023-12-05T18:59:16Z","published":"2023-12-05T18:59:16Z","title":"Describing Differences in Image Sets with Natural Language","summary":" How do two sets of images differ? Discerning set-level differences is crucial\nfor understanding model behaviors and analyzing datasets, yet manually sifting\nthrough thousands of images is impractical. To aid in this discovery process,\nwe explore the task of automatically describing the differences between two\n$\\textbf{sets}$ of images, which we term Set Difference Captioning. This task\ntakes in image sets $D_A$ and $D_B$, and outputs a description that is more\noften true on $D_A$ than $D_B$. We outline a two-stage approach that first\nproposes candidate difference descriptions from image sets and then re-ranks\nthe candidates by checking how well they can differentiate the two sets. We\nintroduce VisDiff, which first captions the images and prompts a language model\nto propose candidate descriptions, then re-ranks these descriptions using CLIP.\nTo evaluate VisDiff, we collect VisDiffBench, a dataset with 187 paired image\nsets with ground truth difference descriptions. We apply VisDiff to various\ndomains, such as comparing datasets (e.g., ImageNet vs. ImageNetV2), comparing\nclassification models (e.g., zero-shot CLIP vs. supervised ResNet), summarizing\nmodel failure modes (supervised ResNet), characterizing differences between\ngenerative models (e.g., StableDiffusionV1 and V2), and discovering what makes\nimages memorable. Using VisDiff, we are able to find interesting and previously\nunknown differences in datasets and models, demonstrating its utility in\nrevealing nuanced insights.\n","authors":["Lisa Dunlap","Yuhui Zhang","Xiaohan Wang","Ruiqi Zhong","Trevor Darrell","Jacob Steinhardt","Joseph E. Gonzalez","Serena Yeung-Levy"],"pdf_url":"https://arxiv.org/pdf/2312.02974v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02973v1","updated":"2023-12-05T18:59:14Z","published":"2023-12-05T18:59:14Z","title":"GauHuman: Articulated Gaussian Splatting from Monocular Human Videos","summary":" We present, GauHuman, a 3D human model with Gaussian Splatting for both fast\ntraining (1 ~ 2 minutes) and real-time rendering (up to 189 FPS), compared with\nexisting NeRF-based implicit representation modelling frameworks demanding\nhours of training and seconds of rendering per frame. Specifically, GauHuman\nencodes Gaussian Splatting in the canonical space and transforms 3D Gaussians\nfrom canonical space to posed space with linear blend skinning (LBS), in which\neffective pose and LBS refinement modules are designed to learn fine details of\n3D humans under negligible computational cost. Moreover, to enable fast\noptimization of GauHuman, we initialize and prune 3D Gaussians with 3D human\nprior, while splitting/cloning via KL divergence guidance, along with a novel\nmerge operation for further speeding up. 
Extensive experiments on ZJU_Mocap and\nMonoCap datasets demonstrate that GauHuman achieves state-of-the-art\nperformance quantitatively and qualitatively with fast training and real-time\nrendering speed. Notably, without sacrificing rendering quality, GauHuman can\nfast model the 3D human performer with ~13k 3D Gaussians.\n","authors":["Shoukang Hu","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2312.02973v1.pdf","comment":"project page: https://skhu101.github.io/GauHuman/; code:\n https://github.com/skhu101/GauHuman"},{"id":"http://arxiv.org/abs/2312.02970v1","updated":"2023-12-05T18:58:26Z","published":"2023-12-05T18:58:26Z","title":"Alchemist: Parametric Control of Material Properties with Diffusion\n Models","summary":" We propose a method to control material attributes of objects like roughness,\nmetallic, albedo, and transparency in real images. Our method capitalizes on\nthe generative prior of text-to-image models known for photorealism, employing\na scalar value and instructions to alter low-level material properties.\nAddressing the lack of datasets with controlled material attributes, we\ngenerated an object-centric synthetic dataset with physically-based materials.\nFine-tuning a modified pre-trained text-to-image model on this synthetic\ndataset enables us to edit material properties in real-world images while\npreserving all other attributes. We show the potential application of our model\nto material edited NeRFs.\n","authors":["Prafull Sharma","Varun Jampani","Yuanzhen Li","Xuhui Jia","Dmitry Lagun","Fredo Durand","William T. Freeman","Mark Matthews"],"pdf_url":"https://arxiv.org/pdf/2312.02970v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02967v1","updated":"2023-12-05T18:56:06Z","published":"2023-12-05T18:56:06Z","title":"AmbiGen: Generating Ambigrams from Pre-trained Diffusion Model","summary":" Ambigrams are calligraphic designs that have different meanings depending on\nthe viewing orientation. Creating ambigrams is a challenging task even for\nskilled artists, as it requires maintaining the meaning under two different\nviewpoints at the same time. In this work, we propose to generate ambigrams by\ndistilling a large-scale vision and language diffusion model, namely DeepFloyd\nIF, to optimize the letters' outline for legibility in the two viewing\norientations. Empirically, we demonstrate that our approach outperforms\nexisting ambigram generation methods. On the 500 most common words in English,\nour method achieves more than an 11.6% increase in word accuracy and at least a\n41.9% reduction in edit distance.\n","authors":["Boheng Zhao","Rana Hanocka","Raymond A. Yeh"],"pdf_url":"https://arxiv.org/pdf/2312.02967v1.pdf","comment":"Project page: https://raymond-yeh.com/AmbiGen/"},{"id":"http://arxiv.org/abs/2312.02966v1","updated":"2023-12-05T18:54:03Z","published":"2023-12-05T18:54:03Z","title":"Diffusion-SS3D: Diffusion Model for Semi-supervised 3D Object Detection","summary":" Semi-supervised object detection is crucial for 3D scene understanding,\nefficiently addressing the limitation of acquiring large-scale 3D bounding box\nannotations. Existing methods typically employ a teacher-student framework with\npseudo-labeling to leverage unlabeled point clouds. However, producing reliable\npseudo-labels in a diverse 3D space still remains challenging. 
In this work, we\npropose Diffusion-SS3D, a new perspective of enhancing the quality of\npseudo-labels via the diffusion model for semi-supervised 3D object detection.\nSpecifically, we include noises to produce corrupted 3D object size and class\nlabel distributions, and then utilize the diffusion model as a denoising\nprocess to obtain bounding box outputs. Moreover, we integrate the diffusion\nmodel into the teacher-student framework, so that the denoised bounding boxes\ncan be used to improve pseudo-label generation, as well as the entire\nsemi-supervised learning process. We conduct experiments on the ScanNet and SUN\nRGB-D benchmark datasets to demonstrate that our approach achieves\nstate-of-the-art performance against existing methods. We also present\nextensive analysis to understand how our diffusion model design affects\nperformance in semi-supervised learning.\n","authors":["Cheng-Ju Ho","Chen-Hsuan Tai","Yen-Yu Lin","Ming-Hsuan Yang","Yi-Hsuan Tsai"],"pdf_url":"https://arxiv.org/pdf/2312.02966v1.pdf","comment":"Accepted in NeurIPS 2023. Code is available at\n https://github.com/luluho1208/Diffusion-SS3D"},{"id":"http://arxiv.org/abs/2312.02963v1","updated":"2023-12-05T18:50:12Z","published":"2023-12-05T18:50:12Z","title":"MVHumanNet: A Large-scale Dataset of Multi-view Daily Dressing Human\n Captures","summary":" In this era, the success of large language models and text-to-image models\ncan be attributed to the driving force of large-scale datasets. However, in the\nrealm of 3D vision, while remarkable progress has been made with models trained\non large-scale synthetic and real-captured object data like Objaverse and\nMVImgNet, a similar level of progress has not been observed in the domain of\nhuman-centric tasks partially due to the lack of a large-scale human dataset.\nExisting datasets of high-fidelity 3D human capture continue to be mid-sized\ndue to the significant challenges in acquiring large-scale high-quality 3D\nhuman data. To bridge this gap, we present MVHumanNet, a dataset that comprises\nmulti-view human action sequences of 4,500 human identities. The primary focus\nof our work is on collecting human data that features a large number of diverse\nidentities and everyday clothing using a multi-view human capture system, which\nfacilitates easily scalable data collection. Our dataset contains 9,000 daily\noutfits, 60,000 motion sequences and 645 million frames with extensive\nannotations, including human masks, camera parameters, 2D and 3D keypoints,\nSMPL/SMPLX parameters, and corresponding textual descriptions. To explore the\npotential of MVHumanNet in various 2D and 3D visual tasks, we conducted pilot\nstudies on view-consistent action recognition, human NeRF reconstruction,\ntext-driven view-unconstrained human image generation, as well as 2D\nview-unconstrained human image and 3D avatar generation. Extensive experiments\ndemonstrate the performance improvements and effective applications enabled by\nthe scale provided by MVHumanNet. 
As the current largest-scale 3D human\ndataset, we hope that the release of MVHumanNet data with annotations will\nfoster further innovations in the domain of 3D human-centric tasks at scale.\n","authors":["Zhangyang Xiong","Chenghong Li","Kenkun Liu","Hongjie Liao","Jianqiao Hu","Junyi Zhu","Shuliang Ning","Lingteng Qiu","Chongjie Wang","Shijie Wang","Shuguang Cui","Xiaoguang Han"],"pdf_url":"https://arxiv.org/pdf/2312.02963v1.pdf","comment":"Project page: https://x-zhangyang.github.io/MVHumanNet/"},{"id":"http://arxiv.org/abs/2312.02957v1","updated":"2023-12-05T18:41:03Z","published":"2023-12-05T18:41:03Z","title":"Classification for everyone : Building geography agnostic models for\n fairer recognition","summary":" In this paper, we analyze different methods to mitigate inherent geographical\nbiases present in state of the art image classification models. We first\nquantitatively present this bias in two datasets - The Dollar Street Dataset\nand ImageNet, using images with location information. We then present different\nmethods which can be employed to reduce this bias. Finally, we analyze the\neffectiveness of the different techniques on making these models more robust to\ngeographical locations of the images.\n","authors":["Akshat Jindal","Shreya Singh","Soham Gadgil"],"pdf_url":"https://arxiv.org/pdf/2312.02957v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02956v1","updated":"2023-12-05T18:40:40Z","published":"2023-12-05T18:40:40Z","title":"Choroidalyzer: An open-source, end-to-end pipeline for choroidal\n analysis in optical coherence tomography","summary":" Purpose: To develop Choroidalyzer, an open-source, end-to-end pipeline for\nsegmenting the choroid region, vessels, and fovea, and deriving choroidal\nthickness, area, and vascular index.\n Methods: We used 5,600 OCT B-scans (233 subjects, 6 systemic disease cohorts,\n3 device types, 2 manufacturers). To generate region and vessel ground-truths,\nwe used state-of-the-art automatic methods following manual correction of\ninaccurate segmentations, with foveal positions manually annotated. We trained\na U-Net deep-learning model to detect the region, vessels, and fovea to\ncalculate choroid thickness, area, and vascular index in a fovea-centred region\nof interest. We analysed segmentation agreement (AUC, Dice) and choroid metrics\nagreement (Pearson, Spearman, mean absolute error (MAE)) in internal and\nexternal test sets. We compared Choroidalyzer to two manual graders on a small\nsubset of external test images and examined cases of high error.\n Results: Choroidalyzer took 0.299 seconds per image on a standard laptop and\nachieved excellent region (Dice: internal 0.9789, external 0.9749), very good\nvessel segmentation performance (Dice: internal 0.8817, external 0.8703) and\nexcellent fovea location prediction (MAE: internal 3.9 pixels, external 3.4\npixels). For thickness, area, and vascular index, Pearson correlations were\n0.9754, 0.9815, and 0.8285 (internal) / 0.9831, 0.9779, 0.7948 (external),\nrespectively (all p<0.0001). Choroidalyzer's agreement with graders was\ncomparable to the inter-grader agreement across all metrics.\n Conclusions: Choroidalyzer is an open-source, end-to-end pipeline that\naccurately segments the choroid and reliably extracts thickness, area, and\nvascular index. 
Especially choroidal vessel segmentation is a difficult and\nsubjective task, and fully-automatic methods like Choroidalyzer could provide\nobjectivity and standardisation.\n","authors":["Justin Engelmann","Jamie Burke","Charlene Hamid","Megan Reid-Schachter","Dan Pugh","Neeraj Dhaun","Diana Moukaddem","Lyle Gray","Niall Strang","Paul McGraw","Amos Storkey","Paul J. Steptoe","Stuart King","Tom MacGillivray","Miguel O. Bernabeu","Ian J. C. MacCormick"],"pdf_url":"https://arxiv.org/pdf/2312.02956v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.16450v2","updated":"2023-12-05T18:35:18Z","published":"2023-05-25T19:54:33Z","title":"Investigation of UAV Detection in Images with Complex Backgrounds and\n Rainy Artifacts","summary":" To detect unmanned aerial vehicles (UAVs) in real-time, computer vision and\ndeep learning approaches are evolving research areas. Interest in this problem\nhas grown due to concerns regarding the possible hazards and misuse of\nemploying UAVs in many applications. These include potential privacy\nviolations. To address the concerns, vision-based object detection methods have\nbeen developed for UAV detection. However, UAV detection in images with complex\nbackgrounds and weather artifacts like rain has yet to be reasonably studied.\nHence, for this purpose, we prepared two training datasets. The first dataset\nhas the sky as its background and is called the Sky Background Dataset (SBD).\nThe second training dataset has more complex scenes (with diverse backgrounds)\nand is named the Complex Background Dataset (CBD). Additionally, two test sets\nwere prepared: one containing clear images and the other with images with three\nrain artifacts, named the Rainy Test Set (RTS). This work also focuses on\nbenchmarking state-of-the-art object detection models, and to the best of our\nknowledge, it is the first to investigate the performance of recent and popular\nvision-based object detection methods for UAV detection under challenging\nconditions such as complex backgrounds, varying UAV sizes, and low-to-heavy\nrainy conditions. The findings presented in the paper shall help provide\ninsights concerning the performance of the selected models for UAV detection\nunder challenging conditions and pave the way to develop more robust UAV\ndetection methods. The codes and datasets are available at:\nhttps://github.com/AdnanMunir294/UAVD-CBRA.\n","authors":["Adnan Munir","Abdul Jabbar Siddiqui","Saeed Anwar"],"pdf_url":"https://arxiv.org/pdf/2305.16450v2.pdf","comment":"Accepted at the Real-World Surveillance Workshop, IEEE/CVF Winter\n Conference on Applications of Computer Vision 2024"},{"id":"http://arxiv.org/abs/2312.02949v1","updated":"2023-12-05T18:29:31Z","published":"2023-12-05T18:29:31Z","title":"LLaVA-Grounding: Grounded Visual Chat with Large Multimodal Models","summary":" With the recent significant advancements in large multi-modal models (LMMs),\nthe importance of their grounding capability in visual chat is increasingly\nrecognized. Despite recent efforts to enable LMMs to support grounding, their\ncapabilities for grounding and chat are usually separate, and their chat\nperformance drops dramatically when asked to ground. The problem is the lack of\na dataset for grounded visual chat (GVC). Existing grounding datasets only\ncontain short captions. To address this issue, we have created GVC data that\nallows for the combination of grounding and chat capabilities. To better\nevaluate the GVC capabilities, we have introduced a benchmark called\nGrounding-Bench. 
Additionally, we have proposed a model design that can support\nGVC and various types of visual prompts by connecting segmentation models with\nlanguage models. Experimental results demonstrate that our model outperforms\nother LMMs on Grounding-Bench. Furthermore, our model achieves competitive\nperformance on classic grounding benchmarks like RefCOCO/+/g and Flickr30K\nEntities. Our code will be released at\nhttps://github.com/UX-Decoder/LLaVA-Grounding .\n","authors":["Hao Zhang","Hongyang Li","Feng Li","Tianhe Ren","Xueyan Zou","Shilong Liu","Shijia Huang","Jianfeng Gao","Lei Zhang","Chunyuan Li","Jianwei Yang"],"pdf_url":"https://arxiv.org/pdf/2312.02949v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02941v1","updated":"2023-12-05T18:09:47Z","published":"2023-12-05T18:09:47Z","title":"Fast CT anatomic localization algorithm","summary":" Automatically determining the position of every slice in a CT scan is a basic\nyet powerful capability allowing fast retrieval of regions of interest for\nvisual inspection and automated analysis. Unlike conventional localization\napproaches which work at the slice level, we directly localize only a fraction\nof the slices and then fit a linear model which maps slice index to its\nestimated axial anatomical position based on those slices. The model is then\nused to assign an axial position to every slice of the scan. This approach proves\nto be both computationally efficient, with a typical processing time of less\nthan a second per scan (regardless of its size), accurate, with a typical\nmedian localization error of 1 cm, and robust to different noise sources,\nimaging protocols, metal-induced artifacts, anatomical deformations, etc.\nAnother key element of our approach is the introduction of a mapping confidence\nscore. This score acts as a fail-safe mechanism which allows rejection of\nunreliable localization results in rare cases of anomalous scans. Our algorithm\nsets new state-of-the-art results in terms of localization accuracy. It also\noffers a decrease of two orders of magnitude in processing time with respect to\nall published processing times. It was designed to be invariant to various scan\nresolutions, scan protocols, patient orientations, strong artifacts and various\ndeformations and abnormalities. Additionally, to the best of our knowledge, our\nalgorithm is the first to support the entire body from head to feet rather than\nbeing confined to a specific anatomical region. This algorithm was tested on\nthousands of scans and proves to be very reliable and useful as a preprocessing\nstage for many applications.\n","authors":["Amit Oved"],"pdf_url":"https://arxiv.org/pdf/2312.02941v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02936v1","updated":"2023-12-05T18:05:59Z","published":"2023-12-05T18:05:59Z","title":"Drag-A-Video: Non-rigid Video Editing with Point-based Interaction","summary":" Video editing is a challenging task that requires manipulating videos on both\nthe spatial and temporal dimensions. Existing methods for video editing mainly\nfocus on changing the appearance or style of the objects in the video, while\nkeeping their structures unchanged. However, there is no existing method that\nallows users to interactively ``drag'' any points of instances on the first\nframe to precisely reach the target points with other frames consistently\ndeformed. In this paper, we propose a new diffusion-based method for\ninteractive point-based video manipulation, called Drag-A-Video. 
Our method\nallows users to click pairs of handle points and target points as well as masks\non the first frame of an input video. Then, our method transforms the inputs\ninto point sets and propagates these sets across frames. To precisely modify\nthe contents of the video, we employ a new video-level motion supervision to\nupdate the features of the video and introduce the latent offsets to achieve\nthis update at multiple denoising timesteps. We propose a temporal-consistent\npoint tracking module to coordinate the movement of the points in the handle\npoint sets. We demonstrate the effectiveness and flexibility of our method on\nvarious videos. The website of our work is available here:\nhttps://drag-a-video.github.io/.\n","authors":["Yao Teng","Enze Xie","Yue Wu","Haoyu Han","Zhenguo Li","Xihui Liu"],"pdf_url":"https://arxiv.org/pdf/2312.02936v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02934v1","updated":"2023-12-05T18:05:14Z","published":"2023-12-05T18:05:14Z","title":"WoVoGen: World Volume-aware Diffusion for Controllable Multi-camera\n Driving Scene Generation","summary":" Generating multi-camera street-view videos is critical for augmenting\nautonomous driving datasets, addressing the urgent demand for extensive and\nvaried data. Due to the limitations in diversity and challenges in handling\nlighting conditions, traditional rendering-based methods are increasingly being\nsupplanted by diffusion-based methods. However, a significant challenge in\ndiffusion-based methods is ensuring that the generated sensor data preserve\nboth intra-world consistency and inter-sensor coherence. To address these\nchallenges, we combine an additional explicit world volume and propose the\nWorld Volume-aware Multi-camera Driving Scene Generator (WoVoGen). This system\nis specifically designed to leverage 4D world volume as a foundational element\nfor video generation. Our model operates in two distinct phases: (i)\nenvisioning the future 4D temporal world volume based on vehicle control\nsequences, and (ii) generating multi-camera videos, informed by this envisioned\n4D temporal world volume and sensor interconnectivity. The incorporation of the\n4D world volume empowers WoVoGen not only to generate high-quality street-view\nvideos in response to vehicle control inputs but also to facilitate scene\nediting tasks.\n","authors":["Jiachen Lu","Ze Huang","Jiahui Zhang","Zeyu Yang","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.02934v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02928v1","updated":"2023-12-05T17:59:52Z","published":"2023-12-05T17:59:52Z","title":"LivePhoto: Real Image Animation with Text-guided Motion Control","summary":" Despite the recent progress in text-to-video generation, existing studies\nusually overlook the issue that only spatial contents but not temporal motions\nin synthesized videos are under the control of text. Towards such a challenge,\nthis work presents a practical system, named LivePhoto, which allows users to\nanimate an image of their interest with text descriptions. We first establish a\nstrong baseline that helps a well-learned text-to-image generator (i.e., Stable\nDiffusion) take an image as a further input. We then equip the improved\ngenerator with a motion module for temporal modeling and propose a carefully\ndesigned training pipeline to better link texts and motions. 
In particular,\nconsidering the facts that (1) text can only describe motions roughly (e.g.,\nregardless of the moving speed) and (2) text may include both content and\nmotion descriptions, we introduce a motion intensity estimation module as well\nas a text re-weighting module to reduce the ambiguity of text-to-motion\nmapping. Empirical evidence suggests that our approach is capable of well\ndecoding motion-related textual instructions into videos, such as actions,\ncamera movements, or even conjuring new contents from thin air (e.g., pouring\nwater into an empty glass). Interestingly, thanks to the proposed intensity\nlearning mechanism, our system offers users an additional control signal (i.e.,\nthe motion intensity) besides text for video customization.\n","authors":["Xi Chen","Zhiheng Liu","Mengting Chen","Yutong Feng","Yu Liu","Yujun Shen","Hengshuang Zhao"],"pdf_url":"https://arxiv.org/pdf/2312.02928v1.pdf","comment":"Project page: https://xavierchen34.github.io/LivePhoto-Page/"},{"id":"http://arxiv.org/abs/2312.02923v1","updated":"2023-12-05T17:50:55Z","published":"2023-12-05T17:50:55Z","title":"Split & Merge: Unlocking the Potential of Visual Adapters via Sparse\n Training","summary":" With the rapid growth in the scale of pre-trained foundation models,\nparameter-efficient fine-tuning techniques have gained significant attention,\namong which Adapter Tuning is the most widely used. Despite achieving\nefficiency, Adapter Tuning still underperforms full fine-tuning, and the\nperformance improves at the cost of an increase in parameters. Recent efforts\naddress this issue by pruning the original adapters, but it also introduces\ntraining instability and suboptimal performance on certain datasets. Motivated\nby this, we propose Mixture of Sparse Adapters, or MoSA, as a novel Adapter\nTuning method to fully unleash the potential of each parameter in the adapter.\nWe first split the standard adapter into multiple non-overlapping modules, then\nstochastically activate modules for sparse training, and finally merge them to\nform a complete adapter after tuning. In this way, MoSA can achieve\nsignificantly better performance than standard adapters without any additional\ncomputational or storage overhead. Furthermore, we propose a hierarchical\nsparse strategy to better leverage limited training data. Extensive experiments\non a series of 27 visual tasks demonstrate that MoSA consistently outperforms\nother Adapter Tuning methods as well as other baselines by a significant\nmargin. Furthermore, in two challenging scenarios with low-resource and\nmulti-task settings, MoSA achieves satisfactory results, further demonstrating\nthe effectiveness of our design. Our code will be released.\n","authors":["Qizhe Zhang","Bocheng Zou","Ruichuan An","Jiaming Liu","Shanghang Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.02923v1.pdf","comment":"16 pages, 7 figures. Official code:\n https://github.com/Theia-4869/MoSA"},{"id":"http://arxiv.org/abs/2312.02919v1","updated":"2023-12-05T17:47:33Z","published":"2023-12-05T17:47:33Z","title":"Fine-grained Controllable Video Generation via Object Appearance and\n Context","summary":" Text-to-video generation has shown promising results. However, by taking only\nnatural languages as input, users often face difficulties in providing detailed\ninformation to precisely control the model's output. In this work, we propose\nfine-grained controllable video generation (FACTOR) to achieve detailed\ncontrol. 
Specifically, FACTOR aims to control objects' appearances and context,\nincluding their location and category, in conjunction with the text prompt. To\nachieve detailed control, we propose a unified framework to jointly inject\ncontrol signals into the existing text-to-video model. Our model consists of a\njoint encoder and adaptive cross-attention layers. By optimizing the encoder\nand the inserted layer, we adapt the model to generate videos that are aligned\nwith both text prompts and fine-grained control. Compared to existing methods\nrelying on dense control signals such as edge maps, we provide a more intuitive\nand user-friendly interface to allow object-level fine-grained control. Our\nmethod achieves controllability of object appearances without finetuning, which\nreduces the per-subject optimization efforts for the users. Extensive\nexperiments on standard benchmark datasets and user-provided inputs validate\nthat our model obtains a 70% improvement in controllability metrics over\ncompetitive baselines.\n","authors":["Hsin-Ping Huang","Yu-Chuan Su","Deqing Sun","Lu Jiang","Xuhui Jia","Yukun Zhu","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2312.02919v1.pdf","comment":"Project page: https://hhsinping.github.io/factor"},{"id":"http://arxiv.org/abs/2312.02918v1","updated":"2023-12-05T17:47:11Z","published":"2023-12-05T17:47:11Z","title":"Multimodal Prompt Perceiver: Empower Adaptiveness, Generalizability and\n Fidelity for All-in-One Image Restoration","summary":" Despite substantial progress, all-in-one image restoration (IR) grapples with\npersistent challenges in handling intricate real-world degradations. This paper\nintroduces MPerceiver: a novel multimodal prompt learning approach that\nharnesses Stable Diffusion (SD) priors to enhance adaptiveness,\ngeneralizability and fidelity for all-in-one image restoration. Specifically,\nwe develop a dual-branch module to master two types of SD prompts: textual for\nholistic representation and visual for multiscale detail representation. Both\nprompts are dynamically adjusted by degradation predictions from the CLIP image\nencoder, enabling adaptive responses to diverse unknown degradations. Moreover,\na plug-in detail refinement module improves restoration fidelity via direct\nencoder-to-decoder information transformation. To assess our method, MPerceiver\nis trained on 9 tasks for all-in-one IR and outperforms state-of-the-art\ntask-specific methods across most tasks. Post multitask pre-training,\nMPerceiver attains a generalized representation in low-level vision, exhibiting\nremarkable zero-shot and few-shot capabilities in unseen tasks. Extensive\nexperiments on 16 IR tasks and 26 benchmarks underscore the superiority of\nMPerceiver in terms of adaptiveness, generalizability and fidelity.\n","authors":["Yuang Ai","Huaibo Huang","Xiaoqiang Zhou","Jiexiang Wang","Ran He"],"pdf_url":"https://arxiv.org/pdf/2312.02918v1.pdf","comment":"13 pages, 8 figures, 9 tables"},{"id":"http://arxiv.org/abs/2312.02916v1","updated":"2023-12-05T17:46:52Z","published":"2023-12-05T17:46:52Z","title":"MIND: Multi-Task Incremental Network Distillation","summary":" The recent surge in pervasive devices generating dynamic data streams has\nunderscored the necessity for learning systems to adapt to data distributional\nshifts continually. To tackle this challenge, the research community has put\nforth a spectrum of methodologies, including the demanding pursuit of\nclass-incremental learning without replay data. 
In this study, we present MIND,\na parameter isolation method that aims to significantly enhance the performance\nof replay-free solutions and achieve state-of-the-art results on several widely\nstudied datasets. Our approach introduces two main contributions: two\nalternative distillation procedures that significantly improve the efficiency\nof MIND, increasing the accumulated knowledge of each sub-network, and the\noptimization of the BatchNorm layers across tasks inside the sub-networks.\nOverall, MIND outperforms all the state-of-the-art methods for rehearsal-free\nClass-Incremental learning (with an increment in classification accuracy of\napprox. +6% on CIFAR-100/10 and +10% on TinyImageNet/10) reaching up to approx.\n+40% accuracy in Domain-Incremental scenarios. Moreover, we ablated each\ncontribution to demonstrate its impact on performance improvement. Our results\nshowcase the superior performance of MIND, indicating its potential for\naddressing the challenges posed by Class-Incremental and Domain-Incremental\nlearning in resource-constrained environments.\n","authors":["Jacopo Bonato","Francesco Pelosin","Luigi Sabetta","Alessandro Nicolosi"],"pdf_url":"https://arxiv.org/pdf/2312.02916v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01677v2","updated":"2023-12-05T17:46:12Z","published":"2023-12-04T06:59:55Z","title":"Multi-task Image Restoration Guided By Robust DINO Features","summary":" Multi-task image restoration has gained significant interest due to its\ninherent versatility and efficiency compared to its single-task counterpart.\nDespite its potential, performance degradation is observed with an increase in\nthe number of tasks, primarily attributed to the distinct nature of each\nrestoration task. Addressing this challenge, we introduce\n\\mbox{\\textbf{DINO-IR}}, a novel multi-task image restoration approach\nleveraging robust features extracted from DINOv2. Our empirical analysis shows\nthat while shallow features of DINOv2 capture rich low-level image\ncharacteristics, the deep features ensure a robust semantic representation\ninsensitive to degradations while preserving high-frequency contour details.\nBuilding on these features, we devise specialized components, including\nmulti-layer semantic fusion module, DINO-Restore adaption and fusion module,\nand DINO perception contrastive loss, to integrate DINOv2 features into the\nrestoration paradigm. Equipped with the aforementioned components, our DINO-IR\nperforms favorably against existing multi-task image restoration approaches in\nvarious tasks by a large margin, indicating the superiority and necessity of\nreinforcing the robust features for multi-task image restoration.\n","authors":["Xin Lin","Chao Ren","Kelvin C. K. Chan","Lu Qi","Jinshan Pan","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2312.01677v2.pdf","comment":"Some important information need to add"},{"id":"http://arxiv.org/abs/2312.02914v1","updated":"2023-12-05T17:39:19Z","published":"2023-12-05T17:39:19Z","title":"Unsupervised Video Domain Adaptation with Masked Pre-Training and\n Collaborative Self-Training","summary":" In this work, we tackle the problem of unsupervised domain adaptation (UDA)\nfor video action recognition. Our approach, which we call UNITE, uses an image\nteacher model to adapt a video student model to the target domain. 
UNITE first\nemploys self-supervised pre-training to promote discriminative feature learning\non target domain videos using a teacher-guided masked distillation objective.\nWe then perform self-training on masked target data, using the video student\nmodel and image teacher model together to generate improved pseudolabels for\nunlabeled target videos. Our self-training process successfully leverages the\nstrengths of both models to achieve strong transfer performance across domains.\nWe evaluate our approach on multiple video domain adaptation benchmarks and\nobserve significant improvements upon previously reported results.\n","authors":["Arun Reddy","William Paul","Corban Rivera","Ketul Shah","Celso M. de Melo","Rama Chellappa"],"pdf_url":"https://arxiv.org/pdf/2312.02914v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02912v1","updated":"2023-12-05T17:36:34Z","published":"2023-12-05T17:36:34Z","title":"Realistic Scatterer Based Adversarial Attacks on SAR Image Classifiers","summary":" Adversarial attacks have highlighted the vulnerability of classifiers based\non machine learning for Synthetic Aperture Radar (SAR) Automatic Target\nRecognition (ATR) tasks. An adversarial attack perturbs SAR images of on-ground\ntargets such that the classifiers are misled into making incorrect predictions.\nHowever, many existing attacking techniques rely on arbitrary manipulation of\nSAR images while overlooking the feasibility of executing the attacks on\nreal-world SAR imagery. Instead, adversarial attacks should be able to be\nimplemented by physical actions, for example, placing additional false objects\nas scatterers around the on-ground target to perturb the SAR image and fool the\nSAR ATR.\n In this paper, we propose the On-Target Scatterer Attack (OTSA), a\nscatterer-based physical adversarial attack. To ensure the feasibility of its\nphysical execution, we enforce a constraint on the positioning of the\nscatterers. Specifically, we restrict the scatterers to be placed only on the\ntarget instead of in the shadow regions or the background. To achieve this, we\nintroduce a positioning score based on Gaussian kernels and formulate an\noptimization problem for our OTSA attack. Using a gradient ascent method to\nsolve the optimization problem, the OTSA can generate a vector of parameters\ndescribing the positions, shapes, sizes and amplitudes of the scatterers to\nguide the physical execution of the attack that will mislead SAR image\nclassifiers. The experimental results show that our attack obtains\nsignificantly higher success rates under the positioning constraint compared\nwith the existing method.\n","authors":["Tian Ye","Rajgopal Kannan","Viktor Prasanna","Carl Busart","Lance Kaplan"],"pdf_url":"https://arxiv.org/pdf/2312.02912v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02910v1","updated":"2023-12-05T17:36:04Z","published":"2023-12-05T17:36:04Z","title":"Rare Galaxy Classes Identified In Foundation Model Representations","summary":" We identify rare and visually distinctive galaxy populations by searching for\nstructure within the learned representations of pretrained models. We show that\nthese representations arrange galaxies by appearance in patterns beyond those\nneeded to predict the pretraining labels. We design a clustering approach to\nisolate specific local patterns, revealing groups of galaxies with rare and\nscientifically-interesting morphologies.\n","authors":["Mike Walmsley","Anna M. M. 
Scaife"],"pdf_url":"https://arxiv.org/pdf/2312.02910v1.pdf","comment":"Accepted at Machine Learning and the Physical Sciences Workshop,\n NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.02908v1","updated":"2023-12-05T17:30:16Z","published":"2023-12-05T17:30:16Z","title":"Deep Learning Segmentation of Spiral Arms and Bars","summary":" We present the first deep learning model for segmenting galactic spiral arms\nand bars. In a blinded assessment by expert astronomers, our predicted spiral\narm masks are preferred over both current automated methods (99% of\nevaluations) and our original volunteer labels (79% of evaluations). Experts\nrated our spiral arm masks as `mostly good' to `perfect' in 89% of evaluations.\nBar lengths trivially derived from our predicted bar masks are in excellent\nagreement with a dedicated crowdsourcing project. The pixelwise precision of\nour masks, previously impossible at scale, will underpin new research into how\nspiral arms and bars evolve.\n","authors":["Mike Walmsley","Ashley Spindler"],"pdf_url":"https://arxiv.org/pdf/2312.02908v1.pdf","comment":"Accepted at Machine Learning and the Physical Sciences Workshop,\n NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.02902v1","updated":"2023-12-05T17:19:22Z","published":"2023-12-05T17:19:22Z","title":"HeadGaS: Real-Time Animatable Head Avatars via 3D Gaussian Splatting","summary":" 3D head animation has seen major quality and runtime improvements over the\nlast few years, particularly empowered by the advances in differentiable\nrendering and neural radiance fields. Real-time rendering is a highly desirable\ngoal for real-world applications. We propose HeadGaS, the first model to use 3D\nGaussian Splats (3DGS) for 3D head reconstruction and animation. In this paper\nwe introduce a hybrid model that extends the explicit representation from 3DGS\nwith a base of learnable latent features, which can be linearly blended with\nlow-dimensional parameters from parametric head models to obtain\nexpression-dependent final color and opacity values. We demonstrate that\nHeadGaS delivers state-of-the-art results in real-time inference frame rates,\nwhich surpasses baselines by up to ~2dB, while accelerating rendering speed by\nover x10.\n","authors":["Helisa Dhamo","Yinyu Nie","Arthur Moreau","Jifei Song","Richard Shaw","Yiren Zhou","Eduardo Pérez-Pellitero"],"pdf_url":"https://arxiv.org/pdf/2312.02902v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02087v2","updated":"2023-12-05T17:14:25Z","published":"2023-12-04T17:58:06Z","title":"VideoSwap: Customized Video Subject Swapping with Interactive Semantic\n Point Correspondence","summary":" Current diffusion-based video editing primarily focuses on\nstructure-preserved editing by utilizing various dense correspondences to\nensure temporal consistency and motion alignment. However, these approaches are\noften ineffective when the target edit involves a shape change. To embark on\nvideo editing with shape change, we explore customized video subject swapping\nin this work, where we aim to replace the main subject in a source video with a\ntarget subject having a distinct identity and potentially different shape. In\ncontrast to previous methods that rely on dense correspondences, we introduce\nthe VideoSwap framework that exploits semantic point correspondences, inspired\nby our observation that only a small number of semantic points are necessary to\nalign the subject's motion trajectory and modify its shape. 
We also introduce\nvarious user-point interactions (\\eg, removing points and dragging points) to\naddress various semantic point correspondences. Extensive experiments\ndemonstrate state-of-the-art video subject swapping results across a variety of\nreal-world videos.\n","authors":["Yuchao Gu","Yipin Zhou","Bichen Wu","Licheng Yu","Jia-Wei Liu","Rui Zhao","Jay Zhangjie Wu","David Junhao Zhang","Mike Zheng Shou","Kevin Tang"],"pdf_url":"https://arxiv.org/pdf/2312.02087v2.pdf","comment":"Project page at https://videoswap.github.io"},{"id":"http://arxiv.org/abs/2312.02896v1","updated":"2023-12-05T17:06:59Z","published":"2023-12-05T17:06:59Z","title":"BenchLMM: Benchmarking Cross-style Visual Capability of Large Multimodal\n Models","summary":" Large Multimodal Models (LMMs) such as GPT-4V and LLaVA have shown remarkable\ncapabilities in visual reasoning with common image styles. However, their\nrobustness against diverse style shifts, crucial for practical applications,\nremains largely unexplored. In this paper, we propose a new benchmark,\nBenchLMM, to assess the robustness of LMMs against three different styles:\nartistic image style, imaging sensor style, and application style, where each\nstyle has five sub-styles. Utilizing BenchLMM, we comprehensively evaluate\nstate-of-the-art LMMs and reveal: 1) LMMs generally suffer performance\ndegradation when working with other styles; 2) An LMM performing better than\nanother model in a common style does not guarantee its superior performance in\nother styles; 3) LMMs' reasoning capability can be enhanced by prompting LMMs\nto predict the style first, based on which we propose a versatile and\ntraining-free method for improving LMMs; 4) An intelligent LMM is expected to\ninterpret the causes of its errors when facing stylistic variations. We hope\nthat our benchmark and analysis can shed new light on developing more\nintelligent and versatile LMMs.\n","authors":["Rizhao Cai","Zirui Song","Dayan Guan","Zhenhao Chen","Xing Luo","Chenyu Yi","Alex Kot"],"pdf_url":"https://arxiv.org/pdf/2312.02896v1.pdf","comment":"Code is available at https://github.com/AIFEG/BenchGPT"},{"id":"http://arxiv.org/abs/2312.02878v1","updated":"2023-12-05T16:48:17Z","published":"2023-12-05T16:48:17Z","title":"Towards More Practical Group Activity Detection: A New Benchmark and\n Model","summary":" Group activity detection (GAD) is the task of identifying members of each\ngroup and classifying the activity of the group at the same time in a video.\nWhile GAD has been studied recently, there is still much room for improvement\nin both dataset and methodology due to their limited capability to address\npractical GAD scenarios. To resolve these issues, we first present a new\ndataset, dubbed Caf\\'e. Unlike existing datasets, Caf\\'e is constructed\nprimarily for GAD and presents more practical evaluation scenarios and metrics,\nas well as being large-scale and providing rich annotations. Along with the\ndataset, we propose a new GAD model that deals with an unknown number of groups\nand latent group members efficiently and effectively. We evaluated our model on\nthree datasets including Caf\\'e, where it outperformed previous work in terms\nof both accuracy and inference speed. 
Both our dataset and code base will be\nopen to the public to promote future research on GAD.\n","authors":["Dongkeun Kim","Youngkil Song","Minsu Cho","Suha Kwak"],"pdf_url":"https://arxiv.org/pdf/2312.02878v1.pdf","comment":"Project page: https://cvlab.postech.ac.kr/research/CAFE"},{"id":"http://arxiv.org/abs/2312.02877v1","updated":"2023-12-05T16:47:46Z","published":"2023-12-05T16:47:46Z","title":"A Dynamic Network for Efficient Point Cloud Registration","summary":" For the point cloud registration task, a significant challenge arises from\nnon-overlapping points that consume extensive computational resources while\nnegatively affecting registration accuracy. In this paper, we introduce a\ndynamic approach, widely utilized to improve network efficiency in computer\nvision tasks, to the point cloud registration task. We employ an iterative\nregistration process on point cloud data multiple times to identify regions\nwhere matching points cluster, ultimately enabling us to remove noisy points.\nSpecifically, we begin with deep global sampling to perform coarse global\nregistration. Subsequently, we employ the proposed refined node proposal module\nto further narrow down the registration region and perform local registration.\nFurthermore, we utilize a spatial consistency-based classifier to evaluate the\nresults of each registration stage. The model terminates once it reaches\nsufficient confidence, avoiding unnecessary computations. Extended experiments\ndemonstrate that our model significantly reduces time consumption compared to\nother methods with similar results, achieving a speed improvement of over 41%\non indoor dataset (3DMatch) and 33% on outdoor datasets (KITTI) while\nmaintaining competitive registration recall requirements.\n","authors":["Yang Ai","Xi Yang"],"pdf_url":"https://arxiv.org/pdf/2312.02877v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00878v2","updated":"2023-12-05T16:39:31Z","published":"2023-12-01T19:06:12Z","title":"Grounding Everything: Emerging Localization Properties in\n Vision-Language Transformers","summary":" Vision-language foundation models have shown remarkable performance in\nvarious zero-shot settings such as image retrieval, classification, or\ncaptioning. But so far, those models seem to fall behind when it comes to\nzero-shot localization of referential expressions and objects in images. As a\nresult, they need to be fine-tuned for this task. In this paper, we show that\npretrained vision-language (VL) models allow for zero-shot open-vocabulary\nobject localization without any fine-tuning. To leverage those capabilities, we\npropose a Grounding Everything Module (GEM) that generalizes the idea of\nvalue-value attention introduced by CLIPSurgery to a self-self attention path.\nWe show that the concept of self-self attention corresponds to clustering, thus\nenforcing groups of tokens arising from the same object to be similar while\npreserving the alignment with the language space. To further guide the group\nformation, we propose a set of regularizations that allows the model to finally\ngeneralize across datasets and backbones. We evaluate the proposed GEM\nframework on various benchmark tasks and datasets for semantic segmentation. 
It\nshows that GEM not only outperforms other training-free open-vocabulary\nlocalization methods, but also achieves state-of-the-art results on the\nrecently proposed OpenImagesV7 large-scale segmentation benchmark.\n","authors":["Walid Bousselham","Felix Petersen","Vittorio Ferrari","Hilde Kuehne"],"pdf_url":"https://arxiv.org/pdf/2312.00878v2.pdf","comment":"Code available at https://github.com/WalBouss/GEM"},{"id":"http://arxiv.org/abs/2311.18828v3","updated":"2023-12-05T16:08:36Z","published":"2023-11-30T18:59:20Z","title":"One-step Diffusion with Distribution Matching Distillation","summary":" Diffusion models generate high-quality images but require dozens of forward\npasses. We introduce Distribution Matching Distillation (DMD), a procedure to\ntransform a diffusion model into a one-step image generator with minimal impact\non image quality. We enforce the one-step image generator match the diffusion\nmodel at distribution level, by minimizing an approximate KL divergence whose\ngradient can be expressed as the difference between 2 score functions, one of\nthe target distribution and the other of the synthetic distribution being\nproduced by our one-step generator. The score functions are parameterized as\ntwo diffusion models trained separately on each distribution. Combined with a\nsimple regression loss matching the large-scale structure of the multi-step\ndiffusion outputs, our method outperforms all published few-step diffusion\napproaches, reaching 2.62 FID on ImageNet 64x64 and 11.49 FID on zero-shot\nCOCO-30k, comparable to Stable Diffusion but orders of magnitude faster.\nUtilizing FP16 inference, our model generates images at 20 FPS on modern\nhardware.\n","authors":["Tianwei Yin","Michaël Gharbi","Richard Zhang","Eli Shechtman","Fredo Durand","William T. Freeman","Taesung Park"],"pdf_url":"https://arxiv.org/pdf/2311.18828v3.pdf","comment":"Project page: https://tianweiy.github.io/dmd/"},{"id":"http://arxiv.org/abs/2301.03573v2","updated":"2023-12-05T16:05:00Z","published":"2023-01-09T18:50:03Z","title":"Balance is Essence: Accelerating Sparse Training via Adaptive Gradient\n Correction","summary":" Despite impressive performance, deep neural networks require significant\nmemory and computation costs, prohibiting their application in\nresource-constrained scenarios. Sparse training is one of the most common\ntechniques to reduce these costs, however, the sparsity constraints add\ndifficulty to the optimization, resulting in an increase in training time and\ninstability. In this work, we aim to overcome this problem and achieve\nspace-time co-efficiency. To accelerate and stabilize the convergence of sparse\ntraining, we analyze the gradient changes and develop an adaptive gradient\ncorrection method. Specifically, we approximate the correlation between the\ncurrent and previous gradients, which is used to balance the two gradients to\nobtain a corrected gradient. Our method can be used with the most popular\nsparse training pipelines under both standard and adversarial setups.\nTheoretically, we prove that our method can accelerate the convergence rate of\nsparse training. Extensive experiments on multiple datasets, model\narchitectures, and sparsities demonstrate that our method outperforms leading\nsparse training methods by up to \\textbf{5.0\\%} in accuracy given the same\nnumber of training epochs, and reduces the number of training epochs by up to\n\\textbf{52.1\\%} to achieve the same accuracy. 
Our code is available on:\n\\url{https://github.com/StevenBoys/AGENT}.\n","authors":["Bowen Lei","Dongkuan Xu","Ruqi Zhang","Shuren He","Bani K. Mallick"],"pdf_url":"https://arxiv.org/pdf/2301.03573v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15776v2","updated":"2023-12-05T15:57:17Z","published":"2023-11-27T12:51:42Z","title":"Stable Segment Anything Model","summary":" The Segment Anything Model (SAM) achieves remarkable promptable segmentation\ngiven high-quality prompts which, however, often require good skills to\nspecify. To make SAM robust to casual prompts, this paper presents the first\ncomprehensive analysis on SAM's segmentation stability across a diverse\nspectrum of prompt qualities, notably imprecise bounding boxes and insufficient\npoints. Our key finding reveals that given such low-quality prompts, SAM's mask\ndecoder tends to activate image features that are biased towards the background\nor confined to specific object parts. To mitigate this issue, our key idea\nconsists of calibrating solely SAM's mask attention by adjusting the sampling\nlocations and amplitudes of image features, while the original SAM model\narchitecture and weights remain unchanged. Consequently, our deformable\nsampling plugin (DSP) enables SAM to adaptively shift attention to the prompted\ntarget regions in a data-driven manner, facilitated by our effective robust\ntraining strategy (RTS). During inference, dynamic routing plugin (DRP) is\nproposed that toggles SAM between the deformable and regular grid sampling\nmodes, conditioned on the input prompt quality. Thus, our solution, termed\nStable-SAM, offers several advantages: 1) improved SAM's segmentation stability\nacross a wide range of prompt qualities, while 2) retaining SAM's powerful\npromptable segmentation efficiency and generality, with 3) minimal learnable\nparameters (0.08 M) and fast adaptation (by 1 training epoch). Extensive\nexperiments across multiple datasets validate the effectiveness and advantages\nof our approach, underscoring Stable-SAM as a more robust solution for\nsegmenting anything. Codes will be released upon acceptance.\nhttps://github.com/fanq15/Stable-SAM\n","authors":["Qi Fan","Xin Tao","Lei Ke","Mingqiao Ye","Yuan Zhang","Pengfei Wan","Zhongyuan Wang","Yu-Wing Tai","Chi-Keung Tang"],"pdf_url":"https://arxiv.org/pdf/2311.15776v2.pdf","comment":"Smaller file size for the easy access. Codes will be released upon\n acceptance. https://github.com/fanq15/Stable-SAM"},{"id":"http://arxiv.org/abs/2312.02843v1","updated":"2023-12-05T15:53:24Z","published":"2023-12-05T15:53:24Z","title":"Are Vision Transformers More Data Hungry Than Newborn Visual Systems?","summary":" Vision transformers (ViTs) are top performing models on many computer vision\nbenchmarks and can accurately predict human behavior on object recognition\ntasks. However, researchers question the value of using ViTs as models of\nbiological learning because ViTs are thought to be more data hungry than\nbrains, with ViTs requiring more training data to reach similar levels of\nperformance. To test this assumption, we directly compared the learning\nabilities of ViTs and animals, by performing parallel controlled rearing\nexperiments on ViTs and newborn chicks. We first raised chicks in impoverished\nvisual environments containing a single object, then simulated the training\ndata available in those environments by building virtual animal chambers in a\nvideo game engine. 
We recorded the first-person images acquired by agents\nmoving through the virtual chambers and used those images to train self\nsupervised ViTs that leverage time as a teaching signal, akin to biological\nvisual systems. When ViTs were trained through the eyes of newborn chicks, the\nViTs solved the same view invariant object recognition tasks as the chicks.\nThus, ViTs were not more data hungry than newborn visual systems: both learned\nview invariant object representations in impoverished visual environments. The\nflexible and generic attention based learning mechanism in ViTs combined with\nthe embodied data streams available to newborn animals appears sufficient to\ndrive the development of animal-like object recognition.\n","authors":["Lalit Pandey","Samantha M. W. Wood","Justin N. Wood"],"pdf_url":"https://arxiv.org/pdf/2312.02843v1.pdf","comment":"Accepted in Thirty-seventh Conference on Neural Information\n Processing Systems (NeurIPS 2023)"},{"id":"http://arxiv.org/abs/2312.02821v1","updated":"2023-12-05T15:06:04Z","published":"2023-12-05T15:06:04Z","title":"RotaTR: Detection Transformer for Dense and Rotated Object","summary":" Detecting the objects in dense and rotated scenes is a challenging task.\nRecent works on this topic are mostly based on Faster RCNN or Retinanet. As\nthey are highly dependent on the pre-set dense anchors and the NMS operation,\nthe approach is indirect and suboptimal.The end-to-end DETR-based detectors\nhave achieved great success in horizontal object detection and many other areas\nlike segmentation, tracking, action recognition and etc.However, the DETR-based\ndetectors perform poorly on dense rotated target tasks and perform worse than\nmost modern CNN-based detectors. In this paper, we find the most significant\nreason for the poor performance is that the original attention can not\naccurately focus on the oriented targets. Accordingly, we propose Rotated\nobject detection TRansformer (RotaTR) as an extension of DETR to oriented\ndetection. Specifically, we design Rotation Sensitive deformable (RSDeform)\nattention to enhance the DETR's ability to detect oriented targets. It is used\nto build the feature alignment module and rotation-sensitive decoder for our\nmodel. We test RotaTR on four challenging-oriented benchmarks. It shows a great\nadvantage in detecting dense and oriented objects compared to the original\nDETR. It also achieves competitive results when compared to the\nstate-of-the-art.\n","authors":["Zhu Yuke","Ruan Yumeng","Yang Lei","Guo Sheng"],"pdf_url":"https://arxiv.org/pdf/2312.02821v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.19535v2","updated":"2023-12-05T15:06:02Z","published":"2023-10-30T13:43:19Z","title":"Revitalizing Legacy Video Content: Deinterlacing with Bidirectional\n Information Propagation","summary":" Due to old CRT display technology and limited transmission bandwidth, early\nfilm and TV broadcasts commonly used interlaced scanning. This meant each field\ncontained only half of the information. Since modern displays require full\nframes, this has spurred research into deinterlacing, i.e. restoring the\nmissing information in legacy video content. In this paper, we present a\ndeep-learning-based method for deinterlacing animated and live-action content.\nOur proposed method supports bidirectional spatio-temporal information\npropagation across multiple scales to leverage information in both space and\ntime. 
More specifically, we design a Flow-guided Refinement Block (FRB) which\nperforms feature refinement including alignment, fusion, and rectification.\nAdditionally, our method can process multiple fields simultaneously, reducing\nper-frame processing time, and potentially enabling real-time processing. Our\nexperimental results demonstrate that our proposed method achieves superior\nperformance compared to existing methods.\n","authors":["Zhaowei Gao","Mingyang Song","Christopher Schroers","Yang Zhang"],"pdf_url":"https://arxiv.org/pdf/2310.19535v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02819v1","updated":"2023-12-05T15:03:15Z","published":"2023-12-05T15:03:15Z","title":"Deterministic Guidance Diffusion Model for Probabilistic Weather\n Forecasting","summary":" Weather forecasting requires not only accuracy but also the ability to\nperform probabilistic prediction. However, deterministic weather forecasting\nmethods do not support probabilistic predictions, and conversely, probabilistic\nmodels tend to be less accurate. To address these challenges, in this paper, we\nintroduce the \\textbf{\\textit{D}}eterministic \\textbf{\\textit{G}}uidance\n\\textbf{\\textit{D}}iffusion \\textbf{\\textit{M}}odel (DGDM) for probabilistic\nweather forecasting, integrating benefits of both deterministic and\nprobabilistic approaches. During the forward process, both the deterministic\nand probabilistic models are trained end-to-end. In the reverse process,\nweather forecasting leverages the predicted result from the deterministic\nmodel, using as an intermediate starting point for the probabilistic model. By\nfusing deterministic models with probabilistic models in this manner, DGDM is\ncapable of providing accurate forecasts while also offering probabilistic\npredictions. To evaluate DGDM, we assess it on the global weather forecasting\ndataset (WeatherBench) and the common video frame prediction benchmark (Moving\nMNIST). We also introduce and evaluate the Pacific Northwest Windstorm\n(PNW)-Typhoon weather satellite dataset to verify the effectiveness of DGDM in\nhigh-resolution regional forecasting. As a result of our experiments, DGDM\nachieves state-of-the-art results not only in global forecasting but also in\nregional forecasting. The code is available at:\n\\url{https://github.com/DongGeun-Yoon/DGDM}.\n","authors":["Donggeun Yoon","Minseok Seo","Doyi Kim","Yeji Choi","Donghyeon Cho"],"pdf_url":"https://arxiv.org/pdf/2312.02819v1.pdf","comment":"16 pages"},{"id":"http://arxiv.org/abs/2312.02813v1","updated":"2023-12-05T14:56:55Z","published":"2023-12-05T14:56:55Z","title":"BIVDiff: A Training-Free Framework for General-Purpose Video Synthesis\n via Bridging Image and Video Diffusion Models","summary":" Diffusion models have made tremendous progress in text-driven image and video\ngeneration. Now text-to-image foundation models are widely applied to various\ndownstream image synthesis tasks, such as controllable image generation and\nimage editing, while downstream video synthesis tasks are less explored for\nseveral reasons. First, it requires huge memory and compute overhead to train a\nvideo generation foundation model. Even with video foundation models,\nadditional costly training is still required for downstream video synthesis\ntasks. Second, although some works extend image diffusion models into videos in\na training-free manner, temporal consistency cannot be well kept. 
Finally,\nthese adaption methods are specifically designed for one task and fail to\ngeneralize to different downstream video synthesis tasks. To mitigate these\nissues, we propose a training-free general-purpose video synthesis framework,\ncoined as BIVDiff, via bridging specific image diffusion models and general\ntext-to-video foundation diffusion models. Specifically, we first use an image\ndiffusion model (like ControlNet, Instruct Pix2Pix) for frame-wise video\ngeneration, then perform Mixed Inversion on the generated video, and finally\ninput the inverted latents into the video diffusion model for temporal\nsmoothing. Decoupling image and video models enables flexible image model\nselection for different purposes, which endows the framework with strong task\ngeneralization and high efficiency. To validate the effectiveness and general\nuse of BIVDiff, we perform a wide range of video generation tasks, including\ncontrollable video generation video editing, video inpainting and outpainting.\nOur project page is available at https://bivdiff.github.io.\n","authors":["Fengyuan Shi","Jiaxi Gu","Hang Xu","Songcen Xu","Wei Zhang","Limin Wang"],"pdf_url":"https://arxiv.org/pdf/2312.02813v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.06467v2","updated":"2023-12-05T14:42:31Z","published":"2022-02-14T03:01:05Z","title":"NeuroMixGDP: A Neural Collapse-Inspired Random Mixup for Private Data\n Release","summary":" Privacy-preserving data release algorithms have gained increasing attention\nfor their ability to protect user privacy while enabling downstream machine\nlearning tasks. However, the utility of current popular algorithms is not\nalways satisfactory. Mixup of raw data provides a new way of data augmentation,\nwhich can help improve utility. However, its performance drastically\ndeteriorates when differential privacy (DP) noise is added. To address this\nissue, this paper draws inspiration from the recently observed Neural Collapse\n(NC) phenomenon, which states that the last layer features of a neural network\nconcentrate on the vertices of a simplex as Equiangular Tight Frame (ETF). We\npropose a scheme to mixup the Neural Collapse features to exploit the ETF\nsimplex structure and release noisy mixed features to enhance the utility of\nthe released data. By using Gaussian Differential Privacy (GDP), we obtain an\nasymptotic rate for the optimal mixup degree. To further enhance the utility\nand address the label collapse issue when the mixup degree is large, we propose\na Hierarchical sampling method to stratify the mixup samples on a small number\nof classes. This method remarkably improves utility when the number of classes\nis large. Extensive experiments demonstrate the effectiveness of our proposed\nmethod in protecting against attacks and improving utility. In particular, our\napproach shows significantly improved utility compared to directly training\nclassification networks with DPSGD on CIFAR100 and MiniImagenet datasets,\nhighlighting the benefits of using privacy-preserving data release. 
We release\nreproducible code in https://github.com/Lidonghao1996/NeuroMixGDP.\n","authors":["Donghao Li","Yang Cao","Yuan Yao"],"pdf_url":"https://arxiv.org/pdf/2202.06467v2.pdf","comment":"28 pages, 9 figures"},{"id":"http://arxiv.org/abs/2312.02781v1","updated":"2023-12-05T14:12:38Z","published":"2023-12-05T14:12:38Z","title":"PMMTalk: Speech-Driven 3D Facial Animation from Complementary Pseudo\n Multi-modal Features","summary":" Speech-driven 3D facial animation has improved a lot recently while most\nrelated works only utilize acoustic modality and neglect the influence of\nvisual and textual cues, leading to unsatisfactory results in terms of\nprecision and coherence. We argue that visual and textual cues are not trivial\ninformation. Therefore, we present a novel framework, namely PMMTalk, using\ncomplementary Pseudo Multi-Modal features for improving the accuracy of facial\nanimation. The framework entails three modules: PMMTalk encoder, cross-modal\nalignment module, and PMMTalk decoder. Specifically, the PMMTalk encoder\nemploys the off-the-shelf talking head generation architecture and speech\nrecognition technology to extract visual and textual information from speech,\nrespectively. Subsequently, the cross-modal alignment module aligns the\naudio-image-text features at temporal and semantic levels. Then PMMTalk decoder\nis employed to predict lip-syncing facial blendshape coefficients. Contrary to\nprior methods, PMMTalk only requires an additional random reference face image\nbut yields more accurate results. Additionally, it is artist-friendly as it\nseamlessly integrates into standard animation production workflows by\nintroducing facial blendshape coefficients. Finally, given the scarcity of 3D\ntalking face datasets, we introduce a large-scale 3D Chinese Audio-Visual\nFacial Animation (3D-CAVFA) dataset. Extensive experiments and user studies\nshow that our approach outperforms the state of the art. We recommend watching\nthe supplementary video.\n","authors":["Tianshun Han","Shengnan Gui","Yiqing Huang","Baihui Li","Lijian Liu","Benjia Zhou","Ning Jiang","Quan Lu","Ruicong Zhi","Yanyan Liang","Du Zhang","Jun Wan"],"pdf_url":"https://arxiv.org/pdf/2312.02781v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02772v1","updated":"2023-12-05T14:01:43Z","published":"2023-12-05T14:01:43Z","title":"Generating Fine-Grained Human Motions Using ChatGPT-Refined Descriptions","summary":" Recently, significant progress has been made in text-based motion generation,\nenabling the generation of diverse and high-quality human motions that conform\nto textual descriptions. However, it remains challenging to generate\nfine-grained or stylized motions due to the lack of datasets annotated with\ndetailed textual descriptions. By adopting a divide-and-conquer strategy, we\npropose a new framework named Fine-Grained Human Motion Diffusion Model\n(FG-MDM) for human motion generation. Specifically, we first parse previous\nvague textual annotation into fine-grained description of different body parts\nby leveraging a large language model (GPT-3.5). We then use these fine-grained\ndescriptions to guide a transformer-based diffusion model. FG-MDM can generate\nfine-grained and stylized motions even outside of the distribution of the\ntraining data. Our experimental results demonstrate the superiority of FG-MDM\nover previous methods, especially the strong generalization capability. 
We will\nrelease our fine-grained textual annotations for HumanML3D and KIT.\n","authors":["Xu Shi","Chuanchen Luo","Junran Peng","Hongwen Zhang","Yunlian Sun"],"pdf_url":"https://arxiv.org/pdf/2312.02772v1.pdf","comment":"Project Page: https://sx0207.github.io/fg-mdm/"},{"id":"http://arxiv.org/abs/2305.15583v4","updated":"2023-12-05T13:48:12Z","published":"2023-05-24T21:39:27Z","title":"Alleviating Exposure Bias in Diffusion Models through Sampling with\n Shifted Time Steps","summary":" Diffusion Probabilistic Models (DPM) have shown remarkable efficacy in the\nsynthesis of high-quality images. However, their inference process\ncharacteristically requires numerous, potentially hundreds, of iterative steps,\nwhich could exaggerate the problem of exposure bias due to the training and\ninference discrepancy. Previous work has attempted to mitigate this issue by\nperturbing inputs during training, which consequently mandates the retraining\nof the DPM. In this work, we conduct a systematic study of exposure bias in DPM\nand, intriguingly, we find that the exposure bias could be alleviated with a\nnovel sampling method that we propose, without retraining the model. We\nempirically and theoretically show that, during inference, for each backward\ntime step $t$ and corresponding state $\hat{x}_t$, there might exist another\ntime step $t_s$ which exhibits superior coupling with $\hat{x}_t$. Based on\nthis finding, we introduce a sampling method named Time-Shift Sampler. Our\nframework can be seamlessly integrated into existing sampling algorithms, such as\nDDPM, DDIM and other high-order solvers, inducing merely minimal additional\ncomputations. Experimental results show our method brings significant and\nconsistent improvements in FID scores on different datasets and sampling\nmethods. For example, integrating Time-Shift Sampler to F-PNDM yields a\nFID=3.88, achieving 44.49\% improvements as compared to F-PNDM, on CIFAR-10\nwith 10 sampling steps, which is more performant than the vanilla DDIM with 100\nsampling steps. We will release the code upon acceptance.\n","authors":["Mingxiao Li","Tingyu Qu","Ruicong Yao","Wei Sun","Marie-Francine Moens"],"pdf_url":"https://arxiv.org/pdf/2305.15583v4.pdf","comment":"added new results & proof"},{"id":"http://arxiv.org/abs/2312.02762v1","updated":"2023-12-05T13:44:25Z","published":"2023-12-05T13:44:25Z","title":"Learning Cortical Anomaly through Masked Encoding for Unsupervised\n Heterogeneity Mapping","summary":" The detection of heterogeneous mental disorders based on brain readouts\nremains challenging due to the complexity of symptoms and the absence of\nreliable biomarkers. This paper introduces CAM (Cortical Anomaly Detection\nthrough Masked Image Modeling), a novel self-supervised framework designed for\nthe unsupervised detection of complex brain disorders using cortical surface\nfeatures. We employ this framework for the detection of individuals on the\npsychotic spectrum and demonstrate its capabilities compared to state-of-the-art\nmethods, achieving an AUC of 0.696 for Schizoaffective and 0.769 for\nSchizophreniform, without the need for any labels. Furthermore, the analysis of\natypical cortical regions includes Pars Triangularis and several frontal areas,\noften implicated in schizophrenia, providing further confidence in our approach.\nAltogether, we demonstrate a scalable approach for anomaly detection of complex\nbrain disorders based on cortical abnormalities.\n","authors":["Hao-Chun Yang","Ole Andreassen","Lars Tjelta Westlye","Andre F. 
Marquand","Christian F. Beckmann","Thomas Wolfers"],"pdf_url":"https://arxiv.org/pdf/2312.02762v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02753v1","updated":"2023-12-05T13:28:59Z","published":"2023-12-05T13:28:59Z","title":"C3: High-performance and low-complexity neural compression from a single\n image or video","summary":" Most neural compression models are trained on large datasets of images or\nvideos in order to generalize to unseen data. Such generalization typically\nrequires large and expressive architectures with a high decoding complexity.\nHere we introduce C3, a neural compression method with strong rate-distortion\n(RD) performance that instead overfits a small model to each image or video\nseparately. The resulting decoding complexity of C3 can be an order of\nmagnitude lower than neural baselines with similar RD performance. C3 builds on\nCOOL-CHIC (Ladune et al.) and makes several simple and effective improvements\nfor images. We further develop new methodology to apply C3 to videos. On the\nCLIC2020 image benchmark, we match the RD performance of VTM, the reference\nimplementation of the H.266 codec, with less than 3k MACs/pixel for decoding.\nOn the UVG video benchmark, we match the RD performance of the Video\nCompression Transformer (Mentzer et al.), a well-established neural video\ncodec, with less than 5k MACs/pixel for decoding.\n","authors":["Hyunjik Kim","Matthias Bauer","Lucas Theis","Jonathan Richard Schwarz","Emilien Dupont"],"pdf_url":"https://arxiv.org/pdf/2312.02753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02751v1","updated":"2023-12-05T13:27:12Z","published":"2023-12-05T13:27:12Z","title":"C-NERF: Representing Scene Changes as Directional Consistency\n Difference-based NeRF","summary":" In this work, we aim to detect the changes caused by object variations in a\nscene represented by the neural radiance fields (NeRFs). Given an arbitrary\nview and two sets of scene images captured at different timestamps, we can\npredict the scene changes in that view, which has significant potential\napplications in scene monitoring and measuring. We conducted preliminary\nstudies and found that such an exciting task cannot be easily achieved by\nutilizing existing NeRFs and 2D change detection methods with many false or\nmissing detections. The main reason is that the 2D change detection is based on\nthe pixel appearance difference between spatial-aligned image pairs and\nneglects the stereo information in the NeRF. To address the limitations, we\npropose the C-NERF to represent scene changes as directional consistency\ndifference-based NeRF, which mainly contains three modules. We first perform\nthe spatial alignment of two NeRFs captured before and after changes. Then, we\nidentify the change points based on the direction-consistent constraint; that\nis, real change points have similar change representations across view\ndirections, but fake change points do not. Finally, we design the change map\nrendering process based on the built NeRFs and can generate the change map of\nan arbitrarily specified view direction. To validate the effectiveness, we\nbuild a new dataset containing ten scenes covering diverse scenarios with\ndifferent changing objects. 
Our approach surpasses state-of-the-art 2D change\ndetection and NeRF-based methods by a significant margin.\n","authors":["Rui Huang","Binbin Jiang","Qingyi Zhao","William Wang","Yuxiang Zhang","Qing Guo"],"pdf_url":"https://arxiv.org/pdf/2312.02751v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08529v2","updated":"2023-12-05T13:22:57Z","published":"2023-10-12T17:22:24Z","title":"GaussianDreamer: Fast Generation from Text to 3D Gaussians by Bridging\n 2D and 3D Diffusion Models","summary":" In recent times, the generation of 3D assets from text prompts has shown\nimpressive results. Both 2D and 3D diffusion models can help generate decent 3D\nobjects based on prompts. 3D diffusion models have good 3D consistency, but\ntheir quality and generalization are limited as trainable 3D data is expensive\nand hard to obtain. 2D diffusion models enjoy strong abilities of\ngeneralization and fine generation, but 3D consistency is hard to guarantee.\nThis paper attempts to bridge the power from the two types of diffusion models\nvia the recent explicit and efficient 3D Gaussian splatting representation. A\nfast 3D object generation framework, named as GaussianDreamer, is proposed,\nwhere the 3D diffusion model provides priors for initialization and the 2D\ndiffusion model enriches the geometry and appearance. Operations of noisy point\ngrowing and color perturbation are introduced to enhance the initialized\nGaussians. Our GaussianDreamer can generate a high-quality 3D instance or 3D\navatar within 15 minutes on one GPU, much faster than previous methods, while\nthe generated instances can be directly rendered in real time. Demos and code\nare available at https://taoranyi.com/gaussiandreamer/.\n","authors":["Taoran Yi","Jiemin Fang","Junjie Wang","Guanjun Wu","Lingxi Xie","Xiaopeng Zhang","Wenyu Liu","Qi Tian","Xinggang Wang"],"pdf_url":"https://arxiv.org/pdf/2310.08529v2.pdf","comment":"Project page: https://taoranyi.com/gaussiandreamer/"},{"id":"http://arxiv.org/abs/2309.04331v2","updated":"2023-12-05T12:50:54Z","published":"2023-09-08T13:55:16Z","title":"Leveraging Model Fusion for Improved License Plate Recognition","summary":" License Plate Recognition (LPR) plays a critical role in various\napplications, such as toll collection, parking management, and traffic law\nenforcement. Although LPR has witnessed significant advancements through the\ndevelopment of deep learning, there has been a noticeable lack of studies\nexploring the potential improvements in results by fusing the outputs from\nmultiple recognition models. This research aims to fill this gap by\ninvestigating the combination of up to 12 different models using\nstraightforward approaches, such as selecting the most confident prediction or\nemploying majority vote-based strategies. Our experiments encompass a wide\nrange of datasets, revealing substantial benefits of fusion approaches in both\nintra- and cross-dataset setups. Essentially, fusing multiple models reduces\nconsiderably the likelihood of obtaining subpar performance on a particular\ndataset/scenario. We also found that combining models based on their speed is\nan appealing approach. Specifically, for applications where the recognition\ntask can tolerate some additional time, though not excessively, an effective\nstrategy is to combine 4-6 models. These models may not be the most accurate\nindividually, but their fusion strikes an optimal balance between speed and\naccuracy.\n","authors":["Rayson Laroca","Luiz A. 
Zanlorensi","Valter Estevam","Rodrigo Minetto","David Menotti"],"pdf_url":"https://arxiv.org/pdf/2309.04331v2.pdf","comment":"Accepted for presentation at the Iberoamerican Congress on Pattern\n Recognition (CIARP) 2023"},{"id":"http://arxiv.org/abs/2312.02725v1","updated":"2023-12-05T12:42:37Z","published":"2023-12-05T12:42:37Z","title":"R3D-SWIN:Use Shifted Window Attention for Single-View 3D Reconstruction","summary":" Recently, vision transformers have performed well in various computer vision\ntasks, including voxel 3D reconstruction. However, the windows of the vision\ntransformer are not multi-scale, and there is no connection between the\nwindows, which limits the accuracy of voxel 3D reconstruction . Therefore, we\npropose a shifted windows attention voxel 3D reconstruction network. To the\nbest of our knowledge, this is the first work to apply shifted window attention\nto voxel 3D reconstruction. Experimental results on ShapeNet verify our method\nachieves SOTA accuracy in single-view reconstruction.\n","authors":["Chenhuan Li","Meihua Xiao","zehuan li","Mengxi Gao"],"pdf_url":"https://arxiv.org/pdf/2312.02725v1.pdf","comment":"4 pages,3 figures,5 tables"},{"id":"http://arxiv.org/abs/2302.02858v3","updated":"2023-12-05T12:33:52Z","published":"2023-02-06T15:25:50Z","title":"TR3D: Towards Real-Time Indoor 3D Object Detection","summary":" Recently, sparse 3D convolutions have changed 3D object detection. Performing\non par with the voting-based approaches, 3D CNNs are memory-efficient and scale\nto large scenes better. However, there is still room for improvement. With a\nconscious, practice-oriented approach to problem-solving, we analyze the\nperformance of such methods and localize the weaknesses. Applying modifications\nthat resolve the found issues one by one, we end up with TR3D: a fast\nfully-convolutional 3D object detection model trained end-to-end, that achieves\nstate-of-the-art results on the standard benchmarks, ScanNet v2, SUN RGB-D, and\nS3DIS. Moreover, to take advantage of both point cloud and RGB inputs, we\nintroduce an early fusion of 2D and 3D features. We employ our fusion module to\nmake conventional 3D object detection methods multimodal and demonstrate an\nimpressive boost in performance. Our model with early feature fusion, which we\nrefer to as TR3D+FF, outperforms existing 3D object detection approaches on the\nSUN RGB-D dataset. Overall, besides being accurate, both TR3D and TR3D+FF\nmodels are lightweight, memory-efficient, and fast, thereby marking another\nmilestone on the way toward real-time 3D object detection. Code is available at\nhttps://github.com/SamsungLabs/tr3d .\n","authors":["Danila Rukhovich","Anna Vorontsova","Anton Konushin"],"pdf_url":"https://arxiv.org/pdf/2302.02858v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01003v2","updated":"2023-12-05T12:26:16Z","published":"2023-12-02T02:28:07Z","title":"Self-Evolving Neural Radiance Fields","summary":" Recently, neural radiance field (NeRF) has shown remarkable performance in\nnovel view synthesis and 3D reconstruction. However, it still requires abundant\nhigh-quality images, limiting its applicability in real-world scenarios. To\novercome this limitation, recent works have focused on training NeRF only with\nsparse viewpoints by giving additional regularizations, often called few-shot\nNeRF. We observe that due to the under-constrained nature of the task, solely\nusing additional regularization is not enough to prevent the model from\noverfitting to sparse viewpoints. 
In this paper, we propose a novel framework,\ndubbed Self-Evolving Neural Radiance Fields (SE-NeRF), that applies a\nself-training framework to NeRF to address these problems. We formulate\nfew-shot NeRF into a teacher-student framework to guide the network to learn a\nmore robust representation of the scene by training the student with additional\npseudo labels generated from the teacher. By distilling ray-level pseudo labels\nusing distinct distillation schemes for reliable and unreliable rays obtained\nwith our novel reliability estimation method, we enable NeRF to learn a more\naccurate and robust geometry of the 3D scene. We show and evaluate that\napplying our self-training framework to existing models improves the quality of\nthe rendered images and achieves state-of-the-art performance in multiple\nsettings.\n","authors":["Jaewoo Jung","Jisang Han","Jiwon Kang","Seongchan Kim","Min-Seop Kwak","Seungryong Kim"],"pdf_url":"https://arxiv.org/pdf/2312.01003v2.pdf","comment":"34 pages, 21 figures Our project page can be found at :\n https://ku-cvlab.github.io/SE-NeRF/"},{"id":"http://arxiv.org/abs/2306.11300v4","updated":"2023-12-05T12:23:36Z","published":"2023-06-20T05:30:59Z","title":"RS5M: A Large Scale Vision-Language Dataset for Remote Sensing\n Vision-Language Foundation Model","summary":" Pre-trained Vision-Language Models (VLMs) utilizing extensive image-text\npaired data have demonstrated unprecedented image-text association\ncapabilities, achieving remarkable results across various downstream tasks. A\ncritical challenge is how to make use of existing large-scale pre-trained VLMs,\nwhich are trained on common objects, to perform the domain-specific transfer\nfor accomplishing domain-related downstream tasks. In this paper, we propose a\nnew framework that includes the Domain pre-trained Vision-Language Model\n(DVLM), bridging the gap between the General Vision-Language Model (GVLM) and\ndomain-specific downstream tasks. Moreover, we present an image-text paired dataset in the field of remote\nsensing (RS), RS5M, which has 5 million RS images with English descriptions.\nThe dataset is obtained from filtering publicly available image-text paired\ndatasets and captioning label-only RS datasets with pre-trained VLM. These\nconstitute the first large-scale RS image-text paired dataset. Additionally, we\nfine-tuned the CLIP model and tried several Parameter-Efficient Fine-Tuning\nmethods on RS5M to implement the DVLM. Experimental results show that our\nproposed dataset is highly effective for various tasks, and our model GeoRSCLIP\nimproves upon the baseline or previous state-of-the-art model by $3\\%\\sim20\\%$\nin Zero-shot Classification (ZSC), $3\\%\\sim6\\%$ in Remote Sensing Cross-Modal\nText-Image Retrieval (RSCTIR) and $4\\%\\sim5\\%$ in Semantic Localization (SeLo)\ntasks. 
Dataset and models have been released in:\n\\url{https://github.com/om-ai-lab/RS5M}.\n","authors":["Zilun Zhang","Tiancheng Zhao","Yulong Guo","Jianwei Yin"],"pdf_url":"https://arxiv.org/pdf/2306.11300v4.pdf","comment":"RS5M dataset v5"},{"id":"http://arxiv.org/abs/2302.08272v3","updated":"2023-12-05T12:18:54Z","published":"2023-02-16T13:04:59Z","title":"Revisiting Hidden Representations in Transfer Learning for Medical\n Imaging","summary":" While a key component to the success of deep learning is the availability of\nmassive amounts of training data, medical image datasets are often limited in\ndiversity and size. Transfer learning has the potential to bridge the gap\nbetween related yet different domains. For medical applications, however, it\nremains unclear whether it is more beneficial to pre-train on natural or\nmedical images. We aim to shed light on this problem by comparing\ninitialization on ImageNet and RadImageNet on seven medical classification\ntasks. Our work includes a replication study, which yields results contrary to\npreviously published findings. In our experiments, ResNet50 models pre-trained\non ImageNet tend to outperform those trained on RadImageNet. To gain further\ninsights, we investigate the learned representations using Canonical\nCorrelation Analysis (CCA) and compare the predictions of the different models.\nOur results indicate that, contrary to intuition, ImageNet and RadImageNet may\nconverge to distinct intermediate representations, which appear to diverge\nfurther during fine-tuning. Despite these distinct representations, the\npredictions of the models remain similar. Our findings show that the similarity\nbetween networks before and after fine-tuning does not correlate with\nperformance gains, suggesting that the advantages of transfer learning might\nnot solely originate from the reuse of features in the early layers of a\nconvolutional neural network.\n","authors":["Dovile Juodelyte","Amelia Jiménez-Sánchez","Veronika Cheplygina"],"pdf_url":"https://arxiv.org/pdf/2302.08272v3.pdf","comment":"Published in TMLR"},{"id":"http://arxiv.org/abs/2312.02111v2","updated":"2023-12-05T12:18:25Z","published":"2023-12-04T18:43:45Z","title":"TriDeNT: Triple Deep Network Training for Privileged Knowledge\n Distillation in Histopathology","summary":" Computational pathology models rarely utilise data that will not be available\nfor inference. This means most models cannot learn from highly informative data\nsuch as additional immunohistochemical (IHC) stains and spatial\ntranscriptomics. We present TriDeNT, a novel self-supervised method for\nutilising privileged data that is not available during inference to improve\nperformance. We demonstrate the efficacy of this method for a range of\ndifferent paired data including immunohistochemistry, spatial transcriptomics\nand expert nuclei annotations. In all settings, TriDeNT outperforms other\nstate-of-the-art methods in downstream tasks, with observed improvements of up\nto 101%. 
Furthermore, we provide qualitative and quantitative measurements of\nthe features learned by these models and how they differ from baselines.\nTriDeNT offers a novel method to distil knowledge from scarce or costly data\nduring training, to create significantly better models for routine inputs.\n","authors":["Lucas Farndale","Robert Insall","Ke Yuan"],"pdf_url":"https://arxiv.org/pdf/2312.02111v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02705v1","updated":"2023-12-05T12:07:27Z","published":"2023-12-05T12:07:27Z","title":"Unified learning-based lossy and lossless JPEG recompression","summary":" JPEG is still the most widely used image compression algorithm. Most image\ncompression algorithms only consider the uncompressed original image, while\nignoring a large number of already existing JPEG images. Recently, JPEG\nrecompression approaches have been proposed to further reduce the size of JPEG\nfiles. However, those methods only consider JPEG lossless recompression, which\nis just a special case of the rate-distortion theorem. In this paper, we\npropose a unified lossy and lossless JPEG recompression framework, which\nconsists of a learned quantization table and Markovian hierarchical variational\nautoencoders. Experiments show that our method can achieve arbitrarily low\ndistortion when the bitrate is close to the upper bound, namely the bitrate of\nthe lossless compression model. To the best of our knowledge, this is the first\nlearned method that bridges the gap between lossy and lossless recompression of\nJPEG images.\n","authors":["Jianghui Zhang","Yuanyuan Wang","Lina Guo","Jixiang Luo","Tongda Xu","Yan Wang","Zhi Wang","Hongwei Qin"],"pdf_url":"https://arxiv.org/pdf/2312.02705v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02703v1","updated":"2023-12-05T12:05:01Z","published":"2023-12-05T12:05:01Z","title":"MyPortrait: Morphable Prior-Guided Personalized Portrait Generation","summary":" Generating realistic talking faces is an interesting and long-standing topic\nin the field of computer vision. Although significant progress has been made,\nit is still challenging to generate high-quality dynamic faces with\npersonalized details. This is mainly due to the inability of the general model\nto represent personalized details and the generalization problem to unseen\ncontrollable parameters. In this work, we propose MyPortrait, a simple,\ngeneral, and flexible framework for neural portrait generation. We incorporate\npersonalized prior in a monocular video and morphable prior in 3D face\nmorphable space for generating personalized details under novel controllable\nparameters. Our proposed framework supports both video-driven and audio-driven\nface animation given a monocular video of a single person. Depending on\nwhether the test data is used during training, our method provides a\nreal-time online version and a high-quality offline version. Comprehensive\nexperiments across various metrics demonstrate the superior performance of our\nmethod over the state-of-the-art methods. The code will be publicly available.\n","authors":["Bo Ding","Zhenfeng Fan","Shuang Yang","Shihong Xia"],"pdf_url":"https://arxiv.org/pdf/2312.02703v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02702v1","updated":"2023-12-05T12:04:34Z","published":"2023-12-05T12:04:34Z","title":"Neural Sign Actors: A diffusion model for 3D sign language production\n from text","summary":" Sign Languages (SL) serve as the predominant mode of communication for the\nDeaf and Hard of Hearing communities. 
The advent of deep learning has aided\nnumerous methods in SL recognition and translation, achieving remarkable\nresults. However, Sign Language Production (SLP) poses a challenge for the\ncomputer vision community as the motions generated must be realistic and have\nprecise semantic meanings. Most SLP methods rely on 2D data, thus impeding\ntheir ability to attain a necessary level of realism. In this work, we propose\na diffusion-based SLP model trained on a curated large-scale dataset of 4D\nsigning avatars and their corresponding text transcripts. The proposed method\ncan generate dynamic sequences of 3D avatars from an unconstrained domain of\ndiscourse using a diffusion process formed on a novel and anatomically informed\ngraph neural network defined on the SMPL-X body skeleton. Through a series of\nquantitative and qualitative experiments, we show that the proposed method\nconsiderably outperforms previous methods of SLP. We believe that this work\npresents an important and necessary step towards realistic neural sign avatars,\nbridging the communication gap between Deaf and hearing communities. The code,\nmethod and generated data will be made publicly available.\n","authors":["Vasileios Baltatzis","Rolandos Alexandros Potamias","Evangelos Ververas","Guanxiong Sun","Jiankang Deng","Stefanos Zafeiriou"],"pdf_url":"https://arxiv.org/pdf/2312.02702v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02700v1","updated":"2023-12-05T12:03:00Z","published":"2023-12-05T12:03:00Z","title":"Revisit Human-Scene Interaction via Space Occupancy","summary":" Human-scene Interaction (HSI) generation is a challenging task and crucial\nfor various downstream tasks. However, one of the major obstacles is the\nlimited data scale. High-quality data with simultaneously captured human and 3D\nenvironments is rare, resulting in limited data diversity and complexity. In\nthis work, we argue that interaction with a scene is essentially interacting\nwith the space occupancy of the scene from an abstract physical perspective,\nleading us to a unified novel view of Human-Occupancy Interaction. By treating\npure motion sequences as records of humans interacting with invisible scene\noccupancy, we can aggregate motion-only data into a large-scale paired\nhuman-occupancy interaction database: Motion Occupancy Base (MOB). Thus, the\nneed for costly paired motion-scene datasets with high-quality scene scans can\nbe substantially alleviated. With this new unified view of Human-Occupancy\ninteraction, a single motion controller is proposed to reach the target state\ngiven the surrounding occupancy. Once trained on MOB with complex occupancy\nlayout, the controller could handle cramped scenes and generalize well to\ngeneral scenes with limited complexity. With no GT 3D scenes for training, our\nmethod can generate realistic and stable HSI motions in diverse scenarios,\nincluding both static and dynamic scenes. Our code and data would be made\npublicly available at https://foruck.github.io/occu-page/.\n","authors":["Xinpeng Liu","Haowen Hou","Yanchao Yang","Yong-Lu Li","Cewu Lu"],"pdf_url":"https://arxiv.org/pdf/2312.02700v1.pdf","comment":"The first two authors contributed equally. Yong-Lu Li is the\n corresponding author. 
Project page: https://foruck.github.io/occu-page/"},{"id":"http://arxiv.org/abs/2312.02699v1","updated":"2023-12-05T12:02:53Z","published":"2023-12-05T12:02:53Z","title":"Enhancing Vehicle Entrance and Parking Management: Deep Learning\n Solutions for Efficiency and Security","summary":" The auto-management of vehicle entrance and parking in any organization is a\ncomplex challenge encompassing record-keeping, efficiency, and security\nconcerns. Manual methods for tracking vehicles and finding parking spaces are\nslow and a waste of time. To solve the problem of auto management of vehicle\nentrance and parking, we have utilized state-of-the-art deep learning models\nand automated the process of vehicle entrance and parking into any\norganization. To ensure security, our system integrated vehicle detection,\nlicense number plate verification, and face detection and recognition models to\nensure that the person and vehicle are registered with the organization. We\nhave trained multiple deep-learning models for vehicle detection, license\nnumber plate detection, face detection, and recognition, however, the YOLOv8n\nmodel outperformed all the other models. Furthermore, License plate recognition\nis facilitated by Google's Tesseract-OCR Engine. By integrating these\ntechnologies, the system offers efficient vehicle detection, precise\nidentification, streamlined record keeping, and optimized parking slot\nallocation in buildings, thereby enhancing convenience, accuracy, and security.\nFuture research opportunities lie in fine-tuning system performance for a wide\nrange of real-world applications.\n","authors":["Muhammad Umer Ramzan","Usman Ali","Syed Haider Abbas Naqvi","Zeeshan Aslam"," Tehseen","Husnain Ali","Muhammad Faheem"],"pdf_url":"https://arxiv.org/pdf/2312.02699v1.pdf","comment":"Accepted for publication in the 25th International Multitopic\n Conference (INMIC) IEEE 2023, 6 Pages, 3 figures"},{"id":"http://arxiv.org/abs/2312.02696v1","updated":"2023-12-05T11:55:47Z","published":"2023-12-05T11:55:47Z","title":"Analyzing and Improving the Training Dynamics of Diffusion Models","summary":" Diffusion models currently dominate the field of data-driven image synthesis\nwith their unparalleled scaling to large datasets. In this paper, we identify\nand rectify several causes for uneven and ineffective training in the popular\nADM diffusion model architecture, without altering its high-level structure.\nObserving uncontrolled magnitude changes and imbalances in both the network\nactivations and weights over the course of training, we redesign the network\nlayers to preserve activation, weight, and update magnitudes on expectation. We\nfind that systematic application of this philosophy eliminates the observed\ndrifts and imbalances, resulting in considerably better networks at equal\ncomputational complexity. Our modifications improve the previous record FID of\n2.41 in ImageNet-512 synthesis to 1.81, achieved using fast deterministic\nsampling.\n As an independent contribution, we present a method for setting the\nexponential moving average (EMA) parameters post-hoc, i.e., after completing\nthe training run. 
This allows precise tuning of EMA length without the cost of\nperforming several training runs, and reveals its surprising interactions with\nnetwork architecture, training time, and guidance.\n","authors":["Tero Karras","Miika Aittala","Jaakko Lehtinen","Janne Hellsten","Timo Aila","Samuli Laine"],"pdf_url":"https://arxiv.org/pdf/2312.02696v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02694v1","updated":"2023-12-05T11:53:17Z","published":"2023-12-05T11:53:17Z","title":"UPOCR: Towards Unified Pixel-Level OCR Interface","summary":" In recent years, the optical character recognition (OCR) field has been\nproliferating with plentiful cutting-edge approaches for a wide spectrum of\ntasks. However, these approaches are task-specifically designed with divergent\nparadigms, architectures, and training strategies, which significantly\nincreases the complexity of research and maintenance and hinders fast\ndeployment in applications. To this end, we propose UPOCR, a\nsimple-yet-effective generalist model for Unified Pixel-level OCR interface.\nSpecifically, the UPOCR unifies the paradigm of diverse OCR tasks as\nimage-to-image transformation and the architecture as a vision Transformer\n(ViT)-based encoder-decoder. Learnable task prompts are introduced to push the\ngeneral feature representations extracted by the encoder toward task-specific\nspaces, endowing the decoder with task awareness. Moreover, the model training\nis uniformly aimed at minimizing the discrepancy between the generated and\nground-truth images regardless of the inhomogeneity among tasks. Experiments\nare conducted on three pixel-level OCR tasks including text removal, text\nsegmentation, and tampered text detection. Without bells and whistles, the\nexperimental results showcase that the proposed method can simultaneously\nachieve state-of-the-art performance on three tasks with a unified single\nmodel, which provides valuable strategies and insights for future research on\ngeneralist OCR models. Code will be publicly available.\n","authors":["Dezhi Peng","Zhenhua Yang","Jiaxin Zhang","Chongyu Liu","Yongxin Shi","Kai Ding","Fengjun Guo","Lianwen Jin"],"pdf_url":"https://arxiv.org/pdf/2312.02694v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02684v1","updated":"2023-12-05T11:40:41Z","published":"2023-12-05T11:40:41Z","title":"DeepPointMap: Advancing LiDAR SLAM with Unified Neural Descriptors","summary":" Point clouds have shown significant potential in various domains, including\nSimultaneous Localization and Mapping (SLAM). However, existing approaches\neither rely on dense point clouds to achieve high localization accuracy or use\ngeneralized descriptors to reduce map size. Unfortunately, these two aspects\nseem to conflict with each other. To address this limitation, we propose a\nunified architecture, DeepPointMap, achieving excellent performance on both\naspects. We utilize a neural network to extract highly representative and sparse\nneural descriptors from point clouds, enabling memory-efficient map\nrepresentation and accurate multi-scale localization tasks (e.g., odometry and\nloop-closure). Moreover, we showcase the versatility of our framework by\nextending it to more challenging multi-agent collaborative SLAM. 
The promising\nresults obtained in these scenarios further emphasize the effectiveness and\npotential of our approach.\n","authors":["Xiaze Zhang","Ziheng Ding","Qi Jing","Yuejie Zhang","Wenchao Ding","Rui Feng"],"pdf_url":"https://arxiv.org/pdf/2312.02684v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05867v2","updated":"2023-12-05T11:37:54Z","published":"2023-10-09T17:03:39Z","title":"Domain-wise Invariant Learning for Panoptic Scene Graph Generation","summary":" Panoptic Scene Graph Generation (PSG) involves the detection of objects and\nthe prediction of their corresponding relationships (predicates). However, the\npresence of biased predicate annotations poses a significant challenge for PSG\nmodels, as it hinders their ability to establish a clear decision boundary\namong different predicates. This issue substantially impedes the practical\nutility and real-world applicability of PSG models. To address the intrinsic\nbias above, we propose a novel framework to infer potentially biased\nannotations by measuring the predicate prediction risks within each\nsubject-object pair (domain), and adaptively transfer the biased annotations to\nconsistent ones by learning invariant predicate representation embeddings.\nExperiments show that our method significantly improves the performance of\nbenchmark models, achieving a new state-of-the-art performance, and shows great\ngeneralization and effectiveness on PSG dataset.\n","authors":["Li Li","You Qin","Wei Ji","Yuxiao Zhou","Roger Zimmermann"],"pdf_url":"https://arxiv.org/pdf/2310.05867v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2307.15567"},{"id":"http://arxiv.org/abs/2302.06353v2","updated":"2023-12-05T11:32:00Z","published":"2023-02-13T13:35:26Z","title":"Contour-based Interactive Segmentation","summary":" Recent advances in interactive segmentation (IS) allow speeding up and\nsimplifying image editing and labeling greatly. The majority of modern IS\napproaches accept user input in the form of clicks. However, using clicks may\nrequire too many user interactions, especially when selecting small objects,\nminor parts of an object, or a group of objects of the same type. In this\npaper, we consider such a natural form of user interaction as a loose contour,\nand introduce a contour-based IS method. We evaluate the proposed method on the\nstandard segmentation benchmarks, our novel UserContours dataset, and its\nsubset UserContours-G containing difficult segmentation cases. Through\nexperiments, we demonstrate that a single contour provides the same accuracy as\nmultiple clicks, thus reducing the required amount of user interactions.\n","authors":["Danil Galeev","Polina Popenova","Anna Vorontsova","Anton Konushin"],"pdf_url":"https://arxiv.org/pdf/2302.06353v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02672v1","updated":"2023-12-05T11:29:00Z","published":"2023-12-05T11:29:00Z","title":"Are Synthetic Data Useful for Egocentric Hand-Object Interaction\n Detection? An Investigation and the HOI-Synth Domain Adaptation Benchmark","summary":" In this study, we investigate the effectiveness of synthetic data in\nenhancing hand-object interaction detection within the egocentric vision\ndomain. We introduce a simulator able to generate synthetic images of\nhand-object interactions automatically labeled with hand-object contact states,\nbounding boxes, and pixel-wise segmentation masks. 
Through comprehensive\nexperiments and comparative analyses on three egocentric datasets, VISOR,\nEgoHOS, and ENIGMA-51, we demonstrate that the use of synthetic data and domain\nadaptation techniques allows for comparable performance to conventional\nsupervised methods while requiring annotations on only a fraction of the real\ndata. When tested with in-domain synthetic data generated from 3D models of\nreal target environments and objects, our best models show consistent\nperformance improvements with respect to standard fully supervised approaches\nbased on labeled real data only. Our study also sets a new benchmark of domain\nadaptation for egocentric hand-object interaction detection (HOI-Synth) and\nprovides baseline results to encourage the community to engage in this\nchallenging task. We release the generated data, code, and the simulator at the\nfollowing link: https://iplab.dmi.unict.it/HOI-Synth/.\n","authors":["Rosario Leonardi","Antonino Furnari","Francesco Ragusa","Giovanni Maria Farinella"],"pdf_url":"https://arxiv.org/pdf/2312.02672v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02663v1","updated":"2023-12-05T11:02:45Z","published":"2023-12-05T11:02:45Z","title":"FaceStudio: Put Your Face Everywhere in Seconds","summary":" This study investigates identity-preserving image synthesis, an intriguing\ntask in image generation that seeks to maintain a subject's identity while\nadding a personalized, stylistic touch. Traditional methods, such as Textual\nInversion and DreamBooth, have made strides in custom image creation, but they\ncome with significant drawbacks. These include the need for extensive resources\nand time for fine-tuning, as well as the requirement for multiple reference\nimages. To overcome these challenges, our research introduces a novel approach\nto identity-preserving synthesis, with a particular focus on human images. Our\nmodel leverages a direct feed-forward mechanism, circumventing the need for\nintensive fine-tuning, thereby facilitating quick and efficient image\ngeneration. Central to our innovation is a hybrid guidance framework, which\ncombines stylized images, facial images, and textual prompts to guide the image\ngeneration process. This unique combination enables our model to produce a\nvariety of applications, such as artistic portraits and identity-blended\nimages. Our experimental results, including both qualitative and quantitative\nevaluations, demonstrate the superiority of our method over existing baseline\nmodels and previous works, particularly in its remarkable efficiency and\nability to preserve the subject's identity with high fidelity.\n","authors":["Yuxuan Yan","Chi Zhang","Rui Wang","Pei Cheng","Gang Yu","Bin Fu"],"pdf_url":"https://arxiv.org/pdf/2312.02663v1.pdf","comment":"Project homepage: https://icoz69.github.io/facestudio/"},{"id":"http://arxiv.org/abs/2312.02647v1","updated":"2023-12-05T10:39:37Z","published":"2023-12-05T10:39:37Z","title":"TPA3D: Triplane Attention for Fast Text-to-3D Generation","summary":" Due to the lack of large-scale text-3D correspondence data, recent text-to-3D\ngeneration works mainly rely on utilizing 2D diffusion models for synthesizing\n3D data. Since diffusion-based methods typically require significant\noptimization time for both training and inference, the use of GAN-based models\nwould still be desirable for fast 3D generation. 
In this work, we propose\nTriplane Attention for text-guided 3D generation (TPA3D), an end-to-end\ntrainable GAN-based deep learning model for fast text-to-3D generation. With\nonly 3D shape data and their rendered 2D images observed during training, our\nTPA3D is designed to retrieve detailed visual descriptions for synthesizing the\ncorresponding 3D mesh data. This is achieved by the proposed attention\nmechanisms on the extracted sentence and word-level text features. In our\nexperiments, we show that TPA3D generates high-quality 3D textured shapes\naligned with fine-grained descriptions, while achieving impressive computational\nefficiency.\n","authors":["Hong-En Chen","Bin-Shih Wu","Sheng-Yu Huang","Yu-Chiang Frank Wang"],"pdf_url":"https://arxiv.org/pdf/2312.02647v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.03408v3","updated":"2023-12-05T10:30:12Z","published":"2022-05-06T12:49:18Z","title":"HRCTCov19 -- A High-Resolution Chest CT Scan Image Dataset for COVID-19\n Diagnosis and Differentiation","summary":" Introduction: During the COVID-19 pandemic, computed tomography (CT) was a\npopular method for diagnosing COVID-19 patients. HRCT (High-Resolution Computed\nTomography) is a form of computed tomography that uses advanced methods to\nimprove image resolution. Publicly accessible COVID-19 CT image datasets are\nvery difficult to come by due to privacy concerns, which impedes the study and\ndevelopment of AI-powered COVID-19 diagnostic algorithms based on CT images.\nData description: To address this problem, we have introduced HRCTCov19, a new\nCOVID-19 high-resolution chest CT scan image dataset that includes not only\nCOVID-19 cases of Ground Glass Opacity (GGO), Crazy Paving, and Air Space\nConsolidation but also CT images of cases with negative COVID-19. The HRCTCov19\ndataset, which includes slice-level and patient-level labels, has the\npotential to aid COVID-19 research, especially for diagnosis and\ndifferentiation using artificial intelligence algorithms, machine learning, and\ndeep learning methods. This dataset is accessible through the web at:\nhttp://databiox.com and includes 181,106 chest HRCT images from 395 patients\nwith four labels: GGO, Crazy Paving, Air Space Consolidation, and Negative.\nKeywords: COVID-19, CT scan, Computed Tomography, Chest Image, Dataset, Medical\nImaging\n","authors":["Iraj Abedi","Mahsa Vali","Bentolhoda Otroshi","Maryam Zamanian","Hamidreza Bolhasani"],"pdf_url":"https://arxiv.org/pdf/2205.03408v3.pdf","comment":"5 pages, 2 figures and 1 table"},{"id":"http://arxiv.org/abs/2311.13307v2","updated":"2023-12-05T10:25:23Z","published":"2023-11-22T10:55:36Z","title":"Rethinking Radiology Report Generation via Causal Reasoning and\n Counterfactual Augmentation","summary":" Radiology Report Generation (RRG) draws attention as an interaction between\nvision and language fields. Previous works inherited the ideology of\nvision-to-language generation tasks, aiming to generate paragraphs with high\nconsistency as reports. However, one unique characteristic of RRG, the\nindependence between diseases, was neglected, leading to the injection of\ndisease co-occurrence as a confounder that affects the results through a backdoor\npath. Unfortunately, this confounder further degrades report generation because\nof the biased RRG data distribution. 
In this paper, to rethink\nthis issue thoroughly, we reason about its causes and effects from a novel\nperspective of statistics and causality, where the Joint Vision Coupling and\nthe Conditional Sentence Coherence Coupling are two aspects prone to implicitly\ndecrease the accuracy of reports. Then, a counterfactual augmentation strategy\nthat contains the Counterfactual Sample Synthesis and the Counterfactual Report\nReconstruction sub-methods is proposed to break these two aspects of spurious\neffects. Experimental results and further analyses on two widely used datasets\njustify our reasoning and proposed methods.\n","authors":["Xiao Song","Jiafan Liu","Yun Li","Wenbin Lei","Ruxin Wang"],"pdf_url":"https://arxiv.org/pdf/2311.13307v2.pdf","comment":"10 pages,5 figures"},{"id":"http://arxiv.org/abs/2312.02638v1","updated":"2023-12-05T10:24:43Z","published":"2023-12-05T10:24:43Z","title":"Synchronization is All You Need: Exocentric-to-Egocentric Transfer for\n Temporal Action Segmentation with Unlabeled Synchronized Video Pairs","summary":" We consider the problem of transferring a temporal action segmentation system\ninitially designed for exocentric (fixed) cameras to an egocentric scenario,\nwhere wearable cameras capture video data. The conventional supervised approach\nrequires the collection and labeling of a new set of egocentric videos to adapt\nthe model, which is costly and time-consuming. Instead, we propose a novel\nmethodology which performs the adaptation leveraging existing labeled\nexocentric videos and a new set of unlabeled, synchronized\nexocentric-egocentric video pairs, for which temporal action segmentation\nannotations do not need to be collected. We implement the proposed methodology\nwith an approach based on knowledge distillation, which we investigate both at\nthe feature and model level. To evaluate our approach, we introduce a new\nbenchmark based on the Assembly101 dataset. Results demonstrate the feasibility\nand effectiveness of the proposed method against classic unsupervised domain\nadaptation and temporal sequence alignment approaches. Remarkably, without\nbells and whistles, our best model performs on par with supervised approaches\ntrained on labeled egocentric data, without ever seeing a single egocentric\nlabel, achieving a +15.99% (28.59% vs 12.60%) improvement in the edit score on\nthe Assembly101 dataset compared to a baseline model trained solely on\nexocentric data.\n","authors":["Camillo Quattrocchi","Antonino Furnari","Daniele Di Mauro","Mario Valerio Giuffrida","Giovanni Maria Farinella"],"pdf_url":"https://arxiv.org/pdf/2312.02638v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.06614v3","updated":"2023-12-05T10:08:45Z","published":"2023-07-13T08:27:09Z","title":"Interpretable 2D Vision Models for 3D Medical Images","summary":" Training Artificial Intelligence (AI) models on 3D images presents unique\nchallenges compared to the 2D case: Firstly, the demand for computational\nresources is significantly higher, and secondly, the availability of large\ndatasets for pre-training is often limited, impeding training success. This\nstudy proposes a simple approach of adapting 2D networks with an intermediate\nfeature representation for processing 3D images. Our method employs attention\npooling to learn to assign each slice an importance weight and, by that, obtain\na weighted average of all 2D slices. These weights directly quantify the\ncontribution of each slice to the prediction and thus make the model\nprediction inspectable. 
We show on all 3D MedMNIST datasets as benchmark and\ntwo real-world datasets consisting of several hundred high-resolution CT or MRI\nscans that our approach performs on par with existing methods. Furthermore, we\ncompare the in-built interpretability of our approach to HiResCam, a\nstate-of-the-art retrospective interpretability approach.\n","authors":["Alexander Ziller","Ayhan Can Erdur","Marwa Trigui","Alp Güvenir","Tamara T. Mueller","Philip Müller","Friederike Jungmann","Johannes Brandt","Jan Peeken","Rickmer Braren","Daniel Rueckert","Georgios Kaissis"],"pdf_url":"https://arxiv.org/pdf/2307.06614v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.05810v2","updated":"2023-12-05T10:07:28Z","published":"2023-08-10T18:09:44Z","title":"Spintronics for image recognition: performance benchmarking via\n ultrafast data-driven simulations","summary":" We present a demonstration of image classification using an echo-state\nnetwork (ESN) relying on a single simulated spintronic nanostructure known as\nthe vortex-based spin-torque oscillator (STVO) delayed in time. We employ an\nultrafast data-driven simulation framework called the data-driven Thiele\nequation approach (DD-TEA) to simulate the STVO dynamics. This allows us to\navoid the challenges associated with repeated experimental manipulation of such\na nanostructured system. We showcase the versatility of our solution by\nsuccessfully applying it to solve classification challenges with the MNIST,\nEMNIST-letters and Fashion MNIST datasets. Through our simulations, we\ndetermine that within a large ESN the results obtained using the STVO dynamics\nas an activation function are comparable to the ones obtained with other\nconventional nonlinear activation functions like the reLU and the sigmoid.\nWhile achieving state-of-the-art accuracy levels on the MNIST dataset, our\nmodel's performance on EMNIST-letters and Fashion MNIST is lower due to the\nrelative simplicity of the system architecture and the increased complexity of\nthe tasks. We expect that the DD-TEA framework will enable the exploration of\ndeeper architectures, ultimately leading to improved classification accuracy.\n","authors":["Anatole Moureaux","Chloé Chopin","Laurent Jacques","Flavio Abreu Araujo"],"pdf_url":"https://arxiv.org/pdf/2308.05810v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00371v3","updated":"2023-12-05T10:06:05Z","published":"2023-07-01T15:48:33Z","title":"Learning Content-enhanced Mask Transformer for Domain Generalized\n Urban-Scene Segmentation","summary":" Domain-generalized urban-scene semantic segmentation (USSS) aims to learn\ngeneralized semantic predictions across diverse urban-scene styles. Unlike\ndomain gap challenges, USSS is unique in that the semantic categories are often\nsimilar in different urban scenes, while the styles can vary significantly due\nto changes in urban landscapes, weather conditions, lighting, and other\nfactors. Existing approaches typically rely on convolutional neural networks\n(CNNs) to learn the content of urban scenes.\n In this paper, we propose a Content-enhanced Mask TransFormer (CMFormer) for\ndomain-generalized USSS. The main idea is to enhance the focus of the\nfundamental component, the mask attention mechanism, in Transformer\nsegmentation models on content information. To achieve this, we introduce a\nnovel content-enhanced mask attention mechanism. 
It learns mask queries from\nboth the image feature and its down-sampled counterpart, as lower-resolution\nimage features usually contain more robust content information and are less\nsensitive to style variations. These features are fused into a Transformer\ndecoder and integrated into a multi-resolution content-enhanced mask attention\nlearning scheme.\n Extensive experiments conducted on various domain-generalized urban-scene\nsegmentation datasets demonstrate that the proposed CMFormer significantly\noutperforms existing CNN-based methods for domain-generalized semantic\nsegmentation, achieving improvements of up to 14.00\\% in terms of mIoU (mean\nintersection over union). The source code is publicly available at\n\\url{https://github.com/BiQiWHU/CMFormer}.\n","authors":["Qi Bi","Shaodi You","Theo Gevers"],"pdf_url":"https://arxiv.org/pdf/2307.00371v3.pdf","comment":"18 pages, 10 figures"},{"id":"http://arxiv.org/abs/2311.16926v3","updated":"2023-12-05T10:04:37Z","published":"2023-11-28T16:31:27Z","title":"LLaFS: When Large-Language Models Meet Few-Shot Segmentation","summary":" This paper proposes LLaFS, the first attempt to leverage large language\nmodels (LLMs) in few-shot segmentation. In contrast to the conventional\nfew-shot segmentation methods that only rely on the limited and biased\ninformation from the annotated support images, LLaFS leverages the vast prior\nknowledge gained by LLM as an effective supplement and directly uses the LLM to\nsegment images in a few-shot manner. To enable the text-based LLM to handle\nimage-related tasks, we carefully design an input instruction that allows the\nLLM to produce segmentation results represented as polygons, and propose a\nregion-attribute table to simulate the human visual mechanism and provide\nmulti-modal guidance. We also synthesize pseudo samples and use curriculum\nlearning for pretraining to augment data and achieve better optimization. LLaFS\nachieves state-of-the-art results on multiple datasets, showing the potential\nof using LLMs for few-shot computer vision tasks. Code will be available at\nhttps://github.com/lanyunzhu99/LLaFS.\n","authors":["Lanyun Zhu","Tianrun Chen","Deyi Ji","Jieping Ye","Jun Liu"],"pdf_url":"https://arxiv.org/pdf/2311.16926v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02625v1","updated":"2023-12-05T10:01:11Z","published":"2023-12-05T10:01:11Z","title":"Diffusion Noise Feature: Accurate and Fast Generated Image Detection","summary":" Generative models have reached an advanced stage where they can produce\nremarkably realistic images. However, this remarkable generative capability\nalso introduces the risk of disseminating false or misleading information.\nNotably, existing image detectors for generated images encounter challenges\nsuch as low accuracy and limited generalization. This paper seeks to address\nthis issue by seeking a representation with strong generalization capabilities\nto enhance the detection of generated images. Our investigation has revealed\nthat real and generated images display distinct latent Gaussian representations\nwhen subjected to an inverse diffusion process within a pre-trained diffusion\nmodel. Exploiting this disparity, we can amplify subtle artifacts in generated\nimages. Building upon this insight, we introduce a novel image representation\nknown as Diffusion Noise Feature (DNF). DNF is an ensemble representation that\nestimates the noise generated during the inverse diffusion process. 
A simple\nclassifier, e.g., ResNet, trained on DNF achieves high accuracy, robustness,\nand generalization capabilities for detecting generated images, even from\npreviously unseen classes or models. We conducted experiments using a widely\nrecognized and standard dataset, achieving state-of-the-art detection\nperformance.\n","authors":["Yichi Zhang","Xiaogang Xu"],"pdf_url":"https://arxiv.org/pdf/2312.02625v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15260v2","updated":"2023-12-05T09:53:18Z","published":"2023-11-26T10:27:22Z","title":"NeuRAD: Neural Rendering for Autonomous Driving","summary":" Neural radiance fields (NeRFs) have gained popularity in the autonomous\ndriving (AD) community. Recent methods show NeRFs' potential for closed-loop\nsimulation, enabling testing of AD systems, and as an advanced training data\naugmentation technique. However, existing methods often require long training\ntimes, dense semantic supervision, or lack generalizability. This, in turn,\nhinders the application of NeRFs for AD at scale. In this paper, we propose\nNeuRAD, a robust novel view synthesis method tailored to dynamic AD data. Our\nmethod features simple network design, extensive sensor modeling for both\ncamera and lidar -- including rolling shutter, beam divergence and ray dropping\n-- and is applicable to multiple datasets out of the box. We verify its\nperformance on five popular AD datasets, achieving state-of-the-art performance\nacross the board. To encourage further development, we will openly release the\nNeuRAD source code. See https://github.com/georghess/NeuRAD .\n","authors":["Adam Tonderski","Carl Lindström","Georg Hess","William Ljungbergh","Lennart Svensson","Christoffer Petersson"],"pdf_url":"https://arxiv.org/pdf/2311.15260v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02617v1","updated":"2023-12-05T09:47:37Z","published":"2023-12-05T09:47:37Z","title":"DreaMo: Articulated 3D Reconstruction From A Single Casual Video","summary":" Articulated 3D reconstruction has valuable applications in various domains,\nyet it remains costly and demands intensive work from domain experts. Recent\nadvancements in template-free learning methods show promising results with\nmonocular videos. Nevertheless, these approaches necessitate a comprehensive\ncoverage of all viewpoints of the subject in the input video, thus limiting\ntheir applicability to casually captured videos from online sources. In this\nwork, we study articulated 3D shape reconstruction from a single and casually\ncaptured internet video, where the subject's view coverage is incomplete. We\npropose DreaMo that jointly performs shape reconstruction while solving the\nchallenging low-coverage regions with view-conditioned diffusion prior and\nseveral tailored regularizations. In addition, we introduce a skeleton\ngeneration strategy to create human-interpretable skeletons from the learned\nneural bones and skinning weights. We conduct our study on a self-collected\ninternet video collection characterized by incomplete view coverage. DreaMo\nshows promising quality in novel-view rendering, detailed articulated shape\nreconstruction, and skeleton generation. 
Extensive qualitative and quantitative\nstudies validate the efficacy of each proposed component, and show existing\nmethods are unable to solve correct geometry due to the incomplete view\ncoverage.\n","authors":["Tao Tu","Ming-Feng Li","Chieh Hubert Lin","Yen-Chi Cheng","Min Sun","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2312.02617v1.pdf","comment":"Project page: https://ttaoretw.github.io/dreamo/"},{"id":"http://arxiv.org/abs/2312.02616v1","updated":"2023-12-05T09:47:28Z","published":"2023-12-05T09:47:28Z","title":"Facilitating the Production of Well-tailored Video Summaries for Sharing\n on Social Media","summary":" This paper presents a web-based tool that facilitates the production of\ntailored summaries for online sharing on social media. Through an interactive\nuser interface, it supports a ``one-click'' video summarization process. Based\non the integrated AI models for video summarization and aspect ratio\ntransformation, it facilitates the generation of multiple summaries of a\nfull-length video according to the needs of target platforms with regard to the\nvideo's length and aspect ratio.\n","authors":["Evlampios Apostolidis","Konstantinos Apostolidis","Vasileios Mezaris"],"pdf_url":"https://arxiv.org/pdf/2312.02616v1.pdf","comment":"Accepted for publication, 30th Int. Conf. on MultiMedia Modeling (MMM\n 2024), Amsterdam, NL, Jan.-Feb. 2024. This is the \"submitted manuscript\"\n version"},{"id":"http://arxiv.org/abs/2312.02615v1","updated":"2023-12-05T09:44:47Z","published":"2023-12-05T09:44:47Z","title":"Projection Regret: Reducing Background Bias for Novelty Detection via\n Diffusion Models","summary":" Novelty detection is a fundamental task of machine learning which aims to\ndetect abnormal ($\\textit{i.e.}$ out-of-distribution (OOD)) samples. Since\ndiffusion models have recently emerged as the de facto standard generative\nframework with surprising generation results, novelty detection via diffusion\nmodels has also gained much attention. Recent methods have mainly utilized the\nreconstruction property of in-distribution samples. However, they often suffer\nfrom detecting OOD samples that share similar background information to the\nin-distribution data. Based on our observation that diffusion models can\n\\emph{project} any sample to an in-distribution sample with similar background\ninformation, we propose \\emph{Projection Regret (PR)}, an efficient novelty\ndetection method that mitigates the bias of non-semantic information. To be\nspecific, PR computes the perceptual distance between the test image and its\ndiffusion-based projection to detect abnormality. Since the perceptual distance\noften fails to capture semantic changes when the background information is\ndominant, we cancel out the background bias by comparing it against recursive\nprojections. Extensive experiments demonstrate that PR outperforms the prior\nart of generative-model-based novelty detection methods by a significant\nmargin.\n","authors":["Sungik Choi","Hankook Lee","Honglak Lee","Moontae Lee"],"pdf_url":"https://arxiv.org/pdf/2312.02615v1.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.02613v1","updated":"2023-12-05T09:43:27Z","published":"2023-12-05T09:43:27Z","title":"A Unified Simulation Framework for Visual and Behavioral Fidelity in\n Crowd Analysis","summary":" Simulation is a powerful tool to easily generate annotated data, and a highly\ndesirable feature, especially in those domains where learning models need large\ntraining datasets. 
Machine learning and deep learning solutions, have proven to\nbe extremely data-hungry and sometimes, the available real-world data are not\nsufficient to effectively model the given task. Despite the initial skepticism\nof a portion of the scientific community, the potential of simulation has been\nlargely confirmed in many application areas, and the recent developments in\nterms of rendering and virtualization engines, have shown a good ability also\nin representing complex scenes. This includes environmental factors, such as\nweather conditions and surface reflectance, as well as human-related events,\nlike human actions and behaviors. We present a human crowd simulator, called\nUniCrowd, and its associated validation pipeline. We show how the simulator can\ngenerate annotated data, suitable for computer vision tasks, in particular for\ndetection and segmentation, as well as the related applications, as crowd\ncounting, human pose estimation, trajectory analysis and prediction, and\nanomaly detection.\n","authors":["Niccolò Bisagno","Nicola Garau","Antonio Luigi Stefani","Nicola Conci"],"pdf_url":"https://arxiv.org/pdf/2312.02613v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02608v1","updated":"2023-12-05T09:34:56Z","published":"2023-12-05T09:34:56Z","title":"Panoptica -- instance-wise evaluation of 3D semantic and instance\n segmentation maps","summary":" This paper introduces panoptica, a versatile and performance-optimized\npackage designed for computing instance-wise segmentation quality metrics from\n2D and 3D segmentation maps. panoptica addresses the limitations of existing\nmetrics and provides a modular framework that complements the original\nintersection over union-based panoptic quality with other metrics, such as the\ndistance metric Average Symmetric Surface Distance. The package is open-source,\nimplemented in Python, and accompanied by comprehensive documentation and\ntutorials. panoptica employs a three-step metrics computation process to cover\ndiverse use cases. The efficacy of panoptica is demonstrated on various\nreal-world biomedical datasets, where an instance-wise evaluation is\ninstrumental for an accurate representation of the underlying clinical task.\nOverall, we envision panoptica as a valuable tool facilitating in-depth\nevaluation of segmentation methods.\n","authors":["Florian Kofler","Hendrik Möller","Josef A. Buchner","Ezequiel de la Rosa","Ivan Ezhov","Marcel Rosier","Isra Mekki","Suprosanna Shit","Moritz Negwer","Rami Al-Maskari","Ali Ertürk","Shankeeth Vinayahalingam","Fabian Isensee","Sarthak Pati","Daniel Rueckert","Jan S. Kirschke","Stefan K. Ehrlich","Annika Reinke","Bjoern Menze","Benedikt Wiestler","Marie Piraud"],"pdf_url":"https://arxiv.org/pdf/2312.02608v1.pdf","comment":"15 pages, 6 figures, 3 tables"},{"id":"http://arxiv.org/abs/2312.02605v1","updated":"2023-12-05T09:26:09Z","published":"2023-12-05T09:26:09Z","title":"Accelerating Learnt Video Codecs with Gradient Decay and Layer-wise\n Distillation","summary":" In recent years, end-to-end learnt video codecs have demonstrated their\npotential to compete with conventional coding algorithms in term of compression\nefficiency. However, most learning-based video compression models are\nassociated with high computational complexity and latency, in particular at the\ndecoder side, which limits their deployment in practical applications. In this\npaper, we present a novel model-agnostic pruning scheme based on gradient decay\nand adaptive layer-wise distillation. 
Gradient decay enhances parameter\nexploration during sparsification whilst preventing runaway sparsity and is\nsuperior to the standard Straight-Through Estimation. The adaptive layer-wise\ndistillation regulates the sparse training in various stages based on the\ndistortion of intermediate features. This stage-wise design efficiently updates\nparameters with minimal computational overhead. The proposed approach has been\napplied to three popular end-to-end learnt video codecs, FVC, DCVC, and\nDCVC-HEM. Results confirm that our method yields up to 65% reduction in MACs\nand 2x speed-up with less than 0.3dB drop in BD-PSNR. Supporting code and\nsupplementary material can be downloaded from:\nhttps://jasminepp.github.io/lightweightdvc/\n","authors":["Tianhao Peng","Ge Gao","Heming Sun","Fan Zhang","David Bull"],"pdf_url":"https://arxiv.org/pdf/2312.02605v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.19798v2","updated":"2023-12-05T09:26:05Z","published":"2023-05-31T12:38:24Z","title":"Primal-Attention: Self-attention through Asymmetric Kernel SVD in Primal\n Representation","summary":" Recently, a new line of works has emerged to understand and improve\nself-attention in Transformers by treating it as a kernel machine. However,\nexisting works apply the methods for symmetric kernels to the asymmetric\nself-attention, resulting in a nontrivial gap between the analytical\nunderstanding and numerical implementation. In this paper, we provide a new\nperspective to represent and optimize self-attention through asymmetric Kernel\nSingular Value Decomposition (KSVD), which is also motivated by the low-rank\nproperty of self-attention normally observed in deep layers. Through asymmetric\nKSVD, $i$) a primal-dual representation of self-attention is formulated, where\nthe optimization objective is cast to maximize the projection variances in the\nattention outputs; $ii$) a novel attention mechanism, i.e., Primal-Attention,\nis proposed via the primal representation of KSVD, avoiding explicit\ncomputation of the kernel matrix in the dual; $iii$) with KKT conditions, we\nprove that the stationary solution to the KSVD optimization in Primal-Attention\nyields a zero-value objective. In this manner, KSVD optimization can be\nimplemented by simply minimizing a regularization loss, so that low-rank\nproperty is promoted without extra decomposition. Numerical experiments show\nstate-of-the-art performance of our Primal-Attention with improved efficiency.\nMoreover, we demonstrate that the deployed KSVD optimization regularizes\nPrimal-Attention with a sharper singular value decay than that of the canonical\nself-attention, further verifying the great potential of our method. To the\nbest of our knowledge, this is the first work that provides a primal-dual\nrepresentation for the asymmetric kernel in self-attention and successfully\napplies it to modeling and optimization.\n","authors":["Yingyi Chen","Qinghua Tao","Francesco Tonin","Johan A. K. Suykens"],"pdf_url":"https://arxiv.org/pdf/2305.19798v2.pdf","comment":"NeurIPS 2023. 
We provide a primal-dual representation for the\n asymmetric self-attention in transformer that allows to avoid explicit\n computation of the kernel matrix"},{"id":"http://arxiv.org/abs/2306.17046v4","updated":"2023-12-05T09:23:25Z","published":"2023-06-29T15:43:06Z","title":"Spiking Denoising Diffusion Probabilistic Models","summary":" Spiking neural networks (SNNs) have ultra-low energy consumption and high\nbiological plausibility due to their binary and bio-driven nature compared with\nartificial neural networks (ANNs). While previous research has primarily\nfocused on enhancing the performance of SNNs in classification tasks, the\ngenerative potential of SNNs remains relatively unexplored. In our paper, we\nput forward Spiking Denoising Diffusion Probabilistic Models (SDDPM), a new\nclass of SNN-based generative models that achieve high sample quality. To fully\nexploit the energy efficiency of SNNs, we propose a purely Spiking U-Net\narchitecture, which achieves comparable performance to its ANN counterpart\nusing only 4 time steps, resulting in significantly reduced energy consumption.\nExtensive experimental results reveal that our approach achieves\nstate-of-the-art on the generative tasks and substantially outperforms other\nSNN-based generative models, achieving up to 12x and 6x improvement on the\nCIFAR-10 and the CelebA datasets, respectively. Moreover, we propose a\nthreshold-guided strategy that can further improve the performances by 2.69% in\na training-free manner. The SDDPM symbolizes a significant advancement in the\nfield of SNN generation, injecting new perspectives and potential avenues of\nexploration. Our code is available at https://github.com/AndyCao1125/SDDPM.\n","authors":["Jiahang Cao","Ziqing Wang","Hanzhong Guo","Hao Cheng","Qiang Zhang","Renjing Xu"],"pdf_url":"https://arxiv.org/pdf/2306.17046v4.pdf","comment":"Accepted by WACV 2024"},{"id":"http://arxiv.org/abs/2304.06059v2","updated":"2023-12-05T09:16:40Z","published":"2023-04-12T15:29:28Z","title":"Efficient Deep Learning Models for Privacy-preserving People Counting on\n Low-resolution Infrared Arrays","summary":" Ultra-low-resolution Infrared (IR) array sensors offer a low-cost,\nenergy-efficient, and privacy-preserving solution for people counting, with\napplications such as occupancy monitoring. Previous work has shown that Deep\nLearning (DL) can yield superior performance on this task. However, the\nliterature was missing an extensive comparative analysis of various efficient\nDL architectures for IR array-based people counting, that considers not only\ntheir accuracy, but also the cost of deploying them on memory- and\nenergy-constrained Internet of Things (IoT) edge nodes. In this work, we\naddress this need by comparing 6 different DL architectures on a novel dataset\ncomposed of IR images collected from a commercial 8x8 array, which we made\nopenly available. With a wide architectural exploration of each model type, we\nobtain a rich set of Pareto-optimal solutions, spanning cross-validated\nbalanced accuracy scores in the 55.70-82.70% range. When deployed on a\ncommercial Microcontroller (MCU) by STMicroelectronics, the STM32L4A6ZG, these\nmodels occupy 0.41-9.28kB of memory, and require 1.10-7.74ms per inference,\nwhile consuming 17.18-120.43 $\\mu$J of energy. Our models are significantly\nmore accurate than a previous deterministic method (up to +39.9%), while being\nup to 3.53x faster and more energy efficient. 
Further, our models' accuracy is\ncomparable to state-of-the-art DL solutions on similar resolution sensors,\ndespite a much lower complexity. All our models enable continuous, real-time\ninference on a MCU-based IoT node, with years of autonomous operation without\nbattery recharging.\n","authors":["Chen Xie","Francesco Daghero","Yukai Chen","Marco Castellano","Luca Gandolfi","Andrea Calimera","Enrico Macii","Massimo Poncino","Daniele Jahier Pagliari"],"pdf_url":"https://arxiv.org/pdf/2304.06059v2.pdf","comment":"This article has been accepted for publication in IEEE Internet of\n Things Journal; Fixed typos"},{"id":"http://arxiv.org/abs/2311.00230v2","updated":"2023-12-05T09:13:53Z","published":"2023-11-01T02:22:17Z","title":"DINO-Mix: Enhancing Visual Place Recognition with Foundational Vision\n Model and Feature Mixing","summary":" Utilizing visual place recognition (VPR) technology to ascertain the\ngeographical location of publicly available images is a pressing issue for\nreal-world VPR applications. Although most current VPR methods achieve\nfavorable results under ideal conditions, their performance in complex\nenvironments, characterized by lighting variations, seasonal changes, and\nocclusions caused by moving objects, is generally unsatisfactory. In this\nstudy, we utilize the DINOv2 model as the backbone network for trimming and\nfine-tuning to extract robust image features. We propose a novel VPR\narchitecture called DINO-Mix, which combines a foundational vision model with\nfeature aggregation. This architecture relies on the powerful image feature\nextraction capabilities of foundational vision models. We employ an\nMLP-Mixer-based mix module to aggregate image features, resulting in globally\nrobust and generalizable descriptors that enable high-precision VPR. We\nexperimentally demonstrate that the proposed DINO-Mix architecture\nsignificantly outperforms current state-of-the-art (SOTA) methods. In test sets\nhaving lighting variations, seasonal changes, and occlusions (Tokyo24/7,\nNordland, SF-XL-Testv1), our proposed DINO-Mix architecture achieved Top-1\naccuracy rates of 91.75%, 80.18%, and 82%, respectively. Compared with SOTA\nmethods, our architecture exhibited an average accuracy improvement of 5.14%.\n","authors":["Gaoshuang Huang","Yang Zhou","Xiaofei Hu","Chenglong Zhang","Luying Zhao","Wenjian Gan","Mingbo Hou"],"pdf_url":"https://arxiv.org/pdf/2311.00230v2.pdf","comment":"Under review / Open source code"},{"id":"http://arxiv.org/abs/2311.09655v2","updated":"2023-12-05T09:10:37Z","published":"2023-11-16T08:17:02Z","title":"Multi-View Spectrogram Transformer for Respiratory Sound Classification","summary":" Deep neural networks have been applied to audio spectrograms for respiratory\nsound classification. Existing models often treat the spectrogram as a\nsynthetic image while overlooking its physical characteristics. In this paper,\na Multi-View Spectrogram Transformer (MVST) is proposed to embed different\nviews of time-frequency characteristics into the vision transformer.\nSpecifically, the proposed MVST splits the mel-spectrogram into different sized\npatches, representing the multi-view acoustic elements of a respiratory sound.\nThese patches and positional embeddings are then fed into transformer encoders\nto extract the attentional information among patches through a self-attention\nmechanism. 
Finally, a gated fusion scheme is designed to automatically weigh\nthe multi-view features to highlight the best one in a specific scenario.\nExperimental results on the ICBHI dataset demonstrate that the proposed MVST\nsignificantly outperforms state-of-the-art methods for classifying respiratory\nsounds.\n","authors":["Wentao He","Yuchen Yan","Jianfeng Ren","Ruibin Bai","Xudong Jiang"],"pdf_url":"https://arxiv.org/pdf/2311.09655v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2209.00383v3","updated":"2023-12-05T09:01:49Z","published":"2022-09-01T11:52:26Z","title":"TokenCut: Segmenting Objects in Images and Videos with Self-supervised\n Transformer and Normalized Cut","summary":" In this paper, we describe a graph-based algorithm that uses the features\nobtained by a self-supervised transformer to detect and segment salient objects\nin images and videos. With this approach, the image patches that compose an\nimage or video are organised into a fully connected graph, where the edge\nbetween each pair of patches is labeled with a similarity score between patches\nusing features learned by the transformer. Detection and segmentation of\nsalient objects is then formulated as a graph-cut problem and solved using the\nclassical Normalized Cut algorithm. Despite the simplicity of this approach, it\nachieves state-of-the-art results on several common image and video detection\nand segmentation tasks. For unsupervised object discovery, this approach\noutperforms the competing approaches by a margin of 6.1%, 5.7%, and 2.6%,\nrespectively, when tested with the VOC07, VOC12, and COCO20K datasets. For the\nunsupervised saliency detection task in images, this method improves the score\nfor Intersection over Union (IoU) by 4.4%, 5.6% and 5.2%. When tested with the\nECSSD, DUTS, and DUT-OMRON datasets, respectively, compared to current\nstate-of-the-art techniques. This method also achieves competitive results for\nunsupervised video object segmentation tasks with the DAVIS, SegTV2, and FBMS\ndatasets.\n","authors":["Yangtao Wang","Xi Shen","Yuan Yuan","Yuming Du","Maomao Li","Shell Xu Hu","James L Crowley","Dominique Vaufreydaz"],"pdf_url":"https://arxiv.org/pdf/2209.00383v3.pdf","comment":"arXiv admin note: text overlap with arXiv:2202.11539"},{"id":"http://arxiv.org/abs/2312.01529v2","updated":"2023-12-05T09:01:07Z","published":"2023-12-03T23:03:22Z","title":"T3D: Towards 3D Medical Image Understanding through Vision-Language\n Pre-training","summary":" Expert annotation of 3D medical image for downstream analysis is\nresource-intensive, posing challenges in clinical applications. Visual\nself-supervised learning (vSSL), though effective for learning visual\ninvariance, neglects the incorporation of domain knowledge from medicine. To\nincorporate medical knowledge into visual representation learning,\nvision-language pre-training (VLP) has shown promising results in 2D image.\nHowever, existing VLP approaches become generally impractical when applied to\nhigh-resolution 3D medical images due to GPU hardware constraints and the\npotential loss of critical details caused by downsampling, which is the\nintuitive solution to hardware constraints. To address the above limitations,\nwe introduce T3D, the first VLP framework designed for high-resolution 3D\nmedical images. T3D incorporates two text-informed pretext tasks:\n(\\lowerromannumeral{1}) text-informed contrastive learning;\n(\\lowerromannumeral{2}) text-informed image restoration. 
These tasks focus on\nlearning 3D visual representations from high-resolution 3D medical images and\nintegrating clinical knowledge from radiology reports, without distorting\ninformation through forced alignment of downsampled volumes with detailed\nanatomical text. Trained on a newly curated large-scale dataset of 3D medical\nimages and radiology reports, T3D significantly outperforms current vSSL\nmethods in tasks like organ and tumor segmentation, as well as disease\nclassification. This underlines T3D's potential in representation learning for\n3D medical image analysis. All data and code will be available upon acceptance.\n","authors":["Che Liu","Cheng Ouyang","Yinda Chen","Cesar César Quilodrán-Casas","Lei Ma","Jie Fu","Yike Guo","Anand Shah","Wenjia Bai","Rossella Arcucci"],"pdf_url":"https://arxiv.org/pdf/2312.01529v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14062v2","updated":"2023-12-05T08:56:04Z","published":"2023-11-23T15:38:13Z","title":"Hardware Resilience Properties of Text-Guided Image Classifiers","summary":" This paper presents a novel method to enhance the reliability of image\nclassification models during deployment in the face of transient hardware\nerrors. By utilizing enriched text embeddings derived from GPT-3 with question\nprompts per class and CLIP pretrained text encoder, we investigate their impact\nas an initialization for the classification layer. Our approach achieves a\nremarkable $5.5\\times$ average increase in hardware reliability (and up to\n$14\\times$) across various architectures in the most critical layer, with\nminimal accuracy drop ($0.3\\%$ on average) compared to baseline PyTorch models.\nFurthermore, our method seamlessly integrates with any image classification\nbackbone, showcases results across various network architectures, decreases\nparameter and FLOPs overhead, and follows a consistent training recipe. This\nresearch offers a practical and efficient solution to bolster the robustness of\nimage classification models against hardware failures, with potential\nimplications for future studies in this domain. Our code and models are\nreleased at https://github.com/TalalWasim/TextGuidedResilience.\n","authors":["Syed Talal Wasim","Kabila Haile Soboka","Abdulrahman Mahmoud","Salman Khan","David Brooks","Gu-Yeon Wei"],"pdf_url":"https://arxiv.org/pdf/2311.14062v2.pdf","comment":"Accepted at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.02576v1","updated":"2023-12-05T08:48:31Z","published":"2023-12-05T08:48:31Z","title":"An Integrated System for Spatio-Temporal Summarization of 360-degrees\n Videos","summary":" In this work, we present an integrated system for spatiotemporal\nsummarization of 360-degrees videos. The video summary production mainly\ninvolves the detection of salient events and their synopsis into a concise\nsummary. The analysis relies on state-of-the-art methods for saliency detection\nin 360-degrees video (ATSal and SST-Sal) and video summarization (CA-SUM). It\nalso contains a mechanism that classifies a 360-degrees video based on the use\nof static or moving camera during recording and decides which saliency\ndetection method will be used, as well as a 2D video production component that\nis responsible to create a conventional 2D video containing the salient events\nin the 360-degrees video. 
Quantitative evaluations using two datasets for\n360-degrees video saliency detection (VR-EyeTracking, Sports-360) show the\naccuracy and positive impact of the developed decision mechanism, and justify\nour choice to use two different methods for detecting the salient events. A\nqualitative analysis using content from these datasets, gives further insights\nabout the functionality of the decision mechanism, shows the pros and cons of\neach used saliency detection method and demonstrates the advanced performance\nof the trained summarization method against a more conventional approach.\n","authors":["Ioannis Kontostathis","Evlampios Apostolidis","Vasileios Mezaris"],"pdf_url":"https://arxiv.org/pdf/2312.02576v1.pdf","comment":"Accepted for publication, 30th Int. Conf. on MultiMedia Modeling (MMM\n 2024), Amsterdam, NL, Jan.-Feb. 2024. This is the \"submitted manuscript\"\n version"},{"id":"http://arxiv.org/abs/2309.14162v2","updated":"2023-12-05T08:38:52Z","published":"2023-09-25T14:13:26Z","title":"Data Upcycling Knowledge Distillation for Image Super-Resolution","summary":" Knowledge distillation (KD) emerges as a promising yet challenging technique\nfor compressing deep neural networks, aiming to transfer extensive learning\nrepresentations from proficient and computationally intensive teacher models to\ncompact student models. However, current KD methods for super-resolution (SR)\nmodels have limited performance and restricted applications, since the\ncharacteristics of SR tasks are overlooked. In this paper, we put forth an\napproach from the perspective of effective data utilization, namely, the Data\nUpcycling Knowledge Distillation (DUKD), which facilitates the student model by\nthe prior knowledge the teacher provided through the upcycled in-domain data\nderived from the input images. Besides, for the first time, we realize the\nlabel consistency regularization in KD for SR models, which is implemented by\nthe paired invertible data augmentations. It constrains the training process of\nKD and leads to better generalization capability of the student model. The\nDUKD, due to its versatility, can be applied across a broad spectrum of\nteacher-student architectures (e.g., CNN and Transformer models) and SR tasks,\nsuch as single image SR, real-world SR, and SR quantization, and is in parallel\nwith other compression techniques. Comprehensive experiments on diverse\nbenchmarks demonstrate that the DUKD method significantly outperforms previous\nart.\n","authors":["Yun Zhang","Wei Li","Simiao Li","Jie Hu","Hanting Chen","Hailing Wang","Zhijun Tu","Wenjia Wang","Bingyi Jing","Yunhe Wang"],"pdf_url":"https://arxiv.org/pdf/2309.14162v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02568v1","updated":"2023-12-05T08:32:46Z","published":"2023-12-05T08:32:46Z","title":"Prompt2NeRF-PIL: Fast NeRF Generation via Pretrained Implicit Latent","summary":" This paper explores promptable NeRF generation (e.g., text prompt or single\nimage prompt) for direct conditioning and fast generation of NeRF parameters\nfor the underlying 3D scenes, thus undoing complex intermediate steps while\nproviding full 3D generation with conditional control. Unlike previous\ndiffusion-CLIP-based pipelines that involve tedious per-prompt optimizations,\nPrompt2NeRF-PIL is capable of generating a variety of 3D objects with a single\nforward pass, leveraging a pre-trained implicit latent space of NeRF\nparameters. 
Furthermore, in zero-shot tasks, our experiments demonstrate that\nthe NeRFs produced by our method serve as semantically informative\ninitializations, significantly accelerating the inference process of existing\nprompt-to-NeRF methods. Specifically, we will show that our approach speeds up\nthe text-to-NeRF model DreamFusion and the 3D reconstruction speed of the\nimage-to-NeRF method Zero-1-to-3 by 3 to 5 times.\n","authors":["Jianmeng Liu","Yuyao Zhang","Zeyuan Meng","Yu-Wing Tai","Chi-Keung Tang"],"pdf_url":"https://arxiv.org/pdf/2312.02568v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02567v1","updated":"2023-12-05T08:32:27Z","published":"2023-12-05T08:32:27Z","title":"Think Twice Before Selection: Federated Evidential Active Learning for\n Medical Image Analysis with Domain Shifts","summary":" Federated learning facilitates the collaborative learning of a global model\nacross multiple distributed medical institutions without centralizing data.\nNevertheless, the expensive cost of annotation on local clients remains an\nobstacle to effectively utilizing local data. To mitigate this issue, federated\nactive learning methods suggest leveraging local and global model predictions\nto select a relatively small amount of informative local data for annotation.\nHowever, existing methods mainly focus on all local data sampled from the same\ndomain, making them unreliable in realistic medical scenarios with domain\nshifts among different clients. In this paper, we make the first attempt to\nassess the informativeness of local data derived from diverse domains and\npropose a novel methodology termed Federated Evidential Active Learning (FEAL)\nto calibrate the data evaluation under domain shift. Specifically, we introduce\na Dirichlet prior distribution in both local and global models to treat the\nprediction as a distribution over the probability simplex and capture both\naleatoric and epistemic uncertainties by using the Dirichlet-based evidential\nmodel. Then we employ the epistemic uncertainty to calibrate the aleatoric\nuncertainty. Afterward, we design a diversity relaxation strategy to reduce\ndata redundancy and maintain data diversity. Extensive experiments and analyses\nare conducted to show the superiority of FEAL over the state-of-the-art active\nlearning methods and the efficiency of FEAL under the federated active learning\nframework.\n","authors":["Jiayi Chen","Benteng Ma","Hengfei Cui","Yong Xia","Kwang-Ting Cheng"],"pdf_url":"https://arxiv.org/pdf/2312.02567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.05197v2","updated":"2023-12-05T08:13:34Z","published":"2023-11-09T08:23:44Z","title":"Deep Learning in Computed Tomography Pulmonary Angiography Imaging: A\n Dual-Pronged Approach for Pulmonary Embolism Detection","summary":" The increasing reliance on Computed Tomography Pulmonary Angiography for\nPulmonary Embolism (PE) diagnosis presents challenges and a pressing need for\nimproved diagnostic solutions. The primary objective of this study is to\nleverage deep learning techniques to enhance the Computer Assisted Diagnosis of\nPE. In this study, we propose a classifier-guided detection approach that\neffectively leverages the classifier's probabilistic inference to direct the\ndetection predictions, marking a novel contribution in the domain of automated\nPE diagnosis. Our end-to-end classification framework introduces an\nAttention-Guided Convolutional Neural Network (AG-CNN) that leverages local\ncontext by utilizing an attention mechanism. 
This approach emulates the\nattention of a human expert by looking at both global appearances and local\nlesion regions before forming a conclusive decision. The classifier achieves a\nnotable AUROC, sensitivity, specificity and F1-score of 0.927, 0.862, 0.879 and\n0.805 respectively on the FUMPE dataset with Inception-v3 backbone\narchitecture. Moreover, AG-CNN outperforms the baseline DenseNet-121 model,\nachieving an 8.1% AUROC gain. While prior studies have primarily focused on PE\ndetection in main arteries, our utilization of state-of-the-art object\ndetection models and ensembling techniques significantly enhances detection\naccuracy for small embolisms in the peripheral arteries. Finally, our proposed\nclassifier-guided detection approach further refines the detection metrics\ncontributing new state-of-the-art to the community: mAP$_{50}$, sensitivity and\nF1-score of 0.846, 0.901 and 0.779 respectively outperforming the former\nbenchmark with a significant 3.7% improvement in mAP$_{50}$. Our research aims\nto elevate PE patient care by integrating AI solutions into clinical workflows,\nhighlighting the potential of human-AI collaboration in medical diagnostics.\n","authors":["Fabiha Bushra","Muhammad E. H. Chowdhury","Rusab Sarmun","Saidul Kabir","Menatalla Said","Sohaib Bassam Zoghoul","Adam Mushtak","Israa Al-Hashimi","Abdulrahman Alqahtani","Anwarul Hasan"],"pdf_url":"https://arxiv.org/pdf/2311.05197v2.pdf","comment":"24 pages, 13 figures, Submitted to Expert Systems With Applications"},{"id":"http://arxiv.org/abs/2309.16948v3","updated":"2023-12-05T08:01:39Z","published":"2023-09-29T03:24:24Z","title":"Denoising Diffusion Bridge Models","summary":" Diffusion models are powerful generative models that map noise to data using\nstochastic processes. However, for many applications such as image editing, the\nmodel input comes from a distribution that is not random noise. As such,\ndiffusion models must rely on cumbersome methods like guidance or projected\nsampling to incorporate this information in the generative process. In our\nwork, we propose Denoising Diffusion Bridge Models (DDBMs), a natural\nalternative to this paradigm based on diffusion bridges, a family of processes\nthat interpolate between two paired distributions given as endpoints. Our\nmethod learns the score of the diffusion bridge from data and maps from one\nendpoint distribution to the other by solving a (stochastic) differential\nequation based on the learned score. Our method naturally unifies several\nclasses of generative models, such as score-based diffusion models and\nOT-Flow-Matching, allowing us to adapt existing design and architectural\nchoices to our more general problem. Empirically, we apply DDBMs to challenging\nimage datasets in both pixel and latent space. 
On standard image translation\nproblems, DDBMs achieve significant improvement over baseline methods, and,\nwhen we reduce the problem to image generation by setting the source\ndistribution to random noise, DDBMs achieve comparable FID scores to\nstate-of-the-art methods despite being built for a more general task.\n","authors":["Linqi Zhou","Aaron Lou","Samar Khanna","Stefano Ermon"],"pdf_url":"https://arxiv.org/pdf/2309.16948v3.pdf","comment":"Github: https://github.com/alexzhou907/DDBM/"},{"id":"http://arxiv.org/abs/2209.13177v6","updated":"2023-12-05T07:59:42Z","published":"2022-09-27T06:29:18Z","title":"Fairness in Medical Image Analysis and Healthcare: A Literature Survey","summary":" Machine learning-enabled medical imaging analysis has become a vital part of\nthe automatic diagnosis system. However, machine learning, especially deep\nlearning models have been shown to demonstrate a systematic bias towards\ncertain subgroups of people. For instance, they yield a preferential predictive\nperformance to males over females, which is unfair and potentially harmful\nespecially in healthcare scenarios. In this literature survey, we give a\ncomprehensive review of the current progress of fairness studies in medical\nimage analysis (MedIA) and healthcare. Specifically, we first discuss the\ndefinitions of fairness, the source of unfairness and potential solutions.\nThen, we discuss current research on fairness for MedIA categorized by fairness\nevaluation and unfairness mitigation. Furthermore, we conduct extensive\nexperiments to evaluate the fairness of different medical imaging tasks.\nFinally, we discuss the challenges and future directions in developing fair\nMedIA and healthcare applications\n","authors":["Zikang Xu","Jun Li","Qingsong Yao","Han Li","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2209.13177v6.pdf","comment":"Submitted to IEEE R-BME"},{"id":"http://arxiv.org/abs/2312.02549v1","updated":"2023-12-05T07:37:21Z","published":"2023-12-05T07:37:21Z","title":"DemaFormer: Damped Exponential Moving Average Transformer with\n Energy-Based Modeling for Temporal Language Grounding","summary":" Temporal Language Grounding seeks to localize video moments that semantically\ncorrespond to a natural language query. Recent advances employ the attention\nmechanism to learn the relations between video moments and the text query.\nHowever, naive attention might not be able to appropriately capture such\nrelations, resulting in ineffective distributions where target video moments\nare difficult to separate from the remaining ones. To resolve the issue, we\npropose an energy-based model framework to explicitly learn moment-query\ndistributions. Moreover, we propose DemaFormer, a novel Transformer-based\narchitecture that utilizes exponential moving average with a learnable damping\nfactor to effectively encode moment-query inputs. Comprehensive experiments on\nfour public temporal language grounding datasets showcase the superiority of\nour methods over the state-of-the-art baselines.\n","authors":["Thong Nguyen","Xiaobao Wu","Xinshuai Dong","Cong-Duy Nguyen","See-Kiong Ng","Luu Anh Tuan"],"pdf_url":"https://arxiv.org/pdf/2312.02549v1.pdf","comment":"Accepted at EMNLP 2023 (Findings)"},{"id":"http://arxiv.org/abs/2312.02548v1","updated":"2023-12-05T07:34:30Z","published":"2023-12-05T07:34:30Z","title":"GeNIe: Generative Hard Negative Images Through Diffusion","summary":" Data augmentation is crucial in training deep models, preventing them from\noverfitting to limited data. 
Common data augmentation methods are effective,\nbut recent advancements in generative AI, such as diffusion models for image\ngeneration, enable more sophisticated augmentation techniques that produce data\nresembling natural images. We recognize that augmented samples closer to the\nideal decision boundary of a classifier are particularly effective and\nefficient in guiding the learning process. We introduce GeNIe which leverages a\ndiffusion model conditioned on a text prompt to merge contrasting data points\n(an image from the source category and a text prompt from the target category)\nto generate challenging samples for the target category. Inspired by recent\nimage editing methods, we limit the number of diffusion iterations and the\namount of noise. This ensures that the generated image retains low-level and\ncontextual features from the source image, potentially conflicting with the\ntarget category. Our extensive experiments, in few-shot and also long-tail\ndistribution settings, demonstrate the effectiveness of our novel augmentation\nmethod, especially benefiting categories with a limited number of examples.\n","authors":["Soroush Abbasi Koohpayegani","Anuj Singh","K L Navaneet","Hadi Jamali-Rad","Hamed Pirsiavash"],"pdf_url":"https://arxiv.org/pdf/2312.02548v1.pdf","comment":"Our code is available https://github.com/UCDvision/GeNIe"},{"id":"http://arxiv.org/abs/2312.02546v1","updated":"2023-12-05T07:29:14Z","published":"2023-12-05T07:29:14Z","title":"Machine Vision Therapy: Multimodal Large Language Models Can Enhance\n Visual Robustness via Denoising In-Context Learning","summary":" Although vision models such as Contrastive Language-Image Pre-Training (CLIP)\nshow impressive generalization performance, their zero-shot robustness is still\nlimited under Out-of-Distribution (OOD) scenarios without fine-tuning. Instead\nof undesirably providing human supervision as commonly done, it is possible to\ntake advantage of Multi-modal Large Language Models (MLLMs) that hold powerful\nvisual understanding abilities. However, MLLMs are shown to struggle with\nvision problems due to the incompatibility of tasks, thus hindering their\nutilization. In this paper, we propose to effectively leverage MLLMs to conduct\nMachine Vision Therapy which aims to rectify the noisy predictions from vision\nmodels. By fine-tuning with the denoised labels, the learning model performance\ncan be boosted in an unsupervised manner. To solve the incompatibility issue,\nwe propose a novel Denoising In-Context Learning (DICL) strategy to align\nvision tasks with MLLMs. Concretely, by estimating a transition matrix that\ncaptures the probability of one class being confused with another, an\ninstruction containing a correct exemplar and an erroneous one from the most\nprobable noisy class can be constructed. Such an instruction can help any MLLMs\nwith ICL ability to detect and rectify incorrect predictions of vision models.\nThrough extensive experiments on ImageNet, WILDS, DomainBed, and other OOD\ndatasets, we carefully validate the quantitative and qualitative effectiveness\nof our method. 
Our code is available at\nhttps://github.com/tmllab/Machine_Vision_Therapy.\n","authors":["Zhuo Huang","Chang Liu","Yinpeng Dong","Hang Su","Shibao Zheng","Tongliang Liu"],"pdf_url":"https://arxiv.org/pdf/2312.02546v1.pdf","comment":"37 pages, 19 figures, and 13 tables"},{"id":"http://arxiv.org/abs/2312.02545v1","updated":"2023-12-05T07:23:22Z","published":"2023-12-05T07:23:22Z","title":"Graph Information Bottleneck for Remote Sensing Segmentation","summary":" Remote sensing segmentation has a wide range of applications in environmental\nprotection, and urban change detection, etc. Despite the success of deep\nlearning-based remote sensing segmentation methods (e.g., CNN and Transformer),\nthey are not flexible enough to model irregular objects. In addition, existing\ngraph contrastive learning methods usually adopt the way of maximizing mutual\ninformation to keep the node representations consistent between different graph\nviews, which may cause the model to learn task-independent redundant\ninformation. To tackle the above problems, this paper treats images as graph\nstructures and introduces a simple contrastive vision GNN (SC-ViG) architecture\nfor remote sensing segmentation. Specifically, we construct a node-masked and\nedge-masked graph view to obtain an optimal graph structure representation,\nwhich can adaptively learn whether to mask nodes and edges. Furthermore, this\npaper innovatively introduces information bottleneck theory into graph\ncontrastive learning to maximize task-related information while minimizing\ntask-independent redundant information. Finally, we replace the convolutional\nmodule in UNet with the SC-ViG module to complete the segmentation and\nclassification tasks of remote sensing images. Extensive experiments on\npublicly available real datasets demonstrate that our method outperforms\nstate-of-the-art remote sensing image segmentation methods.\n","authors":["Yuntao Shou","Wei Ai","Tao Meng"],"pdf_url":"https://arxiv.org/pdf/2312.02545v1.pdf","comment":"13 pages, 6 figures"},{"id":"http://arxiv.org/abs/2312.02541v1","updated":"2023-12-05T07:12:05Z","published":"2023-12-05T07:12:05Z","title":"Explainable Severity ranking via pairwise n-hidden comparison: a case\n study of glaucoma","summary":" Primary open-angle glaucoma (POAG) is a chronic and progressive optic nerve\ncondition that results in an acquired loss of optic nerve fibers and potential\nblindness. The gradual onset of glaucoma results in patients progressively\nlosing their vision without being consciously aware of the changes. To diagnose\nPOAG and determine its severity, patients must undergo a comprehensive dilated\neye examination. In this work, we build a framework to rank, compare, and\ninterpret the severity of glaucoma using fundus images. We introduce a\nsiamese-based severity ranking using pairwise n-hidden comparisons. We\nadditionally have a novel approach to explaining why a specific image is deemed\nmore severe than others. Our findings indicate that the proposed severity\nranking model surpasses traditional ones in terms of diagnostic accuracy and\ndelivers improved saliency explanations.\n","authors":["Hong Nguyen","Cuong V. Nguyen","Shrikanth Narayanan","Benjamin Y. 
Xu","Michael Pazzani"],"pdf_url":"https://arxiv.org/pdf/2312.02541v1.pdf","comment":"4 pages"},{"id":"http://arxiv.org/abs/2312.02535v1","updated":"2023-12-05T06:49:15Z","published":"2023-12-05T06:49:15Z","title":"Towards Open-set Gesture Recognition via Feature Activation Enhancement\n and Orthogonal Prototype Learning","summary":" Gesture recognition is a foundational task in human-machine interaction\n(HMI). While there has been significant progress in gesture recognition based\non surface electromyography (sEMG), accurate recognition of predefined gestures\nonly within a closed set is still inadequate in practice. It is essential to\neffectively discern and reject unknown gestures of disinterest in a robust\nsystem. Numerous methods based on prototype learning (PL) have been proposed to\ntackle this open set recognition (OSR) problem. However, they do not fully\nexplore the inherent distinctions between known and unknown classes. In this\npaper, we propose a more effective PL method leveraging two novel and inherent\ndistinctions, feature activation level and projection inconsistency.\nSpecifically, the Feature Activation Enhancement Mechanism (FAEM) widens the\ngap in feature activation values between known and unknown classes.\nFurthermore, we introduce Orthogonal Prototype Learning (OPL) to construct\nmultiple perspectives. OPL acts to project a sample from orthogonal directions\nto maximize the distinction between its two projections, where unknown samples\nwill be projected near the clusters of different known classes while known\nsamples still maintain intra-class similarity. Our proposed method\nsimultaneously achieves accurate closed-set classification for predefined\ngestures and effective rejection for unknown gestures. Extensive experiments\ndemonstrate its efficacy and superiority in open-set gesture recognition based\non sEMG.\n","authors":["Chen Liu","Can Han","Chengfeng Zhou","Crystal Cai","Suncheng Xiang","Hualiang Ni","Dahong Qian"],"pdf_url":"https://arxiv.org/pdf/2312.02535v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.06470v4","updated":"2023-12-05T06:43:34Z","published":"2023-03-29T15:26:44Z","title":"Qualitative Failures of Image Generation Models and Their Application in\n Detecting Deepfakes","summary":" The ability of image and video generation models to create photorealistic\nimages has reached unprecedented heights, making it difficult to distinguish\nbetween real and fake images in many cases. However, despite this progress, a\ngap remains between the quality of generated images and those found in the real\nworld. To address this, we have reviewed a vast body of literature from both\nacademic publications and social media to identify qualitative shortcomings in\nimage generation models, which we have classified into five categories. By\nunderstanding these failures, we can identify areas where these models need\nimprovement, as well as develop strategies for detecting deep fakes. 
The\nprevalence of deep fakes in today's society is a serious concern, and our\nfindings can help mitigate their negative impact.\n","authors":["Ali Borji"],"pdf_url":"https://arxiv.org/pdf/2304.06470v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02528v1","updated":"2023-12-05T06:18:38Z","published":"2023-12-05T06:18:38Z","title":"Towards Automatic Power Battery Detection: New Challenge, Benchmark\n Dataset and Baseline","summary":" We conduct a comprehensive study on a new task named power battery detection\n(PBD), which aims to localize the dense cathode and anode plates endpoints from\nX-ray images to evaluate the quality of power batteries. Existing manufacturers\nusually rely on human eye observation to complete PBD, which makes it difficult\nto balance the accuracy and efficiency of detection. To address this issue and\ndrive more attention into this meaningful task, we first elaborately collect a\ndataset, called X-ray PBD, which has $1,500$ diverse X-ray images selected from\nthousands of power batteries of $5$ manufacturers, with $7$ different visual\ninterference. Then, we propose a novel segmentation-based solution for PBD,\ntermed multi-dimensional collaborative network (MDCNet). With the help of line\nand counting predictors, the representation of the point segmentation branch\ncan be improved at both semantic and detail aspects. Besides, we design an\neffective distance-adaptive mask generation strategy, which can alleviate the\nvisual challenge caused by the inconsistent distribution density of plates to\nprovide MDCNet with stable supervision. Without any bells and whistles, our\nsegmentation-based MDCNet consistently outperforms various other corner\ndetection, crowd counting and general/tiny object detection-based solutions,\nmaking it a strong baseline that can help facilitate future research in PBD.\nFinally, we share some potential difficulties and works for future researches.\nThe source code and datasets will be publicly available at\n\\href{http://www.gy3000.company/x3000%e5%bc%80%e6%94%be%e5%b9%b3%e5%8f%b0}{X-ray\nPBD}.\n","authors":["Xiaoqi Zhao","Youwei Pang","Zhenyu Chen","Qian Yu","Lihe Zhang","Hanqi Liu","Jiaming Zuo","Huchuan Lu"],"pdf_url":"https://arxiv.org/pdf/2312.02528v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2208.06561v2","updated":"2023-12-05T06:18:10Z","published":"2022-08-13T03:25:50Z","title":"Finding Point with Image: A Simple and Efficient Method for UAV\n Self-Localization","summary":" Image retrieval has emerged as a prominent solution for the self-localization\ntask of unmanned aerial vehicles (UAVs). However, this approach involves\ncomplicated pre-processing and post-processing operations, placing significant\ndemands on both computational and storage resources. To mitigate this issue,\nthis paper presents an end-to-end positioning framework, namely Finding Point\nwith Image (FPI), which aims to directly identify the corresponding location of\na UAV in satellite-view images via a UAV-view image. To validate the\npracticality of our framework, we construct a paired dataset, namely UL14, that\nconsists of UAV and satellite views. In addition, we establish two\ntransformer-based baseline models, Post Fusion and Mix Fusion, for end-to-end\ntraining and inference. Through experiments, we can conclude that fusion in the\nbackbone network can achieve better performance than later fusion. 
Furthermore,\nconsidering the singleness of paired images, Random Scale Crop (RSC) is\nproposed to enrich the diversity of the paired data. Also, the ratio and weight\nof positive and negative samples play a key role in model convergence.\nTherefore, we conducted experimental verification and proposed a Weight Balance\nLoss (WBL) to weigh the impact of positive and negative samples. Last, our\nproposed baseline based on Mix Fusion structure exhibits superior performance\nin time and storage efficiency, amounting to just 1/24 and 1/68, respectively,\nwhile delivering comparable or even superior performance compared to the image\nretrieval method. The dataset and code will be made publicly available.\n","authors":["Ming Dai","Enhui Zheng","Zhenhua Feng","Jiahao Chen","Wankou Yang"],"pdf_url":"https://arxiv.org/pdf/2208.06561v2.pdf","comment":"15 pages, 14 figures"},{"id":"http://arxiv.org/abs/2312.02521v1","updated":"2023-12-05T06:04:16Z","published":"2023-12-05T06:04:16Z","title":"Retrieving Conditions from Reference Images for Diffusion Models","summary":" Recent diffusion-based subject driven generative methods have enabled image\ngenerations with good fidelity for specific objects or human portraits.\nHowever, to achieve better versatility for applications, we argue that not only\nimproved datasets and evaluations are desired, but also more careful methods to\nretrieve only relevant information from conditional images are anticipated. To\nthis end, we propose an anime figures dataset RetriBooru-V1, with enhanced\nidentity and clothing labels. We state new tasks enabled by this dataset, and\nintroduce a new diversity metric to measure success in completing these tasks,\nquantifying the flexibility of image generations. We establish an RAG-inspired\nbaseline method, designed to retrieve precise conditional information from\nreference images. Then, we compare with current methods on existing task to\ndemonstrate the capability of the proposed method. Finally, we provide baseline\nexperiment results on new tasks, and conduct ablation studies on the possible\nstructural choices.\n","authors":["Haoran Tang","Xin Zhou","Jieren Deng","Zhihong Pan","Hao Tian","Pratik Chaudhari"],"pdf_url":"https://arxiv.org/pdf/2312.02521v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02520v1","updated":"2023-12-05T06:02:21Z","published":"2023-12-05T06:02:21Z","title":"Towards More Unified In-context Visual Understanding","summary":" The rapid advancement of large language models (LLMs) has accelerated the\nemergence of in-context learning (ICL) as a cutting-edge approach in the\nnatural language processing domain. Recently, ICL has been employed in visual\nunderstanding tasks, such as semantic segmentation and image captioning,\nyielding promising results. However, existing visual ICL framework can not\nenable producing content across multiple modalities, which limits their\npotential usage scenarios. To address this issue, we present a new ICL\nframework for visual understanding with multi-modal output enabled. First, we\nquantize and embed both text and visual prompt into a unified representational\nspace, structured as interleaved in-context sequences. Then a decoder-only\nsparse transformer architecture is employed to perform generative modeling on\nthem, facilitating in-context learning. Thanks to this design, the model is\ncapable of handling in-context vision understanding tasks with multimodal\noutput in a unified pipeline. 
Experimental results demonstrate that our model\nachieves competitive performance compared with specialized models and previous\nICL baselines. Overall, our research takes a further step toward unified\nmultimodal in-context learning.\n","authors":["Dianmo Sheng","Dongdong Chen","Zhentao Tan","Qiankun Liu","Qi Chu","Jianmin Bao","Tao Gong","Bin Liu","Shengwei Xu","Nenghai Yu"],"pdf_url":"https://arxiv.org/pdf/2312.02520v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01697v2","updated":"2023-12-05T05:37:25Z","published":"2023-12-04T07:36:04Z","title":"Hulk: A Universal Knowledge Translator for Human-Centric Tasks","summary":" Human-centric perception tasks, e.g., human mesh recovery, pedestrian\ndetection, skeleton-based action recognition, and pose estimation, have wide\nindustrial applications, such as metaverse and sports analysis. There is a\nrecent surge to develop human-centric foundation models that can benefit a\nbroad range of human-centric perception tasks. While many human-centric\nfoundation models have achieved success, most of them only excel in 2D vision\ntasks or require extensive fine-tuning for practical deployment in real-world\nscenarios. These limitations severely restrict their usability across various\ndownstream tasks and situations. To tackle these problems, we present Hulk, the\nfirst multimodal human-centric generalist model, capable of addressing most of\nthe mainstream tasks simultaneously without task-specific finetuning, covering\n2D vision, 3D vision, skeleton-based, and vision-language tasks. The key to\nachieving this is condensing various task-specific heads into two general\nheads, one for discrete representations, e.g., languages, and the other for\ncontinuous representations, e.g., location coordinates. The outputs of two\nheads can be further stacked into four distinct input and output modalities.\nThis uniform representation enables Hulk to treat human-centric tasks as\nmodality translation, integrating knowledge across a wide range of tasks. To\nvalidate the effectiveness of our proposed method, we conduct comprehensive\nexperiments on 11 benchmarks across 8 human-centric tasks. Experimental results\nsurpass previous methods substantially, demonstrating the superiority of our\nproposed method. The code will be available on\nhttps://github.com/OpenGVLab/HumanBench.\n","authors":["Yizhou Wang","Yixuan Wu","Shixiang Tang","Weizhen He","Xun Guo","Feng Zhu","Lei Bai","Rui Zhao","Jian Wu","Tong He","Wanli Ouyang"],"pdf_url":"https://arxiv.org/pdf/2312.01697v2.pdf","comment":"24 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.02512v1","updated":"2023-12-05T05:36:44Z","published":"2023-12-05T05:36:44Z","title":"AV2AV: Direct Audio-Visual Speech to Audio-Visual Speech Translation\n with Unified Audio-Visual Speech Representation","summary":" This paper proposes a novel direct Audio-Visual Speech to Audio-Visual Speech\nTranslation (AV2AV) framework, where the input and output of the system are\nmultimodal (i.e., audio and visual speech). With the proposed AV2AV, two key\nadvantages can be brought: 1) We can perform real-like conversations with\nindividuals worldwide in a virtual meeting by utilizing our own primary\nlanguages. In contrast to Speech-to-Speech Translation (A2A), which solely\ntranslates between audio modalities, the proposed AV2AV directly translates\nbetween audio-visual speech. This capability enhances the dialogue experience\nby presenting synchronized lip movements along with the translated speech. 
2)\nWe can improve the robustness of the spoken language translation system. By\nemploying the complementary information of audio-visual speech, the system can\neffectively translate spoken language even in the presence of acoustic noise,\nshowcasing robust performance. To mitigate the problem of the absence of a\nparallel AV2AV translation dataset, we propose to train our spoken language\ntranslation system with the audio-only dataset of A2A. This is done by learning\nunified audio-visual speech representations through self-supervised learning in\nadvance to train the translation system. Moreover, we propose an AV-Renderer\nthat can generate raw audio and video in parallel. It is designed with\nzero-shot speaker modeling, thus the speaker in source audio-visual speech can\nbe maintained at the target translated audio-visual speech. The effectiveness\nof AV2AV is evaluated with extensive experiments in a many-to-many language\ntranslation setting. The demo page is available on\nhttps://choijeongsoo.github.io/av2av.\n","authors":["Jeongsoo Choi","Se Jin Park","Minsu Kim","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2312.02512v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.16894v4","updated":"2023-12-05T05:34:18Z","published":"2023-03-29T17:59:10Z","title":"ViewRefer: Grasp the Multi-view Knowledge for 3D Visual Grounding with\n GPT and Prototype Guidance","summary":" Understanding 3D scenes from multi-view inputs has been proven to alleviate\nthe view discrepancy issue in 3D visual grounding. However, existing methods\nnormally neglect the view cues embedded in the text modality and fail to weigh\nthe relative importance of different views. In this paper, we propose\nViewRefer, a multi-view framework for 3D visual grounding exploring how to\ngrasp the view knowledge from both text and 3D modalities. For the text branch,\nViewRefer leverages the diverse linguistic knowledge of large-scale language\nmodels, e.g., GPT, to expand a single grounding text to multiple\ngeometry-consistent descriptions. Meanwhile, in the 3D modality, a transformer\nfusion module with inter-view attention is introduced to boost the interaction\nof objects across views. On top of that, we further present a set of learnable\nmulti-view prototypes, which memorize scene-agnostic knowledge for different\nviews, and enhance the framework from two perspectives: a view-guided attention\nmodule for more robust text features, and a view-guided scoring strategy during\nthe final prediction. With our designed paradigm, ViewRefer achieves superior\nperformance on three benchmarks and surpasses the second-best by +2.8%, +1.5%,\nand +1.35% on Sr3D, Nr3D, and ScanRefer. Code is released at\nhttps://github.com/Ivan-Tang-3D/ViewRefer3D.\n","authors":["Zoey Guo","Yiwen Tang","Ray Zhang","Dong Wang","Zhigang Wang","Bin Zhao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2303.16894v4.pdf","comment":"Accepted by ICCV 2023. Code is released at\n https://github.com/Ivan-Tang-3D/ViewRefer3D"},{"id":"http://arxiv.org/abs/2310.01415v3","updated":"2023-12-05T05:26:29Z","published":"2023-10-02T17:59:57Z","title":"GPT-Driver: Learning to Drive with GPT","summary":" We present a simple yet effective approach that can transform the OpenAI\nGPT-3.5 model into a reliable motion planner for autonomous vehicles. Motion\nplanning is a core challenge in autonomous driving, aiming to plan a driving\ntrajectory that is safe and comfortable. 
Existing motion planners predominantly\nleverage heuristic methods to forecast driving trajectories, yet these\napproaches demonstrate insufficient generalization capabilities in the face of\nnovel and unseen driving scenarios. In this paper, we propose a novel approach\nto motion planning that capitalizes on the strong reasoning capabilities and\ngeneralization potential inherent to Large Language Models (LLMs). The\nfundamental insight of our approach is the reformulation of motion planning as\na language modeling problem, a perspective not previously explored.\nSpecifically, we represent the planner inputs and outputs as language tokens,\nand leverage the LLM to generate driving trajectories through a language\ndescription of coordinate positions. Furthermore, we propose a novel\nprompting-reasoning-finetuning strategy to stimulate the numerical reasoning\npotential of the LLM. With this strategy, the LLM can describe highly precise\ntrajectory coordinates and also its internal decision-making process in natural\nlanguage. We evaluate our approach on the large-scale nuScenes dataset, and\nextensive experiments substantiate the effectiveness, generalization ability,\nand interpretability of our GPT-based motion planner. Code is now available at\nhttps://github.com/PointsCoder/GPT-Driver.\n","authors":["Jiageng Mao","Yuxi Qian","Junjie Ye","Hang Zhao","Yue Wang"],"pdf_url":"https://arxiv.org/pdf/2310.01415v3.pdf","comment":"NeurIPS 2023 Foundation Models for Decision Making Workshop"},{"id":"http://arxiv.org/abs/2310.04406v2","updated":"2023-12-05T05:25:55Z","published":"2023-10-06T17:55:11Z","title":"Language Agent Tree Search Unifies Reasoning Acting and Planning in\n Language Models","summary":" While large language models (LLMs) have demonstrated impressive performance\non a range of decision-making tasks, they rely on simple acting processes and\nfall short of broad deployment as autonomous agents. We introduce LATS\n(Language Agent Tree Search), a general framework that synergizes the\ncapabilities of LLMs in planning, acting, and reasoning. Drawing inspiration\nfrom Monte Carlo tree search in model-based reinforcement learning, LATS\nemploys LLMs as agents, value functions, and optimizers, repurposing their\nlatent strengths for enhanced decision-making. What is crucial in this method\nis the use of an environment for external feedback, which offers a more\ndeliberate and adaptive problem-solving mechanism that moves beyond the\nlimitations of existing techniques. Our experimental evaluation across diverse\ndomains, such as programming, HotPotQA, and WebShop, illustrates the\napplicability of LATS for both reasoning and acting. 
In particular, LATS\nachieves 94.4% for programming on HumanEval with GPT-4 and an average score of\n75.9 for web browsing on WebShop with GPT-3.5, demonstrating the effectiveness\nand generality of our method.\n","authors":["Andy Zhou","Kai Yan","Michal Shlapentokh-Rothman","Haohan Wang","Yu-Xiong Wang"],"pdf_url":"https://arxiv.org/pdf/2310.04406v2.pdf","comment":"Website and code can be found at\n https://andyz245.github.io/LanguageAgentTreeSearch"},{"id":"http://arxiv.org/abs/2312.02503v1","updated":"2023-12-05T05:13:20Z","published":"2023-12-05T05:13:20Z","title":"SAVE: Protagonist Diversification with Structure Agnostic Video Editing","summary":" Driven by the upsurge progress in text-to-image (T2I) generation models,\ntext-to-video (T2V) generation has experienced a significant advance as well.\nAccordingly, tasks such as modifying the object or changing the style in a\nvideo have been possible. However, previous works usually work well on trivial\nand consistent shapes, and easily collapse on a difficult target that has a\nlargely different body shape from the original one. In this paper, we spot the\nbias problem in the existing video editing method that restricts the range of\nchoices for the new protagonist and attempt to address this issue using the\nconventional image-level personalization method. We adopt motion\npersonalization that isolates the motion from a single source video and then\nmodifies the protagonist accordingly. To deal with the natural discrepancy\nbetween image and video, we propose a motion word with an inflated textual\nembedding to properly represent the motion in a source video. We also regulate\nthe motion word to attend to proper motion-related areas by introducing a novel\npseudo optical flow, efficiently computed from the pre-calculated attention\nmaps. Finally, we decouple the motion from the appearance of the source video\nwith an additional pseudo word. Extensive experiments demonstrate the editing\ncapability of our method, taking a step toward more diverse and extensive video\nediting.\n","authors":["Yeji Song","Wonsik Shin","Junsoo Lee","Jeesoo Kim","Nojun Kwak"],"pdf_url":"https://arxiv.org/pdf/2312.02503v1.pdf","comment":"Project website: https://ldynx.github.io/SAVE/"},{"id":"http://arxiv.org/abs/2312.02501v1","updated":"2023-12-05T05:08:08Z","published":"2023-12-05T05:08:08Z","title":"Inspecting Model Fairness in Ultrasound Segmentation Tasks","summary":" With the rapid expansion of machine learning and deep learning (DL),\nresearchers are increasingly employing learning-based algorithms to alleviate\ndiagnostic challenges across diverse medical tasks and applications. While\nadvancements in diagnostic precision are notable, some researchers have\nidentified a concerning trend: their models exhibit biased performance across\nsubgroups characterized by different sensitive attributes. This bias not only\ninfringes upon the rights of patients but also has the potential to lead to\nlife-altering consequences. In this paper, we inspect a series of DL\nsegmentation models using two ultrasound datasets, aiming to assess the\npresence of model unfairness in these specific tasks. Our findings reveal that\neven state-of-the-art DL algorithms demonstrate unfair behavior in ultrasound\nsegmentation tasks. These results serve as a crucial warning, underscoring the\nnecessity for careful model evaluation before their deployment in real-world\nscenarios. 
Such assessments are imperative to ensure ethical considerations and\nmitigate the risk of adverse impacts on patient outcomes.\n","authors":["Zikang Xu","Fenghe Tang","Quan Quan","Jianrui Ding","Chunping Ning","S. Kevin Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.02501v1.pdf","comment":"Submitted to ISBI 2024"},{"id":"http://arxiv.org/abs/2310.17951v2","updated":"2023-12-05T05:00:20Z","published":"2023-10-27T07:48:36Z","title":"Understanding Parameter Saliency via Extreme Value Theory","summary":" Deep neural networks are being increasingly implemented throughout society in\nrecent years. It is useful to identify which parameters trigger\nmisclassification in diagnosing undesirable model behaviors. The concept of\nparameter saliency is proposed and used to diagnose convolutional neural\nnetworks (CNNs) by ranking convolution filters that may have caused\nmisclassification on the basis of parameter saliency. It is also shown that\nfine-tuning the top ranking salient filters efficiently corrects\nmisidentification on ImageNet. However, there is still a knowledge gap in terms\nof understanding why parameter saliency ranking can find the filters inducing\nmisidentification. In this work, we attempt to bridge the gap by analyzing\nparameter saliency ranking from a statistical viewpoint, namely, extreme value\ntheory. We first show that the existing work implicitly assumes that the\ngradient norm computed for each filter follows a normal distribution. Then, we\nclarify the relationship between parameter saliency and the score based on the\npeaks-over-threshold (POT) method, which is often used to model extreme values.\nFinally, we reformulate parameter saliency in terms of the POT method, where\nthis reformulation is regarded as statistical anomaly detection and does not\nrequire the implicit assumptions of the existing parameter-saliency\nformulation. Our experimental results demonstrate that our reformulation can\ndetect malicious filters as well. Furthermore, we show that the existing\nparameter saliency method exhibits a bias against the depth of layers in deep\nneural networks. In particular, this bias has the potential to inhibit the\ndiscovery of filters that cause misidentification in situations where domain\nshift occurs. In contrast, parameter saliency based on POT shows less of this\nbias.\n","authors":["Shuo Wang","Issei Sato"],"pdf_url":"https://arxiv.org/pdf/2310.17951v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02494v1","updated":"2023-12-05T04:51:42Z","published":"2023-12-05T04:51:42Z","title":"ReconU-Net: a direct PET image reconstruction using U-Net architecture\n with back projection-induced skip connection","summary":" [Objective] This study aims to introduce a novel back projection-induced\nU-Net-shaped architecture, called ReconU-Net, for deep learning-based direct\npositron emission tomography (PET) image reconstruction. Additionally, our\nobjective is to analyze the behavior of direct PET image reconstruction and\ngain deeper insights by comparing the proposed ReconU-Net architecture with\nother encoder-decoder architectures without skip connections. [Approach] The\nproposed ReconU-Net architecture uniquely integrates the physical model of the\nback projection operation into the skip connection. This distinctive feature\nfacilitates the effective transfer of intrinsic spatial information from the\ninput sinogram to the reconstructed image via an embedded physical model. 
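The parameter-saliency entry above ranks convolution filters by gradient-based scores and reinterprets the ranking through the peaks-over-threshold (POT) view from extreme value theory. The sketch below shows only the basic ingredients, per-filter gradient norms and a high-quantile exceedance threshold; the paper's actual saliency score and generalized Pareto fitting are more involved, and the 95% quantile here is an assumption for illustration.

```python
import numpy as np

def filter_grad_norms(weight_grad):
    """L2 norm of the gradient for each output filter of a conv layer.

    weight_grad has shape (out_channels, in_channels, kH, kW).
    """
    flat = weight_grad.reshape(weight_grad.shape[0], -1)
    return np.linalg.norm(flat, axis=1)

def exceedances(norms, quantile=0.95):
    """Filters whose gradient norm exceeds a high threshold (POT-style)."""
    u = np.quantile(norms, quantile)
    idx = np.where(norms > u)[0]
    return u, idx, norms[idx] - u   # threshold, flagged filters, excess values

rng = np.random.default_rng(1)
grad = rng.normal(size=(64, 3, 3, 3))        # stand-in gradient of a conv layer
norms = filter_grad_norms(grad)
u, idx, excess = exceedances(norms)
print(f"threshold={u:.3f}, flagged filters={idx.tolist()}")
```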
The\nproposed ReconU-Net was trained using Monte Carlo simulation data from the\nBrainweb phantom and tested on both simulated and real Hoffman brain phantom\ndata. [Main results] The proposed ReconU-Net method generated a reconstructed\nimage with a more accurate structure compared to other deep learning-based\ndirect reconstruction methods. Further analysis showed that the proposed\nReconU-Net architecture has the ability to transfer features of multiple\nresolutions, especially non-abstract high-resolution information, through skip\nconnections. Despite limited training on simulated data, the proposed\nReconU-Net successfully reconstructed the real Hoffman brain phantom, unlike\nother deep learning-based direct reconstruction methods, which failed to\nproduce a reconstructed image. [Significance] The proposed ReconU-Net can\nimprove the fidelity of direct PET image reconstruction, even when dealing with\nsmall training datasets, by leveraging the synergistic relationship between\ndata-driven modeling and the physics model of the imaging process.\n","authors":["Fumio Hashimoto","Kibo Ote"],"pdf_url":"https://arxiv.org/pdf/2312.02494v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2312.02493v1","updated":"2023-12-05T04:51:19Z","published":"2023-12-05T04:51:19Z","title":"Flexible Communication for Optimal Distributed Learning over\n Unpredictable Networks","summary":" Gradient compression alleviates expensive communication in distributed deep\nlearning by sending fewer values and its corresponding indices, typically via\nAllgather (AG). Training with high compression ratio (CR) achieves high\naccuracy like DenseSGD, but has lower parallel scaling due to high\ncommunication cost (i.e., parallel efficiency). Using lower CRs improves\nparallel efficiency by lowering synchronization cost, but degrades model\naccuracy as well (statistical efficiency). Further, speedup attained with\ndifferent models and CRs also varies with network latency, effective bandwidth\nand collective op used for aggregation. In many cases, collectives like\nAllreduce (AR) have lower cost than AG to exchange the same amount of data. In\nthis paper, we propose an AR-compatible Topk compressor that is\nbandwidth-optimal and thus performs better than AG in certain network\nconfigurations. We develop a flexible communication strategy that switches\nbetween AG and AR based on which collective is optimal in the current settings,\nand model the pareto-relationship between parallel and statistical efficiency\nas a multi-objective optimization (MOO) problem to dynamically adjust CR and\naccelerate training while still converging to high accuracy.\n","authors":["Sahil Tyagi","Martin Swany"],"pdf_url":"https://arxiv.org/pdf/2312.02493v1.pdf","comment":"2023 IEEE International Conference on Big Data (BigData)"},{"id":"http://arxiv.org/abs/2312.02483v1","updated":"2023-12-05T04:15:56Z","published":"2023-12-05T04:15:56Z","title":"EtC: Temporal Boundary Expand then Clarify for Weakly Supervised Video\n Grounding with Multimodal Large Language Model","summary":" Early weakly supervised video grounding (WSVG) methods often struggle with\nincomplete boundary detection due to the absence of temporal boundary\nannotations. To bridge the gap between video-level and boundary-level\nannotation, explicit-supervision methods, i.e., generating pseudo-temporal\nboundaries for training, have achieved great success. 
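The flexible-communication entry above exchanges Top-k compressed gradients with either Allgather or Allreduce depending on which collective is cheaper. Below is a single-process sketch of the two layouts that make that switch possible, sparse values plus indices (the Allgather payload) and a dense masked tensor (which an Allreduce sum can aggregate directly); the dynamic switching logic and compression-ratio scheduling from the paper are not modeled.

```python
import numpy as np

def topk_sparse(grad, k):
    """Return the k largest-magnitude gradient values and their flat indices."""
    flat = grad.ravel()
    idx = np.argpartition(np.abs(flat), -k)[-k:]
    return flat[idx], idx

def topk_dense(grad, k):
    """Dense tensor with all but the top-k entries zeroed out
    (a layout that an Allreduce sum can aggregate without index exchange)."""
    values, idx = topk_sparse(grad, k)
    dense = np.zeros_like(grad).ravel()
    dense[idx] = values
    return dense.reshape(grad.shape)

grad = np.random.default_rng(2).normal(size=(4, 4))
print(topk_dense(grad, k=3))
```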
However, data\naugmentations in these methods might disrupt critical temporal information,\nyielding poor pseudo boundaries. In this paper, we propose a new perspective\nthat maintains the integrity of the original temporal content while introducing\nmore valuable information for expanding the incomplete boundaries. To this end,\nwe propose EtC (Expand then Clarify), first use the additional information to\nexpand the initial incomplete pseudo boundaries, and subsequently refine these\nexpanded ones to achieve precise boundaries. Motivated by video continuity,\ni.e., visual similarity across adjacent frames, we use powerful multimodal\nlarge language models (MLLMs) to annotate each frame within initial pseudo\nboundaries, yielding more comprehensive descriptions for expanded boundaries.\nTo further clarify the noise of expanded boundaries, we combine mutual learning\nwith a tailored proposal-level contrastive objective to use a learnable\napproach to harmonize a balance between incomplete yet clean (initial) and\ncomprehensive yet noisy (expanded) boundaries for more precise ones.\nExperiments demonstrate the superiority of our method on two challenging WSVG\ndatasets.\n","authors":["Guozhang Li","Xinpeng Ding","De Cheng","Jie Li","Nannan Wang","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2312.02483v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02481v1","updated":"2023-12-05T04:15:22Z","published":"2023-12-05T04:15:22Z","title":"Learning to Holistically Detect Bridges from Large-Size VHR Remote\n Sensing Imagery","summary":" Bridge detection in remote sensing images (RSIs) plays a crucial role in\nvarious applications, but it poses unique challenges compared to the detection\nof other objects. In RSIs, bridges exhibit considerable variations in terms of\ntheir spatial scales and aspect ratios. Therefore, to ensure the visibility and\nintegrity of bridges, it is essential to perform holistic bridge detection in\nlarge-size very-high-resolution (VHR) RSIs. However, the lack of datasets with\nlarge-size VHR RSIs limits the deep learning algorithms' performance on bridge\ndetection. Due to the limitation of GPU memory in tackling large-size images,\ndeep learning-based object detection methods commonly adopt the cropping\nstrategy, which inevitably results in label fragmentation and discontinuous\nprediction. To ameliorate the scarcity of datasets, this paper proposes a\nlarge-scale dataset named GLH-Bridge comprising 6,000 VHR RSIs sampled from\ndiverse geographic locations across the globe. These images encompass a wide\nrange of sizes, varying from 2,048*2,048 to 16,384*16,384 pixels, and\ncollectively feature 59,737 bridges. Furthermore, we present an efficient\nnetwork for holistic bridge detection (HBD-Net) in large-size RSIs. The HBD-Net\npresents a separate detector-based feature fusion (SDFF) architecture and is\noptimized via a shape-sensitive sample re-weighting (SSRW) strategy. Based on\nthe proposed GLH-Bridge dataset, we establish a bridge detection benchmark\nincluding the OBB and HBB tasks, and validate the effectiveness of the proposed\nHBD-Net. 
Additionally, cross-dataset generalization experiments on two publicly\navailable datasets illustrate the strong generalization capability of the\nGLH-Bridge dataset.\n","authors":["Yansheng Li","Junwei Luo","Yongjun Zhang","Yihua Tan","Jin-Gang Yu","Song Bai"],"pdf_url":"https://arxiv.org/pdf/2312.02481v1.pdf","comment":"16 pages, 11 figures, 6 tables; due to the limitation \"The abstract\n field cannot be longer than 1,920 characters\", the abstract appearing here is\n slightly shorter than that in the PDF file"},{"id":"http://arxiv.org/abs/2312.01239v2","updated":"2023-12-05T04:14:18Z","published":"2023-12-02T22:25:24Z","title":"Motion Informed Needle Segmentation in Ultrasound Images","summary":" Segmenting a moving needle in ultrasound images is challenging due to the\npresence of artifacts, noise, and needle occlusion. This task becomes even more\ndemanding in scenarios where data availability is limited. Convolutional Neural\nNetworks (CNNs) have been successful in many computer vision applications, but\nstruggle to accurately segment needles without considering their motion. In\nthis paper, we present a novel approach for needle segmentation that combines\nclassical Kalman Filter (KF) techniques with data-driven learning,\nincorporating both needle features and needle motion. Our method offers two key\ncontributions. First, we propose a compatible framework that seamlessly\nintegrates into commonly used encoder-decoder style architectures. Second, we\ndemonstrate superior performance compared to recent state-of-the-art needle\nsegmentation models using our novel convolutional neural network (CNN) based\nKF-inspired block, achieving a 15\\% reduction in pixel-wise needle tip error\nand an 8\\% reduction in length error. Third, to our knowledge we are the first\nto implement a learnable filter to incorporate non-linear needle motion for\nimproving needle segmentation.\n","authors":["Raghavv Goel","Cecilia Morales","Manpreet Singh","Artur Dubrawski","John Galeotti","Howie Choset"],"pdf_url":"https://arxiv.org/pdf/2312.01239v2.pdf","comment":"7 pages, 4 figures, under review"},{"id":"http://arxiv.org/abs/2312.02480v1","updated":"2023-12-05T04:13:31Z","published":"2023-12-05T04:13:31Z","title":"Differentiable Point-based Inverse Rendering","summary":" We present differentiable point-based inverse rendering, DPIR, an\nanalysis-by-synthesis method that processes images captured under diverse\nilluminations to estimate shape and spatially-varying BRDF. To this end, we\nadopt point-based rendering, eliminating the need for multiple samplings per\nray, typical of volumetric rendering, thus significantly enhancing the speed of\ninverse rendering. To realize this idea, we devise a hybrid point-volumetric\nrepresentation for geometry and a regularized basis-BRDF representation for\nreflectance. The hybrid geometric representation enables fast rendering through\npoint-based splatting while retaining the geometric details and stability\ninherent to SDF-based representations. The regularized basis-BRDF mitigates the\nill-posedness of inverse rendering stemming from limited light-view angular\nsamples. We also propose an efficient shadow detection method using point-based\nshadow map rendering. Our extensive evaluations demonstrate that DPIR\noutperforms prior works in terms of reconstruction accuracy, computational\nefficiency, and memory footprint. Furthermore, our explicit point-based\nrepresentation and rendering enables intuitive geometry and reflectance\nediting. 
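The needle-segmentation entry above pairs a CNN with a Kalman-filter-inspired block so that needle motion informs segmentation. Below is a minimal classical constant-velocity Kalman filter over a 2D tip position, assuming noisy per-frame tip measurements; the paper's learnable, CNN-based filtering block is naturally richer than this textbook form, and the noise parameters here are arbitrary.

```python
import numpy as np

def kalman_track(measurements, dt=1.0, q=1e-3, r=1e-1):
    """Constant-velocity Kalman filter over 2D needle-tip measurements."""
    F = np.array([[1, 0, dt, 0], [0, 1, 0, dt],
                  [0, 0, 1, 0], [0, 0, 0, 1]], float)   # state transition (x, y, vx, vy)
    H = np.array([[1, 0, 0, 0], [0, 1, 0, 0]], float)    # we observe position only
    Q, R = q * np.eye(4), r * np.eye(2)
    x, P = np.zeros(4), np.eye(4)
    track = []
    for z in measurements:
        x, P = F @ x, F @ P @ F.T + Q                    # predict
        S = H @ P @ H.T + R
        K = P @ H.T @ np.linalg.inv(S)                   # Kalman gain
        x = x + K @ (np.asarray(z) - H @ x)              # update with measurement
        P = (np.eye(4) - K @ H) @ P
        track.append(x[:2].copy())
    return np.array(track)

zs = [(0.0, 0.0), (1.1, 0.9), (2.0, 2.1), (3.2, 2.9)]    # stand-in noisy tip detections
print(kalman_track(zs))
```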
The code will be publicly available.\n","authors":["Hoon-Gyu Chung","Seokjun Choi","Seung-Hwan Baek"],"pdf_url":"https://arxiv.org/pdf/2312.02480v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.13838v3","updated":"2023-12-05T03:55:38Z","published":"2022-11-25T00:19:21Z","title":"Signed Binary Weight Networks","summary":" Efficient inference of Deep Neural Networks (DNNs) is essential to making AI\nubiquitous. Two important algorithmic techniques have shown promise for\nenabling efficient inference - sparsity and binarization. These techniques\ntranslate into weight sparsity and weight repetition at the hardware-software\nlevel enabling the deployment of DNNs with critically low power and latency\nrequirements. We propose a new method called signed-binary networks to improve\nefficiency further (by exploiting both weight sparsity and weight repetition\ntogether) while maintaining similar accuracy. Our method achieves comparable\naccuracy on ImageNet and CIFAR10 datasets with binary and can lead to 69%\nsparsity. We observe real speedup when deploying these models on\ngeneral-purpose devices and show that this high percentage of unstructured\nsparsity can lead to a further reduction in energy consumption on ASICs.\n","authors":["Sachit Kuhar","Alexey Tumanov","Judy Hoffman"],"pdf_url":"https://arxiv.org/pdf/2211.13838v3.pdf","comment":"it is being updated"},{"id":"http://arxiv.org/abs/2312.01232v2","updated":"2023-12-05T03:46:27Z","published":"2023-12-02T21:38:16Z","title":"A Comprehensive Study of Vision Transformers in Image Classification\n Tasks","summary":" Image Classification is a fundamental task in the field of computer vision\nthat frequently serves as a benchmark for gauging advancements in Computer\nVision. Over the past few years, significant progress has been made in image\nclassification due to the emergence of deep learning. However, challenges still\nexist, such as modeling fine-grained visual information, high computation\ncosts, the parallelism of the model, and inconsistent evaluation protocols\nacross datasets. In this paper, we conduct a comprehensive survey of existing\npapers on Vision Transformers for image classification. We first introduce the\npopular image classification datasets that influenced the design of models.\nThen, we present Vision Transformers models in chronological order, starting\nwith early attempts at adapting attention mechanism to vision tasks followed by\nthe adoption of vision transformers, as they have demonstrated success in\ncapturing intricate patterns and long-range dependencies within images.\nFinally, we discuss open problems and shed light on opportunities for image\nclassification to facilitate new research ideas.\n","authors":["Mahmoud Khalil","Ahmad Khalil","Alioune Ngom"],"pdf_url":"https://arxiv.org/pdf/2312.01232v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2012.06567,\n arXiv:1406.6247, arXiv:1906.05909 by other authors. arXiv admin note: text\n overlap with arXiv:2012.06567, arXiv:1406.6247, arXiv:1906.05909 by other\n authors"},{"id":"http://arxiv.org/abs/2312.02470v1","updated":"2023-12-05T03:41:17Z","published":"2023-12-05T03:41:17Z","title":"Generator Born from Classifier","summary":" In this paper, we make a bold attempt toward an ambitious task: given a\npre-trained classifier, we aim to reconstruct an image generator, without\nrelying on any data samples. 
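The signed-binary entry above constrains weights so that both repetition (values in {-1, +1}) and sparsity (zeros) can be exploited at inference time. A minimal thresholded sign quantizer with a sparsity readout is sketched below; the magnitude threshold is an illustrative assumption, and the training procedure that reaches the paper's reported accuracy and 69% sparsity is not reproduced here.

```python
import numpy as np

def signed_binarize(w, threshold=None):
    """Map weights to {-1, 0, +1}: zero out small magnitudes, sign the rest."""
    if threshold is None:
        threshold = 0.05 * np.abs(w).max()   # illustrative choice, not the paper's rule
    q = np.sign(w)
    q[np.abs(w) < threshold] = 0
    return q

w = np.random.default_rng(3).normal(scale=0.1, size=(128, 64))   # stand-in layer weights
q = signed_binarize(w)
print("sparsity:", float((q == 0).mean()))
```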
From a black-box perspective, this challenge seems\nintractable, since it inevitably involves identifying the inverse function for\na classifier, which is, by nature, an information extraction process. As such,\nwe resort to leveraging the knowledge encapsulated within the parameters of the\nneural network. Grounded on the theory of Maximum-Margin Bias of gradient\ndescent, we propose a novel learning paradigm, in which the generator is\ntrained to ensure that the convergence conditions of the network parameters are\nsatisfied over the generated distribution of the samples. Empirical validation\nfrom various image generation tasks substantiates the efficacy of our strategy.\n","authors":["Runpeng Yu","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2312.02470v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02469v1","updated":"2023-12-05T03:39:54Z","published":"2023-12-05T03:39:54Z","title":"Learning Energy-based Model via Dual-MCMC Teaching","summary":" This paper studies the fundamental learning problem of the energy-based model\n(EBM). Learning the EBM can be achieved using the maximum likelihood estimation\n(MLE), which typically involves the Markov Chain Monte Carlo (MCMC) sampling,\nsuch as the Langevin dynamics. However, the noise-initialized Langevin dynamics\ncan be challenging in practice and hard to mix. This motivates the exploration\nof joint training with the generator model where the generator model serves as\na complementary model to bypass MCMC sampling. However, such a method can be\nless accurate than the MCMC and result in biased EBM learning. While the\ngenerator can also serve as an initializer model for better MCMC sampling, its\nlearning can be biased since it only matches the EBM and has no access to\nempirical training examples. Such biased generator learning may limit the\npotential of learning the EBM. To address this issue, we present a joint\nlearning framework that interweaves the maximum likelihood learning algorithm\nfor both the EBM and the complementary generator model. In particular, the\ngenerator model is learned by MLE to match both the EBM and the empirical data\ndistribution, making it a more informative initializer for MCMC sampling of\nEBM. Learning generator with observed examples typically requires inference of\nthe generator posterior. To ensure accurate and efficient inference, we adopt\nthe MCMC posterior sampling and introduce a complementary inference model to\ninitialize such latent MCMC sampling. We show that three separate models can be\nseamlessly integrated into our joint framework through two (dual-) MCMC\nteaching, enabling effective and efficient EBM learning.\n","authors":["Jiali Cui","Tian Han"],"pdf_url":"https://arxiv.org/pdf/2312.02469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.11487v3","updated":"2023-12-05T03:37:33Z","published":"2023-08-22T15:06:14Z","title":"Free Lunch for Gait Recognition: A Novel Relation Descriptor","summary":" Gait recognition is to seek correct matches for query individuals by their\nunique walking patterns. However, current methods focus solely on extracting\nindividual-specific features, overlooking ``interpersonal\" relationships. In\nthis paper, we propose a novel $\\textbf{Relation Descriptor}$ that captures not\nonly individual features but also relations between test gaits and pre-selected\ngait anchors. 
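The dual-MCMC entry above trains an EBM with Langevin-dynamics sampling that is initialized by a learned generator rather than noise. The sketch below shows only the core sampler on a toy quadratic energy whose gradient is analytic; in the paper the energy is a neural network and the initializer is the generator, so treat everything here as a stand-in.

```python
import numpy as np

def langevin_sample(grad_energy, x0, step=0.01, n_steps=1000, rng=None):
    """Unadjusted Langevin dynamics: x <- x - step/2 * dE/dx + sqrt(step) * noise."""
    rng = rng or np.random.default_rng(0)
    x = np.array(x0, float)
    for _ in range(n_steps):
        x = x - 0.5 * step * grad_energy(x) + np.sqrt(step) * rng.normal(size=x.shape)
    return x

# Toy energy E(x) = ||x - mu||^2 / 2, so the target density is a Gaussian around mu.
mu = np.array([2.0, -1.0])
grad_energy = lambda x: x - mu
samples = np.array([langevin_sample(grad_energy, np.zeros(2), rng=np.random.default_rng(i))
                    for i in range(200)])
print(samples.mean(axis=0))   # approximately mu after mixing
```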
Specifically, we reinterpret classifier weights as gait anchors\nand compute similarity scores between test features and these anchors, which\nre-expresses individual gait features into a similarity relation distribution.\nIn essence, the relation descriptor offers a holistic perspective that\nleverages the collective knowledge stored within the classifier's weights,\nemphasizing meaningful patterns and enhancing robustness. Despite its\npotential, relation descriptor poses dimensionality challenges since its\ndimension depends on the training set's identity count. To address this, we\npropose Farthest gait-Anchor Selection to identify the most discriminative gait\nanchors and an Orthogonal Regularization Loss to increase diversity within gait\nanchors. Compared to individual-specific features extracted from the backbone,\nour relation descriptor can boost the performance nearly without any extra\ncosts. We evaluate the effectiveness of our method on the popular GREW, Gait3D,\nOU-MVLP, CASIA-B, and CCPG, showing that our method consistently outperforms\nthe baselines and achieves state-of-the-art performance.\n","authors":["Jilong Wang","Saihui Hou","Yan Huang","Chunshui Cao","Xu Liu","Yongzhen Huang","Tianzhu Zhang","Liang Wang"],"pdf_url":"https://arxiv.org/pdf/2308.11487v3.pdf","comment":"Add new figures and fix some typos"},{"id":"http://arxiv.org/abs/2312.02464v1","updated":"2023-12-05T03:33:47Z","published":"2023-12-05T03:33:47Z","title":"SAM-Assisted Remote Sensing Imagery Semantic Segmentation with Object\n and Boundary Constraints","summary":" Semantic segmentation of remote sensing imagery plays a pivotal role in\nextracting precise information for diverse down-stream applications. Recent\ndevelopment of the Segment Anything Model (SAM), an advanced general-purpose\nsegmentation model, has revolutionized this field, presenting new avenues for\naccurate and efficient segmentation. However, SAM is limited to generating\nsegmentation results without class information. Consequently, the utilization\nof such a powerful general vision model for semantic segmentation in remote\nsensing images has become a focal point of research. In this paper, we present\na streamlined framework aimed at leveraging the raw output of SAM by exploiting\ntwo novel concepts called SAM-Generated Object (SGO) and SAM-Generated Boundary\n(SGB). More specifically, we propose a novel object loss and further introduce\na boundary loss as augmentative components to aid in model optimization in a\ngeneral semantic segmentation framework. Taking into account the content\ncharacteristics of SGO, we introduce the concept of object consistency to\nleverage segmented regions lacking semantic information. By imposing\nconstraints on the consistency of predicted values within objects, the object\nloss aims to enhance semantic segmentation performance. Furthermore, the\nboundary loss capitalizes on the distinctive features of SGB by directing the\nmodel's attention to the boundary information of the object. Experimental\nresults on two well-known datasets, namely ISPRS Vaihingen and LoveDA Urban,\ndemonstrate the effectiveness of our proposed method. 
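The relation-descriptor entry above re-expresses a test gait feature as its similarity distribution over anchors reinterpreted from classifier weights. Below is a minimal version using cosine similarity and a softmax; the anchor selection and orthogonal regularization described in the paper are omitted, and the temperature is an assumed free parameter.

```python
import numpy as np

def relation_descriptor(feature, classifier_weights, temperature=1.0):
    """Describe a test feature by its similarity to classifier-weight anchors.

    classifier_weights: (num_identities, feat_dim), one anchor per training identity.
    Returns a probability-like relation distribution over anchors.
    """
    f = feature / np.linalg.norm(feature)
    w = classifier_weights / np.linalg.norm(classifier_weights, axis=1, keepdims=True)
    sims = w @ f / temperature           # cosine similarity to every anchor
    e = np.exp(sims - sims.max())        # numerically stable softmax
    return e / e.sum()

rng = np.random.default_rng(4)
anchors = rng.normal(size=(100, 256))    # stand-in: 100 training identities
probe = rng.normal(size=256)             # stand-in: backbone feature of a test gait
desc = relation_descriptor(probe, anchors)
print(desc.shape, float(desc.sum()))
```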
The source code for this\nwork will be accessible at https://github.com/sstary/SSRS.\n","authors":["Xianping Ma","Qianqian Wu","Xingyu Zhao","Xiaokang Zhang","Man-On Pun","Bo Huang"],"pdf_url":"https://arxiv.org/pdf/2312.02464v1.pdf","comment":"10 pages, 4 figures"},{"id":"http://arxiv.org/abs/2307.00347v2","updated":"2023-12-05T03:28:55Z","published":"2023-07-01T13:53:14Z","title":"Spatial-Temporal Enhanced Transformer Towards Multi-Frame 3D Object\n Detection","summary":" The Detection Transformer (DETR) has revolutionized the design of CNN-based\nobject detection systems, showcasing impressive performance. However, its\npotential in the domain of multi-frame 3D object detection remains largely\nunexplored. In this paper, we present STEMD, a novel end-to-end framework for\nmulti-frame 3D object detection based on the DETR-like paradigm. STEMD treats\nmulti-frame 3D object detection as a sequence-to-sequence task and effectively\ncaptures spatial-temporal dependencies at both the feature and query levels.\nSpecifically, to model the inter-object spatial interaction and complex\ntemporal dependencies, we introduce the spatial-temporal graph attention\nnetwork, which represents queries as nodes in a graph and enables effective\nmodeling of object interactions within a social context. To solve the problem\nof missing hard cases in the proposed output of the encoder in the current\nframe, we incorporate the output of the previous frame to initialize the query\ninput of the decoder. Moreover, to mitigate the issue of redundant detection\nresults, where the model generates numerous overlapping boxes from similar\nqueries, we consider an IoU regularization term in the loss function, which can\ndistinguish between queries matched with the ground-truth box and queries that\nare similar but unmatched during the refinement process, leading to reduced\nredundancy and more accurate detections. Through extensive experiments, we\ndemonstrate the effectiveness of our approach in handling challenging\nscenarios, while incurring only a minor additional computational overhead. The\ncode is available at \\url{https://github.com/Eaphan/STEMD}.\n","authors":["Yifan Zhang","Zhiyu Zhu","Junhui Hou","Dapeng Wu"],"pdf_url":"https://arxiv.org/pdf/2307.00347v2.pdf","comment":"15 pages, 9 figures"},{"id":"http://arxiv.org/abs/2309.17059v2","updated":"2023-12-05T03:22:24Z","published":"2023-09-29T08:43:16Z","title":"GSDC Transformer: An Efficient and Effective Cue Fusion for Monocular\n Multi-Frame Depth Estimation","summary":" Depth estimation provides an alternative approach for perceiving 3D\ninformation in autonomous driving. Monocular depth estimation, whether with\nsingle-frame or multi-frame inputs, has achieved significant success by\nlearning various types of cues and specializing in either static or dynamic\nscenes. Recently, these cues fusion becomes an attractive topic, aiming to\nenable the combined cues to perform well in both types of scenes. However,\nadaptive cue fusion relies on attention mechanisms, where the quadratic\ncomplexity limits the granularity of cue representation. Additionally, explicit\ncue fusion depends on precise segmentation, which imposes a heavy burden on\nmask prediction. To address these issues, we propose the GSDC Transformer, an\nefficient and effective component for cue fusion in monocular multi-frame depth\nestimation. 
We utilize deformable attention to learn cue relationships at a\nfine scale, while sparse attention reduces computational requirements when\ngranularity increases. To compensate for the precision drop in dynamic scenes,\nwe represent scene attributes in the form of super tokens without relying on\nprecise shapes. Within each super token attributed to dynamic scenes, we gather\nits relevant cues and learn local dense relationships to enhance cue fusion.\nOur method achieves state-of-the-art performance on the KITTI dataset with\nefficient fusion speed.\n","authors":["Naiyu Fang","Lemiao Qiu","Shuyou Zhang","Zili Wang","Zheyuan Zhou","Kerui Hu"],"pdf_url":"https://arxiv.org/pdf/2309.17059v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.08956v2","updated":"2023-12-05T03:04:12Z","published":"2023-04-18T12:47:26Z","title":"PG-VTON: A Novel Image-Based Virtual Try-On Method via Progressive\n Inference Paradigm","summary":" Virtual try-on is a promising computer vision topic with a high commercial\nvalue wherein a new garment is visually worn on a person with a photo-realistic\neffect. Previous studies conduct their shape and content inference at one\nstage, employing a single-scale warping mechanism and a relatively\nunsophisticated content inference mechanism. These approaches have led to\nsuboptimal results in terms of garment warping and skin reservation under\nchallenging try-on scenarios. To address these limitations, we propose a novel\nvirtual try-on method via progressive inference paradigm (PGVTON) that\nleverages a top-down inference pipeline and a general garment try-on strategy.\nSpecifically, we propose a robust try-on parsing inference method by\ndisentangling semantic categories and introducing consistency. Exploiting the\ntry-on parsing as the shape guidance, we implement the garment try-on via\nwarping-mapping-composition. To facilitate adaptation to a wide range of try-on\nscenarios, we adopt a covering more and selecting one warping strategy and\nexplicitly distinguish tasks based on alignment. Additionally, we regulate\nStyleGAN2 to implement re-naked skin inpainting, conditioned on the target skin\nshape and spatial-agnostic skin features. Experiments demonstrate that our\nmethod has state-of-the-art performance under two challenging scenarios. The\ncode will be available at https://github.com/NerdFNY/PGVTON.\n","authors":["Naiyu Fang","Lemiao Qiu","Shuyou Zhang","Zili Wang","Kerui Hu"],"pdf_url":"https://arxiv.org/pdf/2304.08956v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.02367v4","updated":"2023-12-05T02:58:06Z","published":"2023-02-05T12:13:27Z","title":"FastPillars: A Deployment-friendly Pillar-based 3D Detector","summary":" The deployment of 3D detectors strikes one of the major challenges in\nreal-world self-driving scenarios. Existing BEV-based (i.e., Bird Eye View)\ndetectors favor sparse convolutions (known as SPConv) to speed up training and\ninference, which puts a hard barrier for deployment, especially for on-device\napplications. In this paper, to tackle the challenge of efficient 3D object\ndetection from an industry perspective, we devise a deployment-friendly\npillar-based 3D detector, termed FastPillars. First, we introduce a novel\nlightweight Max-and-Attention Pillar Encoding (MAPE) module specially for\nenhancing small 3D objects. Second, we propose a simple yet effective principle\nfor designing a backbone in pillar-based 3D detection. 
We construct FastPillars\nbased on these designs, achieving high performance and low latency without\nSPConv. Extensive experiments on two large-scale datasets demonstrate the\neffectiveness and efficiency of FastPillars for on-device 3D detection\nregarding both performance and speed. Specifically, FastPillars delivers\nstate-of-the-art accuracy on Waymo Open Dataset with 1.8X speed up and 3.8\nmAPH/L2 improvement over CenterPoint (SPConv-based). Our code is publicly\navailable at: https://github.com/StiphyJay/FastPillars.\n","authors":["Sifan Zhou","Zhi Tian","Xiangxiang Chu","Xinyu Zhang","Bo Zhang","Xiaobo Lu","Chengjian Feng","Zequn Jie","Patrick Yin Chiang","Lin Ma"],"pdf_url":"https://arxiv.org/pdf/2302.02367v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.08843v3","updated":"2023-12-05T02:46:37Z","published":"2023-11-15T10:33:20Z","title":"Personalized Video Relighting With an At-Home Light Stage","summary":" In this paper, we develop a personalized video relighting algorithm that\nproduces high-quality and temporally consistent relit videos under any pose,\nexpression, and lighting condition in real-time. Existing relighting algorithms\ntypically rely either on publicly available synthetic data, which yields poor\nrelighting results, or instead on light stage data which is difficult to\nobtain. We show that by just capturing video of a user watching YouTube videos\non a monitor we can train a personalized algorithm capable of performing\nhigh-quality relighting under any condition. Our key contribution is a novel\nneural relighting architecture that effectively separates the intrinsic\nappearance features - the geometry and reflectance of the face - from the\nsource lighting and then combines them with the target lighting to generate a\nrelit image. This neural network architecture enables smoothing of intrinsic\nappearance features leading to temporally stable video relighting. Both\nqualitative and quantitative evaluations show that our architecture improves\nportrait image relighting quality and temporal consistency over\nstate-of-the-art approaches on both casually captured `Light Stage at Your\nDesk' (LSYD) and light-stage-captured `One Light At a Time' (OLAT) datasets.\n","authors":["Jun Myeong Choi","Max Christman","Roni Sengupta"],"pdf_url":"https://arxiv.org/pdf/2311.08843v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.02848v2","updated":"2023-12-05T02:42:28Z","published":"2023-07-06T08:27:48Z","title":"Revisiting Computer-Aided Tuberculosis Diagnosis","summary":" Tuberculosis (TB) is a major global health threat, causing millions of deaths\nannually. Although early diagnosis and treatment can greatly improve the\nchances of survival, it remains a major challenge, especially in developing\ncountries. Recently, computer-aided tuberculosis diagnosis (CTD) using deep\nlearning has shown promise, but progress is hindered by limited training data.\nTo address this, we establish a large-scale dataset, namely the Tuberculosis\nX-ray (TBX11K) dataset, which contains 11,200 chest X-ray (CXR) images with\ncorresponding bounding box annotations for TB areas. This dataset enables the\ntraining of sophisticated detectors for high-quality CTD. Furthermore, we\npropose a strong baseline, SymFormer, for simultaneous CXR image classification\nand TB infection area detection. SymFormer incorporates Symmetric Search\nAttention (SymAttention) to tackle the bilateral symmetry property of CXR\nimages for learning discriminative features. 
Since CXR images may not strictly\nadhere to the bilateral symmetry property, we also propose Symmetric Positional\nEncoding (SPE) to facilitate SymAttention through feature recalibration. To\npromote future research on CTD, we build a benchmark by introducing evaluation\nmetrics, evaluating baseline models reformed from existing detectors, and\nrunning an online challenge. Experiments show that SymFormer achieves\nstate-of-the-art performance on the TBX11K dataset. The data, code, and models\nwill be released at https://github.com/yun-liu/Tuberculosis.\n","authors":["Yun Liu","Yu-Huan Wu","Shi-Chen Zhang","Li Liu","Min Wu","Ming-Ming Cheng"],"pdf_url":"https://arxiv.org/pdf/2307.02848v2.pdf","comment":"Accepted by IEEE TPAMI; 17 pages"},{"id":"http://arxiv.org/abs/2312.02439v1","updated":"2023-12-05T02:41:57Z","published":"2023-12-05T02:41:57Z","title":"Let's Think Outside the Box: Exploring Leap-of-Thought in Large Language\n Models with Creative Humor Generation","summary":" Chain-of-Thought (CoT) guides large language models (LLMs) to reason\nstep-by-step, and can motivate their logical reasoning ability. While effective\nfor logical tasks, CoT is not conducive to creative problem-solving which often\nrequires out-of-box thoughts and is crucial for innovation advancements. In\nthis paper, we explore the Leap-of-Thought (LoT) abilities within LLMs -- a\nnon-sequential, creative paradigm involving strong associations and knowledge\nleaps. To this end, we study LLMs on the popular Oogiri game which needs\nparticipants to have good creativity and strong associative thinking for\nresponding unexpectedly and humorously to the given image, text, or both, and\nthus is suitable for LoT study. Then to investigate LLMs' LoT ability in the\nOogiri game, we first build a multimodal and multilingual Oogiri-GO dataset\nwhich contains over 130,000 samples from the Oogiri game, and observe the\ninsufficient LoT ability or failures of most existing LLMs on the Oogiri game.\nAccordingly, we introduce a creative Leap-of-Thought (CLoT) paradigm to improve\nLLM's LoT ability. CLoT first formulates the Oogiri-GO dataset into\nLoT-oriented instruction tuning data to train pretrained LLM for achieving\ncertain LoT humor generation and discrimination abilities. Then CLoT designs an\nexplorative self-refinement that encourages the LLM to generate more creative\nLoT data via exploring parallels between seemingly unrelated concepts and\nselects high-quality data to train itself for self-refinement. CLoT not only\nexcels in humor generation in the Oogiri game but also boosts creative\nabilities in various tasks like cloud guessing game and divergent association\ntask. These findings advance our understanding and offer a pathway to improve\nLLMs' creative capacities for innovative applications across domains. The\ndataset, code, and models will be released online.\nhttps://github.com/sail-sg/CLoT.\n","authors":["Shanshan Zhong","Zhongzhan Huang","Shanghua Gao","Wushao Wen","Liang Lin","Marinka Zitnik","Pan Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.02439v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2312.02437v1","updated":"2023-12-05T02:33:55Z","published":"2023-12-05T02:33:55Z","title":"GDN: A Stacking Network Used for Skin Cancer Diagnosis","summary":" Skin cancer, the primary type of cancer that can be identified by visual\nrecognition, requires an automatic identification system that can accurately\nclassify different types of lesions. 
This paper presents GoogLe-Dense Network\n(GDN), which is an image-classification model to identify two types of skin\ncancer, Basal Cell Carcinoma, and Melanoma. GDN uses stacking of different\nnetworks to enhance the model performance. Specifically, GDN consists of two\nsequential levels in its structure. The first level performs basic\nclassification tasks accomplished by GoogLeNet and DenseNet, which are trained\nin parallel to enhance efficiency. To avoid low accuracy and long training\ntime, the second level takes the output of the GoogLeNet and DenseNet as the\ninput for a logistic regression model. We compare our method with four baseline\nnetworks including ResNet, VGGNet, DenseNet, and GoogLeNet on the dataset, in\nwhich GoogLeNet and DenseNet significantly outperform ResNet and VGGNet. In the\nsecond level, different stacking methods such as perceptron, logistic\nregression, SVM, decision trees and K-neighbor are studied in which Logistic\nRegression shows the best prediction result among all. The results prove that\nGDN, compared to a single network structure, has higher accuracy in optimizing\nskin cancer detection.\n","authors":["Jingmin Wei","Haoyang Shen","Ziyi Wang","Ziqian Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.02437v1.pdf","comment":"Published at ICSPS 2021"},{"id":"http://arxiv.org/abs/2312.01656v2","updated":"2023-12-05T02:24:38Z","published":"2023-12-04T06:14:25Z","title":"The Contemporary Art of Image Search: Iterative User Intent Expansion\n via Vision-Language Model","summary":" Image search is an essential and user-friendly method to explore vast\ngalleries of digital images. However, existing image search methods heavily\nrely on proximity measurements like tag matching or image similarity, requiring\nprecise user inputs for satisfactory results. To meet the growing demand for a\ncontemporary image search engine that enables accurate comprehension of users'\nsearch intentions, we introduce an innovative user intent expansion framework.\nOur framework leverages visual-language models to parse and compose multi-modal\nuser inputs to provide more accurate and satisfying results. It comprises\ntwo-stage processes: 1) a parsing stage that incorporates a language parsing\nmodule with large language models to enhance the comprehension of textual\ninputs, along with a visual parsing module that integrates an interactive\nsegmentation module to swiftly identify detailed visual elements within images;\nand 2) a logic composition stage that combines multiple user search intents\ninto a unified logic expression for more sophisticated operations in complex\nsearching scenarios. Moreover, the intent expansion framework enables users to\nperform flexible contextualized interactions with the search results to further\nspecify or adjust their detailed search intents iteratively. We implemented the\nframework into an image search system for NFT (non-fungible token) search and\nconducted a user study to evaluate its usability and novel properties. The\nresults indicate that the proposed framework significantly improves users'\nimage search experience. 
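The GDN entry above stacks GoogLeNet and DenseNet by feeding their outputs into a logistic-regression meta-classifier. The sketch below shows that stacking step with synthetic stand-ins for the two base networks' melanoma probabilities; in practice these would be held-out predictions from the trained CNNs, and the split sizes here are arbitrary.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(5)
n = 200
y = rng.integers(0, 2, size=n)                       # stand-in labels: 0 = BCC, 1 = melanoma

# Stand-ins for the level-one networks' predicted melanoma probabilities.
p_googlenet = np.clip(y + rng.normal(0, 0.35, n), 0, 1)
p_densenet = np.clip(y + rng.normal(0, 0.30, n), 0, 1)

# Level two: logistic regression on the stacked base predictions.
X = np.column_stack([p_googlenet, p_densenet])
meta = LogisticRegression().fit(X[:150], y[:150])
print("held-out accuracy:", meta.score(X[150:], y[150:]))
```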
Particularly the parsing and contextualized\ninteractions prove useful in allowing users to express their search intents\nmore accurately and engage in a more enjoyable iterative search experience.\n","authors":["Yilin Ye","Qian Zhu","Shishi Xiao","Kang Zhang","Wei Zeng"],"pdf_url":"https://arxiv.org/pdf/2312.01656v2.pdf","comment":"Accepted by The 2024 ACM SIGCHI Conference on Computer-Supported\n Cooperative Work & Social Computing (CSCW) (Proc. CSCW 2024)"},{"id":"http://arxiv.org/abs/2312.02434v1","updated":"2023-12-05T02:23:41Z","published":"2023-12-05T02:23:41Z","title":"FINER: Flexible spectral-bias tuning in Implicit NEural Representation\n by Variable-periodic Activation Functions","summary":" Implicit Neural Representation (INR), which utilizes a neural network to map\ncoordinate inputs to corresponding attributes, is causing a revolution in the\nfield of signal processing. However, current INR techniques suffer from a\nrestricted capability to tune their supported frequency set, resulting in\nimperfect performance when representing complex signals with multiple\nfrequencies. We have identified that this frequency-related problem can be\ngreatly alleviated by introducing variable-periodic activation functions, for\nwhich we propose FINER. By initializing the bias of the neural network within\ndifferent ranges, sub-functions with various frequencies in the\nvariable-periodic function are selected for activation. Consequently, the\nsupported frequency set of FINER can be flexibly tuned, leading to improved\nperformance in signal representation. We demonstrate the capabilities of FINER\nin the contexts of 2D image fitting, 3D signed distance field representation,\nand 5D neural radiance fields optimization, and we show that it outperforms\nexisting INRs.\n","authors":["Zhen Liu","Hao Zhu","Qi Zhang","Jingde Fu","Weibing Deng","Zhan Ma","Yanwen Guo","Xun Cao"],"pdf_url":"https://arxiv.org/pdf/2312.02434v1.pdf","comment":"10 pages, 9 figures"},{"id":"http://arxiv.org/abs/2312.02433v1","updated":"2023-12-05T02:19:35Z","published":"2023-12-05T02:19:35Z","title":"Lenna: Language Enhanced Reasoning Detection Assistant","summary":" With the fast-paced development of multimodal large language models (MLLMs),\nwe can now converse with AI systems in natural languages to understand images.\nHowever, the reasoning power and world knowledge embedded in the large language\nmodels have been much less investigated and exploited for image perception\ntasks. In this paper, we propose Lenna, a language-enhanced reasoning detection\nassistant, which utilizes the robust multimodal feature representation of\nMLLMs, while preserving location information for detection. This is achieved by\nincorporating an additional token in the MLLM vocabulary that is free of\nexplicit semantic context but serves as a prompt for the detector to identify\nthe corresponding position. To evaluate the reasoning capability of Lenna, we\nconstruct a ReasonDet dataset to measure its performance on reasoning-based\ndetection. Remarkably, Lenna demonstrates outstanding performance on ReasonDet\nand comes with significantly low training costs. It also incurs minimal\ntransferring overhead when extended to other tasks. 
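The FINER entry above replaces the fixed sinusoid of SIREN-style INRs with a variable-periodic activation whose local frequency grows with the pre-activation magnitude, and tunes the supported frequency set through bias initialization. The sketch below assumes the commonly cited form sin(omega * (|x| + 1) * x) and an arbitrary bias range; consult the paper for the exact definition and recommended initialization.

```python
import numpy as np

def finer_activation(x, omega=30.0):
    """Variable-periodic activation: local frequency grows with |pre-activation|.

    Assumed form sin(omega * (|x| + 1) * x); see the FINER paper for the
    exact definition and initialization ranges.
    """
    return np.sin(omega * (np.abs(x) + 1.0) * x)

def finer_layer(x, W, b, omega=30.0):
    """One INR layer: affine map followed by the variable-periodic activation."""
    return finer_activation(x @ W + b, omega)

rng = np.random.default_rng(6)
coords = rng.uniform(-1, 1, size=(16, 2))          # 2D coordinate inputs
W = rng.uniform(-1, 1, size=(2, 64)) / 2.0
b = rng.uniform(-5, 5, size=64)                    # wider bias range -> richer frequency set
print(finer_layer(coords, W, b).shape)
```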
Our code and model will be\navailable at https://git.io/Lenna.\n","authors":["Fei Wei","Xinyu Zhang","Ailing Zhang","Bo Zhang","Xiangxiang Chu"],"pdf_url":"https://arxiv.org/pdf/2312.02433v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02432v1","updated":"2023-12-05T02:17:48Z","published":"2023-12-05T02:17:48Z","title":"Orthogonal Adaptation for Modular Customization of Diffusion Models","summary":" Customization techniques for text-to-image models have paved the way for a\nwide range of previously unattainable applications, enabling the generation of\nspecific concepts across diverse contexts and styles. While existing methods\nfacilitate high-fidelity customization for individual concepts or a limited,\npre-defined set of them, they fall short of achieving scalability, where a\nsingle model can seamlessly render countless concepts. In this paper, we\naddress a new problem called Modular Customization, with the goal of\nefficiently merging customized models that were fine-tuned independently for\nindividual concepts. This allows the merged model to jointly synthesize\nconcepts in one image without compromising fidelity or incurring any additional\ncomputational costs.\n To address this problem, we introduce Orthogonal Adaptation, a method\ndesigned to encourage the customized models, which do not have access to each\nother during fine-tuning, to have orthogonal residual weights. This ensures\nthat during inference time, the customized models can be summed with minimal\ninterference.\n Our proposed method is both simple and versatile, applicable to nearly all\noptimizable weights in the model architecture. Through an extensive set of\nquantitative and qualitative evaluations, our method consistently outperforms\nrelevant baselines in terms of efficiency and identity preservation,\ndemonstrating a significant leap toward scalable customization of diffusion\nmodels.\n","authors":["Ryan Po","Guandao Yang","Kfir Aberman","Gordon Wetzstein"],"pdf_url":"https://arxiv.org/pdf/2312.02432v1.pdf","comment":"Project page: https://ryanpo.com/ortha/"},{"id":"http://arxiv.org/abs/2312.02428v1","updated":"2023-12-05T02:07:31Z","published":"2023-12-05T02:07:31Z","title":"FreestyleRet: Retrieving Images from Style-Diversified Queries","summary":" Image Retrieval aims to retrieve corresponding images based on a given query.\nIn application scenarios, users intend to express their retrieval intent\nthrough various query styles. However, current retrieval tasks predominantly\nfocus on text-query retrieval exploration, leading to limited retrieval query\noptions and potential ambiguity or bias in user intention. In this paper, we\npropose the Style-Diversified Query-Based Image Retrieval task, which enables\nretrieval based on various query styles. To facilitate the novel setting, we\npropose the first Diverse-Style Retrieval dataset, encompassing diverse query\nstyles including text, sketch, low-resolution, and art. We also propose a\nlight-weighted style-diversified retrieval framework. For various query style\ninputs, we apply the Gram Matrix to extract the query's textural features and\ncluster them into a style space with style-specific bases. Then we employ the\nstyle-init prompt tuning module to enable the visual encoder to comprehend the\ntexture and style information of the query. Experiments demonstrate that our\nmodel, employing the style-init prompt tuning strategy, outperforms existing\nretrieval models on the style-diversified retrieval task. 
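The FreestyleRet entry above extracts the textural/style signature of a query with a Gram matrix before clustering queries into a style space. Below is the classical Gram-matrix computation over a feature map; the clustering and style-init prompt tuning from the paper are not shown, and the feature map here is a random stand-in for encoder activations.

```python
import numpy as np

def gram_matrix(feat, normalize=True):
    """Channel-by-channel Gram matrix of a feature map of shape (C, H, W)."""
    c, h, w = feat.shape
    f = feat.reshape(c, h * w)
    g = f @ f.T
    return g / (c * h * w) if normalize else g

feat = np.random.default_rng(7).normal(size=(8, 16, 16))   # stand-in encoder features
g = gram_matrix(feat)
print(g.shape)   # (8, 8): texture statistics, independent of spatial layout
```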
Moreover,\nstyle-diversified queries~(sketch+text, art+text, etc) can be simultaneously\nretrieved in our model. The auxiliary information from other queries enhances\nthe retrieval performance within the respective query.\n","authors":["Hao Li","Curise Jia","Peng Jin","Zesen Cheng","Kehan Li","Jialu Sui","Chang Liu","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2312.02428v1.pdf","comment":"16 pages, 7 figures"},{"id":"http://arxiv.org/abs/2312.00072v2","updated":"2023-12-05T01:58:50Z","published":"2023-11-29T22:07:25Z","title":"CRAFT: Contextual Re-Activation of Filters for face recognition Training","summary":" The first layer of a deep CNN backbone applies filters to an image to extract\nthe basic features available to later layers. During training, some filters may\ngo inactive, meaning all weights in the filter approach zero. An inactive\nfilter in the final model represents a missed opportunity to extract a useful\nfeature. This phenomenon is especially prevalent in specialized CNNs such as\nfor face recognition (as opposed to, e.g., ImageNet). For example, in one of the\nmost widely used face recognition models (ArcFace), about half of the convolution\nfilters in the first layer are inactive. We propose a novel approach designed\nand tested specifically for face recognition networks, known as \"CRAFT:\nContextual Re-Activation of Filters for Face Recognition Training\". CRAFT\nidentifies inactive filters during training and reinitializes them based on the\ncontext of strong filters at that stage in training. We show that CRAFT reduces\nthe fraction of inactive filters from 44% to 32% on average and discovers filter\npatterns not found by standard training. Compared to standard training without\nreactivation, CRAFT demonstrates enhanced model accuracy on standard\nface-recognition benchmark datasets including AgeDB-30, CPLFW, LFW, CALFW, and\nCFP-FP, as well as on more challenging datasets like IJBB and IJBC.\n","authors":["Aman Bhatta","Domingo Mery","Haiyu Wu","Kevin W. Bowyer"],"pdf_url":"https://arxiv.org/pdf/2312.00072v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02420v1","updated":"2023-12-05T01:37:18Z","published":"2023-12-05T01:37:18Z","title":"Towards Granularity-adjusted Pixel-level Semantic Annotation","summary":" Recent advancements in computer vision predominantly rely on learning-based\nsystems, leveraging annotations as the driving force to develop specialized\nmodels. However, annotating pixel-level information, particularly in semantic\nsegmentation, presents a challenging and labor-intensive task, prompting the\nneed for autonomous processes. In this work, we propose GranSAM which\ndistinguishes itself by providing semantic segmentation at the user-defined\ngranularity level on unlabeled data without the need for any manual\nsupervision, offering a unique contribution in the realm of semantic mask\nannotation method. Specifically, we propose an approach to enable the Segment\nAnything Model (SAM) with semantic recognition capability to generate\npixel-level annotations for images without any manual supervision. For this, we\naccumulate semantic information from synthetic images generated by the Stable\nDiffusion model or web crawled images and employ this data to learn a mapping\nfunction between SAM mask embeddings and object class labels. As a result, SAM,\nenabled with granularity-adjusted mask recognition, can be used for pixel-level\nsemantic annotation purposes. 
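The CRAFT entry above detects first-layer filters that have gone inactive (weights near zero) and reinitializes them using strong filters as context. The sketch below covers only the detection step plus one naive reinitialization, a perturbed copy of a randomly chosen strong filter; the paper's actual context-based reinitialization is more sophisticated, and the tolerance and noise scale here are assumptions.

```python
import numpy as np

def reactivate_filters(weights, tol=1e-3, noise=0.01, rng=None):
    """Detect near-zero conv filters and reinitialize them from strong ones.

    weights: (out_channels, in_channels, kH, kW). The reinitialization used here
    (a perturbed copy of a strong filter) is only an illustrative stand-in.
    """
    rng = rng or np.random.default_rng(0)
    norms = np.linalg.norm(weights.reshape(weights.shape[0], -1), axis=1)
    inactive = np.where(norms < tol)[0]
    strong = np.where(norms >= np.median(norms))[0]
    for i in inactive:
        src = rng.choice(strong)
        weights[i] = weights[src] + noise * rng.normal(size=weights[src].shape)
    return weights, inactive

w = np.random.default_rng(8).normal(scale=0.1, size=(64, 3, 3, 3))
w[:29] = 0.0                      # simulate ~45% inactive first-layer filters
_, revived = reactivate_filters(w)
print("reactivated:", len(revived))
```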
We conducted experiments on the PASCAL VOC 2012\nand COCO-80 datasets and observed a +17.95% and +5.17% increase in mIoU,\nrespectively, compared to existing state-of-the-art methods when evaluated\nunder our problem setting.\n","authors":["Rohit Kundu","Sudipta Paul","Rohit Lal","Amit K. Roy-Chowdhury"],"pdf_url":"https://arxiv.org/pdf/2312.02420v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02409v1","updated":"2023-12-05T00:48:31Z","published":"2023-12-05T00:48:31Z","title":"MGTR: Multi-Granular Transformer for Motion Prediction with LiDAR","summary":" Motion prediction has been an essential component of autonomous driving\nsystems since it handles highly uncertain and complex scenarios involving\nmoving agents of different types. In this paper, we propose a Multi-Granular\nTRansformer (MGTR) framework, an encoder-decoder network that exploits context\nfeatures in different granularities for different kinds of traffic agents. To\nfurther enhance MGTR's capabilities, we leverage LiDAR point cloud data by\nincorporating LiDAR semantic features from an off-the-shelf LiDAR feature\nextractor. We evaluate MGTR on Waymo Open Dataset motion prediction benchmark\nand show that the proposed method achieved state-of-the-art performance,\nranking 1st on its leaderboard\n(https://waymo.com/open/challenges/2023/motion-prediction/).\n","authors":["Yiqian Gan","Hao Xiao","Yizhe Zhao","Ethan Zhang","Zhe Huang","Xin Ye","Lingting Ge"],"pdf_url":"https://arxiv.org/pdf/2312.02409v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10191v3","updated":"2023-12-05T00:24:56Z","published":"2023-06-16T21:53:16Z","title":"Neural Priming for Sample-Efficient Adaptation","summary":" We propose Neural Priming, a technique for adapting large pretrained models\nto distribution shifts and downstream tasks given few or no labeled examples.\nPresented with class names or unlabeled test samples, Neural Priming enables\nthe model to recall and conditions its parameters on relevant data seen\nthroughout pretraining, thereby priming it for the test distribution. Neural\nPriming can be performed at test time, even for pretraining datasets as large\nas LAION-2B. Performing lightweight updates on the recalled data significantly\nimproves accuracy across a variety of distribution shift and transfer learning\nbenchmarks. Concretely, in the zero-shot setting, we see a 2.45% improvement in\naccuracy on ImageNet and 3.81% accuracy improvement on average across standard\ntransfer learning benchmarks. Further, using Neural Priming at inference to\nadapt to distribution shift, we see a 1.41% accuracy improvement on ImageNetV2.\nThese results demonstrate the effectiveness of Neural Priming in addressing the\nchallenge of limited labeled data and changing distributions. Code is available\nat github.com/RAIVNLab/neural-priming.\n","authors":["Matthew Wallingford","Vivek Ramanujan","Alex Fang","Aditya Kusupati","Roozbeh Mottaghi","Aniruddha Kembhavi","Ludwig Schmidt","Ali Farhadi"],"pdf_url":"https://arxiv.org/pdf/2306.10191v3.pdf","comment":"18 pages, 7 figures, 9 tables"},{"id":"http://arxiv.org/abs/2311.11602v3","updated":"2023-12-05T00:07:12Z","published":"2023-11-20T08:29:55Z","title":"A Multi-In-Single-Out Network for Video Frame Interpolation without\n Optical Flow","summary":" In general, deep learning-based video frame interpolation (VFI) methods have\npredominantly focused on estimating motion vectors between two input frames and\nwarping them to the target time. 
While this approach has shown impressive\nperformance for linear motion between two input frames, it exhibits limitations\nwhen dealing with occlusions and nonlinear movements. Recently, generative\nmodels have been applied to VFI to address these issues. However, as VFI is not\na task focused on generating plausible images, but rather on predicting\naccurate intermediate frames between two given frames, performance limitations\nstill persist. In this paper, we propose a multi-in-single-out (MISO) based VFI\nmethod that does not rely on motion vector estimation, allowing it to\neffectively model occlusions and nonlinear motion. Additionally, we introduce a\nnovel motion perceptual loss that enables MISO-VFI to better capture the\nspatio-temporal correlations within the video frames. Our MISO-VFI method\nachieves state-of-the-art results on VFI benchmarks Vimeo90K, Middlebury, and\nUCF101, with a significant performance gap compared to existing approaches.\n","authors":["Jaemin Lee","Minseok Seo","Sangwoo Lee","Hyobin Park","Dong-Geol Choi"],"pdf_url":"https://arxiv.org/pdf/2311.11602v3.pdf","comment":"Discovering a problem with the manuscript"},{"id":"http://arxiv.org/abs/2208.03934v3","updated":"2023-12-05T23:59:59Z","published":"2022-08-08T06:31:00Z","title":"Inflating 2D Convolution Weights for Efficient Generation of 3D Medical\n Images","summary":" The generation of three-dimensional (3D) medical images has great application\npotential since it takes into account the 3D anatomical structure. Two problems\nprevent effective training of a 3D medical generative model: (1) 3D medical\nimages are expensive to acquire and annotate, resulting in an insufficient\nnumber of training images, and (2) a large number of parameters are involved in\n3D convolution.\n Methods: We propose a novel GAN model called 3D Split&Shuffle-GAN. To address\nthe 3D data scarcity issue, we first pre-train a two-dimensional (2D) GAN model\nusing abundant image slices and inflate the 2D convolution weights to improve\nthe initialization of the 3D GAN. Novel 3D network architectures are proposed\nfor both the generator and discriminator of the GAN model to significantly\nreduce the number of parameters while maintaining the quality of image\ngeneration. Several weight inflation strategies and parameter-efficient 3D\narchitectures are investigated.\n Results: Experiments on both heart (Stanford AIMI Coronary Calcium) and brain\n(Alzheimer's Disease Neuroimaging Initiative) datasets show that our method\nleads to improved 3D image generation quality (14.7 improvements on Fr\\'echet\ninception distance) with significantly fewer parameters (only 48.5% of the\nbaseline method).\n Conclusions: We built a parameter-efficient 3D medical image generation\nmodel. Due to the efficiency and effectiveness, it has the potential to\ngenerate high-quality 3D brain and heart images for real use cases.\n","authors":["Yanbin Liu","Girish Dwivedi","Farid Boussaid","Frank Sanfilippo","Makoto Yamada","Mohammed Bennamoun"],"pdf_url":"https://arxiv.org/pdf/2208.03934v3.pdf","comment":"Published at Computer Methods and Programs in Biomedicine (CMPB) 2023"},{"id":"http://arxiv.org/abs/2312.03187v1","updated":"2023-12-05T23:33:49Z","published":"2023-12-05T23:33:49Z","title":"FERGI: Automatic Annotation of User Preferences for Text-to-Image\n Generation from Spontaneous Facial Expression Reaction","summary":" Researchers have proposed to use data of human preference feedback to\nfine-tune text-to-image generative models. 
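The 3D Split&Shuffle-GAN entry above initializes 3D convolutions by inflating pretrained 2D kernels. Below is the simplest such inflation (repeat the 2D kernel along the new depth axis and rescale so the summed response matches the original filter); the paper studies several inflation strategies, of which this is only one, and the shapes here are stand-ins.

```python
import numpy as np

def inflate_2d_to_3d(w2d, depth):
    """Inflate (out, in, kH, kW) 2D conv weights to (out, in, depth, kH, kW)."""
    w3d = np.repeat(w2d[:, :, None, :, :], depth, axis=2)
    return w3d / depth   # rescale so activations keep a similar magnitude

w2d = np.random.default_rng(9).normal(size=(32, 16, 3, 3))   # stand-in pretrained 2D kernels
w3d = inflate_2d_to_3d(w2d, depth=3)
print(w3d.shape)                                  # (32, 16, 3, 3, 3)
print(np.allclose(w3d.sum(axis=2), w2d))          # depth-sum reproduces the 2D kernel
```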
However, the scalability of human\nfeedback collection has been limited by its reliance on manual annotation.\nTherefore, we develop and test a method to automatically annotate user\npreferences from their spontaneous facial expression reaction to the generated\nimages. We collect a dataset of Facial Expression Reaction to Generated Images\n(FERGI) and show that the activations of multiple facial action units (AUs) are\nhighly correlated with user evaluations of the generated images. Specifically,\nAU4 (brow lowerer) is most consistently reflective of negative evaluations of\nthe generated image. This can be useful in two ways. Firstly, we can\nautomatically annotate user preferences between image pairs with substantial\ndifference in AU4 responses to them with an accuracy significantly\noutperforming state-of-the-art scoring models. Secondly, directly integrating\nthe AU4 responses with the scoring models improves their consistency with human\npreferences. Additionally, the AU4 response best reflects the user's evaluation\nof the image fidelity, making it complementary to the state-of-the-art scoring\nmodels, which are generally better at reflecting image-text alignment. Finally,\nthis method of automatic annotation with facial expression analysis can be\npotentially generalized to other generation tasks. The code is available at\nhttps://github.com/ShuangquanFeng/FERGI, and the dataset is also available at\nthe same link for research purposes.\n","authors":["Shuangquan Feng","Junhua Ma","Virginia R. de Sa"],"pdf_url":"https://arxiv.org/pdf/2312.03187v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.04979v3","updated":"2023-12-05T23:05:32Z","published":"2022-06-10T10:28:55Z","title":"Convolutional layers are equivariant to discrete shifts but not\n continuous translations","summary":" The purpose of this short and simple note is to clarify a common\nmisconception about convolutional neural networks (CNNs). CNNs are made up of\nconvolutional layers which are shift equivariant due to weight sharing.\nHowever, convolutional layers are not translation equivariant, even when\nboundary effects are ignored and when pooling and subsampling are absent. This\nis because shift equivariance is a discrete symmetry while translation\nequivariance is a continuous symmetry. This fact is well known among\nresearchers in equivariant machine learning, but is usually overlooked among\nnon-experts. To minimize confusion, we suggest using the term `shift\nequivariance' to refer to discrete shifts in pixels and `translation\nequivariance' to refer to continuous translations.\n","authors":["Nick McGreivy","Ammar Hakim"],"pdf_url":"https://arxiv.org/pdf/2206.04979v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.02890v5","updated":"2023-12-05T22:16:26Z","published":"2022-09-07T02:23:40Z","title":"Data-Driven Target Localization Using Adaptive Radar Processing and\n Convolutional Neural Networks","summary":" Leveraging the advanced functionalities of modern radio frequency (RF)\nmodeling and simulation tools, specifically designed for adaptive radar\nprocessing applications, this paper presents a data-driven approach to improve\naccuracy in radar target localization post adaptive radar detection. To this\nend, we generate a large number of radar returns by randomly placing targets of\nvariable strengths in a predefined area, using RFView, a high-fidelity,\nsite-specific, RF modeling & simulation tool. 
We produce heatmap tensors from\nthe radar returns, in range, azimuth [and Doppler], of the normalized adaptive\nmatched filter (NAMF) test statistic. We then train a regression convolutional\nneural network (CNN) to estimate target locations from these heatmap tensors,\nand we compare the target localization accuracy of this approach with that of\npeak-finding and local search methods. This empirical study shows that our\nregression CNN achieves a considerable improvement in target location\nestimation accuracy. The regression CNN offers significant gains and reasonable\naccuracy even at signal-to-clutter-plus-noise ratio (SCNR) regimes that are\nclose to the breakdown threshold SCNR of the NAMF. We also study the robustness\nof our trained CNN to mismatches in the radar data, where the CNN is tested on\nheatmap tensors collected from areas that it was not trained on. We show that\nour CNN can be made robust to mismatches in the radar data through few-shot\nlearning, using a relatively small number of new training samples.\n","authors":["Shyam Venkatasubramanian","Sandeep Gogineni","Bosung Kang","Ali Pezeshki","Muralidhar Rangaswamy","Vahid Tarokh"],"pdf_url":"https://arxiv.org/pdf/2209.02890v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03160v1","updated":"2023-12-05T22:04:49Z","published":"2023-12-05T22:04:49Z","title":"HybridNeRF: Efficient Neural Rendering via Adaptive Volumetric Surfaces","summary":" Neural radiance fields provide state-of-the-art view synthesis quality but\ntend to be slow to render. One reason is that they make use of volume\nrendering, thus requiring many samples (and model queries) per ray at render\ntime. Although this representation is flexible and easy to optimize, most\nreal-world objects can be modeled more efficiently with surfaces instead of\nvolumes, requiring far fewer samples per ray. This observation has spurred\nconsiderable progress in surface representations such as signed distance\nfunctions, but these may struggle to model semi-opaque and thin structures. We\npropose a method, HybridNeRF, that leverages the strengths of both\nrepresentations by rendering most objects as surfaces while modeling the\n(typically) small fraction of challenging regions volumetrically. We evaluate\nHybridNeRF against the challenging Eyeful Tower dataset along with other\ncommonly used view synthesis datasets. When comparing to state-of-the-art\nbaselines, including recent rasterization-based approaches, we improve error\nrates by 15-30% while achieving real-time framerates (at least 36 FPS) for\nvirtual-reality resolutions (2Kx2K).\n","authors":["Haithem Turki","Vasu Agrawal","Samuel Rota Bulò","Lorenzo Porzi","Peter Kontschieder","Deva Ramanan","Michael Zollhöfer","Christian Richardt"],"pdf_url":"https://arxiv.org/pdf/2312.03160v1.pdf","comment":"Project page: https://haithemturki.com/hybrid-nerf/"},{"id":"http://arxiv.org/abs/2311.14948v3","updated":"2023-12-05T21:52:47Z","published":"2023-11-25T06:55:13Z","title":"Effective Backdoor Mitigation Depends on the Pre-training Objective","summary":" Despite the advanced capabilities of contemporary machine learning (ML)\nmodels, they remain vulnerable to adversarial and backdoor attacks. 
This\nvulnerability is particularly concerning in real-world deployments, where\ncompromised models may exhibit unpredictable behavior in critical scenarios.\nSuch risks are heightened by the prevalent practice of collecting massive,\ninternet-sourced datasets for pre-training multimodal models, as these datasets\nmay harbor backdoors. Various techniques have been proposed to mitigate the\neffects of backdooring in these models such as CleanCLIP which is the current\nstate-of-the-art approach. In this work, we demonstrate that the efficacy of\nCleanCLIP in mitigating backdoors is highly dependent on the particular\nobjective used during model pre-training. We observe that stronger pre-training\nobjectives correlate with harder to remove backdoors behaviors. We show this by\ntraining multimodal models on two large datasets consisting of 3 million (CC3M)\nand 6 million (CC6M) datapoints, under various pre-training objectives,\nfollowed by poison removal using CleanCLIP. We find that CleanCLIP is\nineffective when stronger pre-training objectives are used, even with extensive\nhyperparameter tuning. Our findings underscore critical considerations for ML\npractitioners who pre-train models using large-scale web-curated data and are\nconcerned about potential backdoor threats. Notably, our results suggest that\nsimpler pre-training objectives are more amenable to effective backdoor\nremoval. This insight is pivotal for practitioners seeking to balance the\ntrade-offs between using stronger pre-training objectives and security against\nbackdoor attacks.\n","authors":["Sahil Verma","Gantavya Bhatt","Avi Schwarzschild","Soumye Singhal","Arnav Mohanty Das","Chirag Shah","John P Dickerson","Jeff Bilmes"],"pdf_url":"https://arxiv.org/pdf/2311.14948v3.pdf","comment":"Accepted for oral presentation at BUGS workshop @ NeurIPS 2023\n (https://neurips2023-bugs.github.io/)"},{"id":"http://arxiv.org/abs/2312.03154v1","updated":"2023-12-05T21:41:17Z","published":"2023-12-05T21:41:17Z","title":"ViscoNet: Bridging and Harmonizing Visual and Textual Conditioning for\n ControlNet","summary":" This paper introduces ViscoNet, a novel method that enhances text-to-image\nhuman generation models with visual prompting. Unlike existing methods that\nrely on lengthy text descriptions to control the image structure, ViscoNet\nallows users to specify the visual appearance of the target object with a\nreference image. ViscoNet disentangles the object's appearance from the image\nbackground and injects it into a pre-trained latent diffusion model (LDM) model\nvia a ControlNet branch. This way, ViscoNet mitigates the style mode collapse\nproblem and enables precise and flexible visual control. We demonstrate the\neffectiveness of ViscoNet on human image generation, where it can manipulate\nvisual attributes and artistic styles with text and image prompts. 
We also show\nthat ViscoNet can learn visual conditioning from small and specific object\ndomains while preserving the generative power of the LDM backbone.\n","authors":["Soon Yau Cheong","Armin Mustafa","Andrew Gilbert"],"pdf_url":"https://arxiv.org/pdf/2312.03154v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03133v1","updated":"2023-12-05T21:00:08Z","published":"2023-12-05T21:00:08Z","title":"Predicting Bone Degradation Using Vision Transformer and Synthetic\n Cellular Microstructures Dataset","summary":" Bone degradation, especially for astronauts in microgravity conditions, is\ncrucial for space exploration missions since the lower applied external forces\naccelerate the diminution in bone stiffness and strength substantially.\nAlthough existing computational models help us understand this phenomenon and\npossibly restrict its effect in the future, they are time-consuming to simulate\nthe changes in the bones, not just the bone microstructures, of each individual\nin detail. In this study, a robust yet fast computational method to predict and\nvisualize bone degradation has been developed. Our deep-learning method,\nTransVNet, can take in different 3D voxelized images and predict their\nevolution throughout months utilizing a hybrid 3D-CNN-VisionTransformer\nautoencoder architecture. Because of limited available experimental data and\nchallenges of obtaining new samples, a digital twin dataset of diverse and\ninitial bone-like microstructures was generated to train our TransVNet on the\nevolution of the 3D images through a previously developed degradation model for\nmicrogravity.\n","authors":["Mohammad Saber Hashemi","Azadeh Sheidaei"],"pdf_url":"https://arxiv.org/pdf/2312.03133v1.pdf","comment":"8 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.03119v1","updated":"2023-12-05T20:37:38Z","published":"2023-12-05T20:37:38Z","title":"AI-SAM: Automatic and Interactive Segment Anything Model","summary":" Semantic segmentation is a core task in computer vision. Existing methods are\ngenerally divided into two categories: automatic and interactive. Interactive\napproaches, exemplified by the Segment Anything Model (SAM), have shown promise\nas pre-trained models. However, current adaptation strategies for these models\ntend to lean towards either automatic or interactive approaches. Interactive\nmethods depend on prompts user input to operate, while automatic ones bypass\nthe interactive promptability entirely. Addressing these limitations, we\nintroduce a novel paradigm and its first model: the Automatic and Interactive\nSegment Anything Model (AI-SAM). In this paradigm, we conduct a comprehensive\nanalysis of prompt quality and introduce the pioneering Automatic and\nInteractive Prompter (AI-Prompter) that automatically generates initial point\nprompts while accepting additional user inputs. Our experimental results\ndemonstrate AI-SAM's effectiveness in the automatic setting, achieving\nstate-of-the-art performance. Significantly, it offers the flexibility to\nincorporate additional user prompts, thereby further enhancing its performance.\nThe project page is available at https://github.com/ymp5078/AI-SAM.\n","authors":["Yimu Pan","Sitao Zhang","Alison D. Gernand","Jeffery A. Goldstein","James Z. 
Wang"],"pdf_url":"https://arxiv.org/pdf/2312.03119v1.pdf","comment":"11 pages, 9 figures"},{"id":"http://arxiv.org/abs/2312.03110v1","updated":"2023-12-05T20:12:31Z","published":"2023-12-05T20:12:31Z","title":"The Automated Bias Triangle Feature Extraction Framework","summary":" Bias triangles represent features in stability diagrams of Quantum Dot (QD)\ndevices, whose occurrence and property analysis are crucial indicators for spin\nphysics. Nevertheless, challenges associated with quality and availability of\ndata as well as the subtlety of physical phenomena of interest have hindered an\nautomatic and bespoke analysis framework, often still relying (in part) on\nhuman labelling and verification. We introduce a feature extraction framework\nfor bias triangles, built from unsupervised, segmentation-based computer vision\nmethods, which facilitates the direct identification and quantification of\nphysical properties of the former. Thereby, the need for human input or large\ntraining datasets to inform supervised learning approaches is circumvented,\nwhile additionally enabling the automation of pixelwise shape and feature\nlabeling. In particular, we demonstrate that Pauli Spin Blockade (PSB)\ndetection can be conducted effectively, efficiently and without any training\ndata as a direct result of this approach.\n","authors":["Madeleine Kotzagiannidis","Jonas Schuff","Nathan Korda"],"pdf_url":"https://arxiv.org/pdf/2312.03110v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.08769v2","updated":"2023-12-05T20:08:25Z","published":"2023-09-15T21:22:51Z","title":"The Use of Multi-Scale Fiducial Markers To Aid Takeoff and Landing\n Navigation by Rotorcraft","summary":" This paper quantifies the performance of visual SLAM that leverages\nmulti-scale fiducial markers (i.e., artificial landmarks that can be detected\nat a wide range of distances) to show its potential for reliable takeoff and\nlanding navigation in rotorcraft. Prior work has shown that square markers with\na black-and-white pattern of grid cells can be used to improve the performance\nof visual SLAM with color cameras. We extend this prior work to allow nested\nmarker layouts. We evaluate performance during semi-autonomous takeoff and\nlanding operations in a variety of environmental conditions by a DJI Matrice\n300 RTK rotorcraft with two FLIR Blackfly color cameras, using RTK GNSS to\nobtain ground truth pose estimates. Performance measures include absolute\ntrajectory error and the fraction of the number of estimated poses to the total\nframe. We release all of our results -- our dataset and the code of the\nimplementation of the visual SLAM with fiducial markers -- to the public as\nopen-source.\n","authors":["Jongwon Lee","Su Yeon Choi","Timothy Bretl"],"pdf_url":"https://arxiv.org/pdf/2309.08769v2.pdf","comment":"Extended abstract accepted at the 2024 AIAA SciTech"},{"id":"http://arxiv.org/abs/2312.03102v1","updated":"2023-12-05T19:45:44Z","published":"2023-12-05T19:45:44Z","title":"Fully Convolutional Slice-to-Volume Reconstruction for Single-Stack MRI","summary":" In magnetic resonance imaging (MRI), slice-to-volume reconstruction (SVR)\nrefers to computational reconstruction of an unknown 3D magnetic resonance\nvolume from stacks of 2D slices corrupted by motion. While promising, current\nSVR methods require multiple slice stacks for accurate 3D reconstruction,\nleading to long scans and limiting their use in time-sensitive applications\nsuch as fetal fMRI. 
Here, we propose a SVR method that overcomes the\nshortcomings of previous work and produces state-of-the-art reconstructions in\nthe presence of extreme inter-slice motion. Inspired by the recent success of\nsingle-view depth estimation methods, we formulate SVR as a single-stack motion\nestimation task and train a fully convolutional network to predict a motion\nstack for a given slice stack, producing a 3D reconstruction as a byproduct of\nthe predicted motion. Extensive experiments on the SVR of adult and fetal\nbrains demonstrate that our fully convolutional method is twice as accurate as\nprevious SVR methods. Our code is available at github.com/seannz/svr.\n","authors":["Sean I. Young","Yaël Balbastre","Bruce Fischl","Polina Golland","Juan Eugenio Iglesias"],"pdf_url":"https://arxiv.org/pdf/2312.03102v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03085v1","updated":"2023-12-05T19:03:33Z","published":"2023-12-05T19:03:33Z","title":"ScAR: Scaling Adversarial Robustness for LiDAR Object Detection","summary":" The adversarial robustness of a model is its ability to resist adversarial\nattacks in the form of small perturbations to input data. Universal adversarial\nattack methods such as Fast Sign Gradient Method (FSGM) and Projected Gradient\nDescend (PGD) are popular for LiDAR object detection, but they are often\ndeficient compared to task-specific adversarial attacks. Additionally, these\nuniversal methods typically require unrestricted access to the model's\ninformation, which is difficult to obtain in real-world applications. To\naddress these limitations, we present a black-box Scaling Adversarial\nRobustness (ScAR) method for LiDAR object detection. By analyzing the\nstatistical characteristics of 3D object detection datasets such as KITTI,\nWaymo, and nuScenes, we have found that the model's prediction is sensitive to\nscaling of 3D instances. We propose three black-box scaling adversarial attack\nmethods based on the available information: model-aware attack,\ndistribution-aware attack, and blind attack. We also introduce a strategy for\ngenerating scaling adversarial examples to improve the model's robustness\nagainst these three scaling adversarial attacks. Comparison with other methods\non public datasets under different 3D object detection architectures\ndemonstrates the effectiveness of our proposed method.\n","authors":["Xiaohu Lu","Hayder Radha"],"pdf_url":"https://arxiv.org/pdf/2312.03085v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03079v1","updated":"2023-12-05T19:00:20Z","published":"2023-12-05T19:00:20Z","title":"LooseControl: Lifting ControlNet for Generalized Depth Conditioning","summary":" We present LooseControl to allow generalized depth conditioning for\ndiffusion-based image generation. ControlNet, the SOTA for depth-conditioned\nimage generation, produces remarkable results but relies on having access to\ndetailed depth maps for guidance. Creating such exact depth maps, in many\nscenarios, is challenging. This paper introduces a generalized version of depth\nconditioning that enables many new content-creation workflows. Specifically, we\nallow (C1) scene boundary control for loosely specifying scenes with only\nboundary conditions, and (C2) 3D box control for specifying layout locations of\nthe target objects rather than the exact shape and appearance of the objects.\nUsing LooseControl, along with text guidance, users can create complex\nenvironments (e.g., rooms, street views, etc.) 
by specifying only scene\nboundaries and locations of primary objects. Further, we provide two editing\nmechanisms to refine the results: (E1) 3D box editing enables the user to\nrefine images by changing, adding, or removing boxes while freezing the style\nof the image. This yields minimal changes apart from changes induced by the\nedited boxes. (E2) Attribute editing proposes possible editing directions to\nchange one particular aspect of the scene, such as the overall object density\nor a particular object. Extensive tests and comparisons with baselines\ndemonstrate the generality of our method. We believe that LooseControl can\nbecome an important design tool for easily creating complex environments and be\nextended to other forms of guidance channels. Code and more information are\navailable at https://shariqfarooq123.github.io/loose-control/ .\n","authors":["Shariq Farooq Bhat","Niloy J. Mitra","Peter Wonka"],"pdf_url":"https://arxiv.org/pdf/2312.03079v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03053v1","updated":"2023-12-05T18:59:41Z","published":"2023-12-05T18:59:41Z","title":"DiffusionPCR: Diffusion Models for Robust Multi-Step Point Cloud\n Registration","summary":" Point Cloud Registration (PCR) estimates the relative rigid transformation\nbetween two point clouds. We propose formulating PCR as a denoising diffusion\nprobabilistic process, mapping noisy transformations to the ground truth.\nHowever, using diffusion models for PCR has nontrivial challenges, such as\nadapting a generative model to a discriminative task and leveraging the\nestimated nonlinear transformation from the previous step. Instead of training\na diffusion model to directly map pure noise to ground truth, we map the\npredictions of an off-the-shelf PCR model to ground truth. The predictions of\noff-the-shelf models are often imperfect, especially in challenging cases where\nthe two point clouds have low overlap, and thus could be seen as noisy\nversions of the real rigid transformation. In addition, we transform the\nrotation matrix into a spherical linear space for interpolation between samples\nin the forward process, and convert rigid transformations into auxiliary\ninformation to implicitly exploit last-step estimations in the reverse process.\nAs a result, conditioned on time step, the denoising model adapts to the\nincreasing accuracy across steps and refines registrations. Our extensive\nexperiments showcase the effectiveness of our DiffusionPCR, yielding\nstate-of-the-art registration recall rates (95.3%/81.6%) on 3DMatch and\n3DLoMatch. The code will be made public upon publication.\n","authors":["Zhi Chen","Yufan Ren","Tong Zhang","Zheng Dang","Wenbing Tao","Sabine Süsstrunk","Mathieu Salzmann"],"pdf_url":"https://arxiv.org/pdf/2312.03053v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03052v1","updated":"2023-12-05T18:58:37Z","published":"2023-12-05T18:58:37Z","title":"Visual Program Distillation: Distilling Tools and Programmatic Reasoning\n into Vision-Language Models","summary":" Solving complex visual tasks such as \"Who invented the musical instrument on\nthe right?\" involves a composition of skills: understanding space, recognizing\ninstruments, and also retrieving prior knowledge. Recent work shows promise by\ndecomposing such tasks using a large language model (LLM) into an executable\nprogram that invokes specialized vision models. 
However, generated programs are\nerror-prone: they omit necessary steps, include spurious ones, and are unable\nto recover when the specialized models give incorrect outputs. Moreover, they\nrequire loading multiple models, incurring high latency and computation costs.\nWe propose Visual Program Distillation (VPD), an instruction tuning framework\nthat produces a vision-language model (VLM) capable of solving complex visual\ntasks with a single forward pass. VPD distills the reasoning ability of LLMs by\nusing them to sample multiple candidate programs, which are then executed and\nverified to identify a correct one. It translates each correct program into a\nlanguage description of the reasoning steps, which are then distilled into a\nVLM. Extensive experiments show that VPD improves the VLM's ability to count,\nunderstand spatial relations, and reason compositionally. Our VPD-trained\nPaLI-X outperforms all prior VLMs, achieving state-of-the-art performance\nacross complex vision tasks, including MMBench, OK-VQA, A-OKVQA, TallyQA, POPE,\nand Hateful Memes. An evaluation with human annotators also confirms that VPD\nimproves model response factuality and consistency. Finally, experiments on\ncontent moderation demonstrate that VPD is also helpful for adaptation to\nreal-world applications with limited data.\n","authors":["Yushi Hu","Otilia Stretcu","Chun-Ta Lu","Krishnamurthy Viswanathan","Kenji Hata","Enming Luo","Ranjay Krishna","Ariel Fuxman"],"pdf_url":"https://arxiv.org/pdf/2312.03052v1.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2312.02969v1","updated":"2023-12-05T18:57:40Z","published":"2023-12-05T18:57:40Z","title":"Rank-without-GPT: Building GPT-Independent Listwise Rerankers on\n Open-Source Large Language Models","summary":" Listwise rerankers based on large language models (LLM) are the zero-shot\nstate-of-the-art. However, current works in this direction all depend on the\nGPT models, making it a single point of failure in scientific reproducibility.\nMoreover, it raises the concern that the current research findings only hold\nfor GPT models but not LLM in general. In this work, we lift this pre-condition\nand build for the first time effective listwise rerankers without any form of\ndependency on GPT. Our passage retrieval experiments show that our best listwise\nreranker surpasses the listwise rerankers based on GPT-3.5 by 13% and achieves\n97% effectiveness of the ones built on GPT-4. Our results also show that the\nexisting training datasets, which were expressly constructed for pointwise\nranking, are insufficient for building such listwise rerankers. 
Instead,\nhigh-quality listwise ranking data is required and crucial, calling for further\nwork on building human-annotated listwise data resources.\n","authors":["Xinyu Zhang","Sebastian Hofstätter","Patrick Lewis","Raphael Tang","Jimmy Lin"],"pdf_url":"https://arxiv.org/pdf/2312.02969v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02913v1","updated":"2023-12-05T17:38:02Z","published":"2023-12-05T17:38:02Z","title":"Let the LLMs Talk: Simulating Human-to-Human Conversational QA via\n Zero-Shot LLM-to-LLM Interactions","summary":" Conversational question-answering (CQA) systems aim to create interactive\nsearch systems that effectively retrieve information by interacting with users.\nTo replicate human-to-human conversations, existing work uses human annotators\nto play the roles of the questioner (student) and the answerer (teacher).\nDespite its effectiveness, challenges exist as human annotation is\ntime-consuming, inconsistent, and not scalable. To address this issue and\ninvestigate the applicability of large language models (LLMs) in CQA\nsimulation, we propose a simulation framework that employs zero-shot learner\nLLMs for simulating teacher-student interactions. Our framework involves two\nLLMs interacting on a specific topic, with the first LLM acting as a student,\ngenerating questions to explore a given search topic. The second LLM plays the\nrole of a teacher by answering questions and is equipped with additional\ninformation, including a text on the given topic. We implement both the student\nand teacher by zero-shot prompting the GPT-4 model. To assess the effectiveness\nof LLMs in simulating CQA interactions and understand the disparities between\nLLM- and human-generated conversations, we evaluate the simulated data from\nvarious perspectives. We begin by evaluating the teacher's performance through\nboth automatic and human assessment. Next, we evaluate the performance of the\nstudent, analyzing and comparing the disparities between questions generated by\nthe LLM and those generated by humans. Furthermore, we conduct extensive\nanalyses to thoroughly examine the LLM performance by benchmarking\nstate-of-the-art reading comprehension models on both datasets. Our results\nreveal that the teacher LLM generates lengthier answers that tend to be more\naccurate and complete. The student LLM generates more diverse questions,\ncovering more aspects of a given topic.\n","authors":["Zahra Abbasiantaeb","Yifei Yuan","Evangelos Kanoulas","Mohammad Aliannejadi"],"pdf_url":"https://arxiv.org/pdf/2312.02913v1.pdf","comment":"Accepted at WSDM 2024"},{"id":"http://arxiv.org/abs/2312.02901v1","updated":"2023-12-05T17:15:16Z","published":"2023-12-05T17:15:16Z","title":"Concept Drift Adaptation in Text Stream Mining Settings: A Comprehensive\n Review","summary":" Due to the advent and increase in the popularity of the Internet, people have\nbeen producing and disseminating textual data in several ways, such as reviews,\nsocial media posts, and news articles. As a result, numerous researchers have\nbeen working on discovering patterns in textual data, especially because social\nmedia posts function as social sensors, indicating peoples' opinions,\ninterests, etc. However, most tasks regarding natural language processing are\naddressed using traditional machine learning methods and static datasets. This\nsetting can lead to several problems, such as an outdated dataset, which may\nnot correspond to reality, and an outdated model, which has its performance\ndegrading over time. 
Concept drift is another aspect that emphasizes these\nissues, which corresponds to data distribution and pattern changes. In a text\nstream scenario, it is even more challenging due to its characteristics, such\nas the high speed and data arriving sequentially. In addition, models for this\ntype of scenario must adhere to the constraints mentioned above while learning\nfrom the stream by storing texts for a limited time and consuming low memory.\nIn this study, we performed a systematic literature review regarding concept\ndrift adaptation in text stream scenarios. Considering well-defined criteria,\nwe selected 40 papers to unravel aspects such as text drift categories, types\nof text drift detection, model update mechanism, the addressed stream mining\ntasks, types of text representations, and text representation update mechanism.\nIn addition, we discussed drift visualization and simulation and listed\nreal-world datasets used in the selected papers. Therefore, this paper\ncomprehensively reviews the concept drift adaptation in text stream mining\nscenarios.\n","authors":["Cristiano Mesquita Garcia","Ramon Simoes Abilio","Alessandro Lameiras Koerich","Alceu de Souza Britto Jr.","Jean Paul Barddal"],"pdf_url":"https://arxiv.org/pdf/2312.02901v1.pdf","comment":"49 pages"},{"id":"http://arxiv.org/abs/2312.02724v1","updated":"2023-12-05T12:39:00Z","published":"2023-12-05T12:39:00Z","title":"RankZephyr: Effective and Robust Zero-Shot Listwise Reranking is a\n Breeze!","summary":" In information retrieval, proprietary large language models (LLMs) such as\nGPT-4 and open-source counterparts such as LLaMA and Vicuna have played a vital\nrole in reranking. However, the gap between open-source and closed models\npersists, with reliance on proprietary, non-transparent models constraining\nreproducibility. Addressing this gap, we introduce RankZephyr, a\nstate-of-the-art, open-source LLM for listwise zero-shot reranking. RankZephyr\nnot only bridges the effectiveness gap with GPT-4 but in some cases surpasses\nthe proprietary model. Our comprehensive evaluations across several datasets\n(TREC Deep Learning Tracks; NEWS and COVID from BEIR) showcase this ability.\nRankZephyr benefits from strategic training choices and is resilient against\nvariations in initial document ordering and the number of documents reranked.\nAdditionally, our model outperforms GPT-4 on the NovelEval test set, comprising\nqueries and passages past its training period, which addresses concerns about\ndata contamination. To foster further research in this rapidly evolving field,\nwe provide all code necessary to reproduce our results at\nhttps://github.com/castorini/rank_llm.\n","authors":["Ronak Pradeep","Sahel Sharifymoghaddam","Jimmy Lin"],"pdf_url":"https://arxiv.org/pdf/2312.02724v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12810v3","updated":"2023-12-05T11:47:49Z","published":"2023-07-24T14:00:07Z","title":"HeteFedRec: Federated Recommender Systems with Model Heterogeneity","summary":" Owing to the nature of privacy protection, federated recommender systems\n(FedRecs) have garnered increasing interest in the realm of on-device\nrecommender systems. However, most existing FedRecs only allow participating\nclients to collaboratively train a recommendation model of the same public\nparameter size. Training a model of the same size for all clients can lead to\nsuboptimal performance since clients possess varying resources. 
For example,\nclients with limited training data may prefer to train a smaller recommendation\nmodel to avoid excessive data consumption, while clients with sufficient data\nwould benefit from a larger model to achieve higher recommendation accuracy. To\naddress the above challenge, this paper introduces HeteFedRec, a novel FedRec\nframework that enables the assignment of personalized model sizes to\nparticipants. In HeteFedRec, we present a heterogeneous recommendation model\naggregation strategy, including a unified dual-task learning mechanism and a\ndimensional decorrelation regularization, to allow knowledge aggregation among\nrecommender models of different sizes. Additionally, a relation-based ensemble\nknowledge distillation method is proposed to effectively distil knowledge from\nheterogeneous item embeddings. Extensive experiments conducted on three\nreal-world recommendation datasets demonstrate the effectiveness and efficiency\nof HeteFedRec in training federated recommender systems under heterogeneous\nsettings.\n","authors":["Wei Yuan","Liang Qu","Lizhen Cui","Yongxin Tong","Xiaofang Zhou","Hongzhi Yin"],"pdf_url":"https://arxiv.org/pdf/2307.12810v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02550v1","updated":"2023-12-05T07:37:43Z","published":"2023-12-05T07:37:43Z","title":"An empirical study of next-basket recommendations","summary":" Next Basket Recommender Systems (NBRs) function to recommend the subsequent\nshopping baskets for users through the modeling of their preferences derived\nfrom purchase history, typically manifested as a sequence of historical\nbaskets. Given their widespread applicability in the E-commerce industry,\ninvestigations into NBRs have garnered increased attention in recent years.\nDespite the proliferation of diverse NBR methodologies, a substantial challenge\nlies in the absence of a systematic and unified evaluation framework across\nthese methodologies. Various studies frequently appraise NBR approaches using\ndisparate datasets and diverse experimental settings, impeding a fair and\neffective comparative assessment of methodological performance. To bridge this\ngap, this study undertakes a systematic empirical inquiry into NBRs, reviewing\nseminal works within the domain and scrutinizing their respective merits and\ndrawbacks. Subsequently, we implement designated NBR algorithms on uniform\ndatasets, employing consistent experimental configurations, and assess their\nperformances via identical metrics. This methodological rigor establishes a\ncohesive framework for the impartial evaluation of diverse NBR approaches. It\nis anticipated that this study will furnish a robust foundation and serve as a\npivotal reference for forthcoming research endeavors in this dynamic field.\n","authors":["Zhufeng Shao","Shoujin Wang","Qian Zhang","Wenpeng Lu","Zhao Li","Xueping Peng"],"pdf_url":"https://arxiv.org/pdf/2312.02550v1.pdf","comment":"6 pages, 2 figures"},{"id":"http://arxiv.org/abs/2312.02538v1","updated":"2023-12-05T07:08:08Z","published":"2023-12-05T07:08:08Z","title":"A Multi-Granularity-Aware Aspect Learning Model for Multi-Aspect Dense\n Retrieval","summary":" Dense retrieval methods have been mostly focused on unstructured text and\nless attention has been drawn to structured data with various aspects, e.g.,\nproducts with aspects such as category and brand. 
Recent work has proposed two\napproaches to incorporate the aspect information into item representations for\neffective retrieval by predicting the values associated with the item aspects.\nDespite their efficacy, they treat the values as isolated classes (e.g., \"Smart\nHomes\", \"Home, Garden & Tools\", and \"Beauty & Health\") and ignore their\nfine-grained semantic relation. Furthermore, they either enforce the learning\nof aspects into the CLS token, which could confuse it from its designated use\nfor representing the entire content semantics, or learn extra aspect embeddings\nonly with the value prediction objective, which could be insufficient\nespecially when there are no annotated values for an item aspect. Aware of\nthese limitations, we propose a MUlti-granulaRity-aware Aspect Learning model\n(MURAL) for multi-aspect dense retrieval. It leverages aspect information\nacross various granularities to capture both coarse and fine-grained semantic\nrelations between values. Moreover, MURAL incorporates separate aspect\nembeddings as input to transformer encoders so that the masked language model\nobjective can assist implicit aspect learning even without aspect-value\nannotations. Extensive experiments on two real-world datasets of products and\nmini-programs show that MURAL outperforms state-of-the-art baselines\nsignificantly.\n","authors":["Xiaojie Sun","Keping Bi","Jiafeng Guo","Sihui Yang","Qishen Zhang","Zhongyi Liu","Guannan Zhang","Xueqi Cheng"],"pdf_url":"https://arxiv.org/pdf/2312.02538v1.pdf","comment":"Accepted by WSDM2024"},{"id":"http://arxiv.org/abs/2312.02532v1","updated":"2023-12-05T06:28:45Z","published":"2023-12-05T06:28:45Z","title":"DRAFT: Dense Retrieval Augmented Few-shot Topic classifier Framework","summary":" With the growing volume of diverse information, the demand for classifying\narbitrary topics has become increasingly critical. To address this challenge,\nwe introduce DRAFT, a simple framework designed to train a classifier for\nfew-shot topic classification. DRAFT uses a few examples of a specific topic as\nqueries to construct Customized dataset with a dense retriever model.\nMulti-query retrieval (MQR) algorithm, which effectively handles multiple\nqueries related to a specific topic, is applied to construct the Customized\ndataset. Subsequently, we fine-tune a classifier using the Customized dataset\nto identify the topic. To demonstrate the efficacy of our proposed approach, we\nconduct evaluations on both widely used classification benchmark datasets and\nmanually constructed datasets with 291 diverse topics, which simulate diverse\ncontents encountered in real-world applications. DRAFT shows competitive or\nsuperior performance compared to baselines that use in-context learning, such\nas GPT-3 175B and InstructGPT 175B, on few-shot topic classification tasks\ndespite having 177 times fewer parameters, demonstrating its effectiveness.\n","authors":["Keonwoo Kim","Younggun Lee"],"pdf_url":"https://arxiv.org/pdf/2312.02532v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02445v1","updated":"2023-12-05T02:53:46Z","published":"2023-12-05T02:53:46Z","title":"LLaRA: Aligning Large Language Models with Sequential Recommenders","summary":" Sequential recommendation aims to predict the subsequent items matching user\npreference based on her/his historical interactions. 
With the development of\nLarge Language Models (LLMs), there is growing interest in exploring the\npotential of LLMs for sequential recommendation by framing it as a language\nmodeling task. Prior works represent items in the textual prompts using either\nID indexing or text indexing and feed the prompts into LLMs, but falling short\nof either encapsulating comprehensive world knowledge or exhibiting sufficient\nsequential understanding. To harness the complementary strengths of traditional\nrecommenders (which encode user behavioral knowledge) and LLMs (which possess\nworld knowledge about items), we propose LLaRA -- a Large Language and\nRecommendation Assistant framework. Specifically, LLaRA represents items in\nLLM's input prompts using a novel hybrid approach that integrates ID-based item\nembeddings from traditional recommenders with textual item features. Viewing\nthe ``sequential behavior of the user'' as a new modality in recommendation, we\nemploy an adapter to bridge the modality gap between ID embeddings of the\ntraditional recommenders and the input space of LLMs. Furthermore, instead of\ndirectly exposing the hybrid prompt to LLMs, we apply a curriculum learning\napproach to gradually ramp up training complexity. We first warm up the LLM\nwith text-only prompting, which aligns more naturally with the LLM's language\nmodeling capabilities. Thereafter, we progressively transition to hybrid\nprompting, training the adapter to incorporate behavioral knowledge from the\ntraditional sequential recommender into the LLM. Extensive experiments\ndemonstrate the efficacy of LLaRA framework. Our code and data are available at\nhttps://github.com/ljy0ustc/LLaRA .\n","authors":["Jiayi Liao","Sihang Li","Zhengyi Yang","Jiancan Wu","Yancheng Yuan","Xiang Wang","Xiangnan He"],"pdf_url":"https://arxiv.org/pdf/2312.02445v1.pdf","comment":"12 pages, 7 figures"},{"id":"http://arxiv.org/abs/2312.02443v1","updated":"2023-12-05T02:50:18Z","published":"2023-12-05T02:50:18Z","title":"E4SRec: An Elegant Effective Efficient Extensible Solution of Large\n Language Models for Sequential Recommendation","summary":" The recent advancements in Large Language Models (LLMs) have sparked interest\nin harnessing their potential within recommender systems. Since LLMs are\ndesigned for natural language tasks, existing recommendation approaches have\npredominantly transformed recommendation tasks into open-domain natural\nlanguage generation tasks. However, this approach necessitates items to possess\nrich semantic information, often generates out-of-range results, and suffers\nfrom notably low efficiency and limited extensibility. Furthermore, practical\nID-based recommendation strategies, reliant on a huge number of unique\nidentities (IDs) to represent users and items, have gained prominence in\nreal-world recommender systems due to their effectiveness and efficiency.\nNevertheless, the incapacity of LLMs to model IDs presents a formidable\nchallenge when seeking to leverage LLMs for personalized recommendations. In\nthis paper, we introduce an Elegant Effective Efficient Extensible solution for\nlarge language models for Sequential Recommendation (E4SRec), which seamlessly\nintegrates LLMs with traditional recommender systems that exclusively utilize\nIDs to represent items. 
Specifically, E4SRec takes ID sequences as inputs,\nensuring that the generated outputs fall within the candidate lists.\nFurthermore, E4SRec possesses the capability to generate the entire ranking\nlist in a single forward process, and demands only a minimal set of pluggable\nparameters, which are trained for each dataset while keeping the entire LLM\nfrozen. We substantiate the effectiveness, efficiency, and extensibility of our\nproposed E4SRec through comprehensive experiments conducted on four widely-used\nreal-world datasets. The implementation code is accessible at\nhttps://github.com/HestiaSky/E4SRec/.\n","authors":["Xinhang Li","Chong Chen","Xiangyu Zhao","Yong Zhang","Chunxiao Xing"],"pdf_url":"https://arxiv.org/pdf/2312.02443v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01656v2","updated":"2023-12-05T02:24:38Z","published":"2023-12-04T06:14:25Z","title":"The Contemporary Art of Image Search: Iterative User Intent Expansion\n via Vision-Language Model","summary":" Image search is an essential and user-friendly method to explore vast\ngalleries of digital images. However, existing image search methods heavily\nrely on proximity measurements like tag matching or image similarity, requiring\nprecise user inputs for satisfactory results. To meet the growing demand for a\ncontemporary image search engine that enables accurate comprehension of users'\nsearch intentions, we introduce an innovative user intent expansion framework.\nOur framework leverages visual-language models to parse and compose multi-modal\nuser inputs to provide more accurate and satisfying results. It comprises\ntwo-stage processes: 1) a parsing stage that incorporates a language parsing\nmodule with large language models to enhance the comprehension of textual\ninputs, along with a visual parsing module that integrates an interactive\nsegmentation module to swiftly identify detailed visual elements within images;\nand 2) a logic composition stage that combines multiple user search intents\ninto a unified logic expression for more sophisticated operations in complex\nsearching scenarios. Moreover, the intent expansion framework enables users to\nperform flexible contextualized interactions with the search results to further\nspecify or adjust their detailed search intents iteratively. We implemented the\nframework into an image search system for NFT (non-fungible token) search and\nconducted a user study to evaluate its usability and novel properties. The\nresults indicate that the proposed framework significantly improves users'\nimage search experience. Particularly the parsing and contextualized\ninteractions prove useful in allowing users to express their search intents\nmore accurately and engage in a more enjoyable iterative search experience.\n","authors":["Yilin Ye","Qian Zhu","Shishi Xiao","Kang Zhang","Wei Zeng"],"pdf_url":"https://arxiv.org/pdf/2312.01656v2.pdf","comment":"Accepted by The 2024 ACM SIGCHI Conference on Computer-Supported\n Cooperative Work & Social Computing (CSCW) (Proc. 
CSCW 2024)"},{"id":"http://arxiv.org/abs/2312.02429v1","updated":"2023-12-05T02:08:48Z","published":"2023-12-05T02:08:48Z","title":"PEFA: Parameter-Free Adapters for Large-scale Embedding-based Retrieval\n Models","summary":" Embedding-based Retrieval Models (ERMs) have emerged as a promising framework\nfor large-scale text retrieval problems due to powerful large language models.\nNevertheless, fine-tuning ERMs to reach state-of-the-art results can be\nexpensive due to the extreme scale of data as well as the complexity of\nmulti-stages pipelines (e.g., pre-training, fine-tuning, distillation). In this\nwork, we propose the PEFA framework, namely ParamEter-Free Adapters, for fast\ntuning of ERMs without any backward pass in the optimization. At index building\nstage, PEFA equips the ERM with a non-parametric k-nearest neighbor (kNN)\ncomponent. At inference stage, PEFA performs a convex combination of two\nscoring functions, one from the ERM and the other from the kNN. Based on the\nneighborhood definition, PEFA framework induces two realizations, namely\nPEFA-XL (i.e., extra large) using double ANN indices and PEFA-XS (i.e., extra\nsmall) using a single ANN index. Empirically, PEFA achieves significant\nimprovement on two retrieval applications. For document retrieval, regarding\nRecall@100 metric, PEFA improves not only pre-trained ERMs on Trivia-QA by an\naverage of 13.2%, but also fine-tuned ERMs on NQ-320K by an average of 5.5%,\nrespectively. For product search, PEFA improves the Recall@100 of the\nfine-tuned ERMs by an average of 5.3% and 14.5%, for PEFA-XS and PEFA-XL,\nrespectively. Our code is available at https://github.com/\namzn/pecos/tree/mainline/examples/pefa-wsdm24\n","authors":["Wei-Cheng Chang","Jyun-Yu Jiang","Jiong Zhang","Mutasem Al-Darabsah","Choon Hui Teo","Cho-Jui Hsieh","Hsiang-Fu Yu","S. V. N. Vishwanathan"],"pdf_url":"https://arxiv.org/pdf/2312.02429v1.pdf","comment":"Accept by WSDM 2024"},{"id":"http://arxiv.org/abs/2312.02428v1","updated":"2023-12-05T02:07:31Z","published":"2023-12-05T02:07:31Z","title":"FreestyleRet: Retrieving Images from Style-Diversified Queries","summary":" Image Retrieval aims to retrieve corresponding images based on a given query.\nIn application scenarios, users intend to express their retrieval intent\nthrough various query styles. However, current retrieval tasks predominantly\nfocus on text-query retrieval exploration, leading to limited retrieval query\noptions and potential ambiguity or bias in user intention. In this paper, we\npropose the Style-Diversified Query-Based Image Retrieval task, which enables\nretrieval based on various query styles. To facilitate the novel setting, we\npropose the first Diverse-Style Retrieval dataset, encompassing diverse query\nstyles including text, sketch, low-resolution, and art. We also propose a\nlight-weighted style-diversified retrieval framework. For various query style\ninputs, we apply the Gram Matrix to extract the query's textural features and\ncluster them into a style space with style-specific bases. Then we employ the\nstyle-init prompt tuning module to enable the visual encoder to comprehend the\ntexture and style information of the query. Experiments demonstrate that our\nmodel, employing the style-init prompt tuning strategy, outperforms existing\nretrieval models on the style-diversified retrieval task. Moreover,\nstyle-diversified queries~(sketch+text, art+text, etc) can be simultaneously\nretrieved in our model. 
The auxiliary information from other queries enhances\nthe retrieval performance within the respective query.\n","authors":["Hao Li","Curise Jia","Peng Jin","Zesen Cheng","Kehan Li","Jialu Sui","Chang Liu","Li Yuan"],"pdf_url":"https://arxiv.org/pdf/2312.02428v1.pdf","comment":"16 pages, 7 figures"},{"id":"http://arxiv.org/abs/2312.03171v1","updated":"2023-12-05T22:28:42Z","published":"2023-12-05T22:28:42Z","title":"Combining Counting Processes and Classification Improves a Stopping Rule\n for Technology Assisted Review","summary":" Technology Assisted Review (TAR) stopping rules aim to reduce the cost of\nmanually assessing documents for relevance by minimising the number of\ndocuments that need to be examined to ensure a desired level of recall. This\npaper extends an effective stopping rule using information derived from a text\nclassifier that can be trained without the need for any additional annotation.\nExperiments on multiple data sets (CLEF e-Health, TREC Total Recall, TREC Legal\nand RCV1) showed that the proposed approach consistently improves performance\nand outperforms several alternative methods.\n","authors":["Reem Bin-Hezam","Mark Stevenson"],"pdf_url":"https://arxiv.org/pdf/2312.03171v1.pdf","comment":"Accepted at EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2312.03167v1","updated":"2023-12-05T22:22:25Z","published":"2023-12-05T22:22:25Z","title":"Adaptive spectral graph wavelets for collaborative filtering","summary":" Collaborative filtering is a popular approach in recommender systems, whose\nobjective is to provide personalized item suggestions to potential users based\non their purchase or browsing history. However, personalized recommendations\nrequire considerable amount of behavioral data on users, which is usually\nunavailable for new users, giving rise to the cold-start problem. To help\nalleviate this challenging problem, we introduce a spectral graph wavelet\ncollaborative filtering framework for implicit feedback data, where users,\nitems and their interactions are represented as a bipartite graph.\nSpecifically, we first propose an adaptive transfer function by leveraging a\npower transform with the goal of stabilizing the variance of graph frequencies\nin the spectral domain. Then, we design a deep recommendation model for\nefficient learning of low-dimensional embeddings of users and items using\nspectral graph wavelets in an end-to-end fashion. In addition to capturing the\ngraph's local and global structures, our approach yields localization of graph\nsignals in both spatial and spectral domains, and hence not only learns\ndiscriminative representations of users and items, but also promotes the\nrecommendation quality. The effectiveness of our proposed model is demonstrated\nthrough extensive experiments on real-world benchmark datasets, achieving\nbetter recommendation performance compared with strong baseline methods.\n","authors":["Osama Alshareet","A. Ben Hamza"],"pdf_url":"https://arxiv.org/pdf/2312.03167v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14815v4","updated":"2023-12-05T20:59:58Z","published":"2023-05-24T07:09:56Z","title":"Machine Reading Comprehension using Case-based Reasoning","summary":" We present an accurate and interpretable method for answer extraction in\nmachine reading comprehension that is reminiscent of case-based reasoning (CBR)\nfrom classical AI. Our method (CBR-MRC) builds upon the hypothesis that\ncontextualized answers to similar questions share semantic similarities with\neach other. 
Given a test question, CBR-MRC first retrieves a set of similar\ncases from a nonparametric memory and then predicts an answer by selecting the\nspan in the test context that is most similar to the contextualized\nrepresentations of answers in the retrieved cases. The semi-parametric nature\nof our approach allows it to attribute a prediction to the specific set of\nevidence cases, making it a desirable choice for building reliable and\ndebuggable QA systems. We show that CBR-MRC provides high accuracy comparable\nwith large reader models and outperforms baselines by 11.5 and 8.4 EM on\nNaturalQuestions and NewsQA, respectively. Further, we demonstrate the ability\nof CBR-MRC in identifying not just the correct answer tokens but also the span\nwith the most relevant supporting evidence. Lastly, we observe that contexts\nfor certain question types show higher lexical diversity than others and find\nthat CBR-MRC is robust to these variations while performance using\nfully-parametric methods drops.\n","authors":["Dung Thai","Dhruv Agarwal","Mudit Chaudhary","Wenlong Zhao","Rajarshi Das","Manzil Zaheer","Jay-Yoon Lee","Hannaneh Hajishirzi","Andrew McCallum"],"pdf_url":"https://arxiv.org/pdf/2305.14815v4.pdf","comment":"9 pages, 2 figures"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2312.02975v1","updated":"2023-12-05T18:59:23Z","published":"2023-12-05T18:59:23Z","title":"Dexterous Functional Grasping","summary":" While there have been significant strides in dexterous manipulation, most of\nit is limited to benchmark tasks like in-hand reorientation which are of\nlimited utility in the real world. The main benefit of dexterous hands over\ntwo-fingered ones is their ability to pickup tools and other objects (including\nthin ones) and grasp them firmly to apply force. However, this task requires\nboth a complex understanding of functional affordances as well as precise\nlow-level control. While prior work obtains affordances from human data this\napproach doesn't scale to low-level control. Similarly, simulation training\ncannot give the robot an understanding of real-world semantics. In this paper,\nwe aim to combine the best of both worlds to accomplish functional grasping for\nin-the-wild objects. We use a modular approach. First, affordances are obtained\nby matching corresponding regions of different objects and then a low-level\npolicy trained in sim is run to grasp it. We propose a novel application of\neigengrasps to reduce the search space of RL using a small amount of human data\nand find that it leads to more stable and physically realistic motion. We find\nthat eigengrasp action space beats baselines in simulation and outperforms\nhardcoded grasping in real and matches or outperforms a trained human\nteleoperator. Results visualizations and videos at https://dexfunc.github.io/\n","authors":["Ananye Agarwal","Shagun Uppal","Kenneth Shaw","Deepak Pathak"],"pdf_url":"https://arxiv.org/pdf/2312.02975v1.pdf","comment":"In CoRL 2023. Website at https://dexfunc.github.io/"},{"id":"http://arxiv.org/abs/2312.02974v1","updated":"2023-12-05T18:59:16Z","published":"2023-12-05T18:59:16Z","title":"Describing Differences in Image Sets with Natural Language","summary":" How do two sets of images differ? Discerning set-level differences is crucial\nfor understanding model behaviors and analyzing datasets, yet manually sifting\nthrough thousands of images is impractical. 
To aid in this discovery process,\nwe explore the task of automatically describing the differences between two\n$\\textbf{sets}$ of images, which we term Set Difference Captioning. This task\ntakes in image sets $D_A$ and $D_B$, and outputs a description that is more\noften true on $D_A$ than $D_B$. We outline a two-stage approach that first\nproposes candidate difference descriptions from image sets and then re-ranks\nthe candidates by checking how well they can differentiate the two sets. We\nintroduce VisDiff, which first captions the images and prompts a language model\nto propose candidate descriptions, then re-ranks these descriptions using CLIP.\nTo evaluate VisDiff, we collect VisDiffBench, a dataset with 187 paired image\nsets with ground truth difference descriptions. We apply VisDiff to various\ndomains, such as comparing datasets (e.g., ImageNet vs. ImageNetV2), comparing\nclassification models (e.g., zero-shot CLIP vs. supervised ResNet), summarizing\nmodel failure modes (supervised ResNet), characterizing differences between\ngenerative models (e.g., StableDiffusionV1 and V2), and discovering what makes\nimages memorable. Using VisDiff, we are able to find interesting and previously\nunknown differences in datasets and models, demonstrating its utility in\nrevealing nuanced insights.\n","authors":["Lisa Dunlap","Yuhui Zhang","Xiaohan Wang","Ruiqi Zhong","Trevor Darrell","Jacob Steinhardt","Joseph E. Gonzalez","Serena Yeung-Levy"],"pdf_url":"https://arxiv.org/pdf/2312.02974v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18826v2","updated":"2023-12-05T18:57:28Z","published":"2023-11-30T18:59:05Z","title":"Geometry-Aware Normalizing Wasserstein Flows for Optimal Causal\n Inference","summary":" This manuscript enriches the framework of continuous normalizing flows (CNFs)\nwithin causal inference, primarily to augment the geometric properties of\nparametric submodels used in targeted maximum likelihood estimation (TMLE). By\nintroducing an innovative application of CNFs, we construct a refined series of\nparametric submodels that enable a directed interpolation between the prior\ndistribution $p_0$ and the empirical distribution $p_1$. This proposed\nmethodology serves to optimize the semiparametric efficiency bound in causal\ninference by orchestrating CNFs to align with Wasserstein gradient flows. Our\napproach not only endeavors to minimize the mean squared error in the\nestimation but also imbues the estimators with geometric sophistication,\nthereby enhancing robustness against misspecification. This robustness is\ncrucial, as it alleviates the dependence on the standard $n^{\\frac{1}{4}}$ rate\nfor a doubly-robust perturbation direction in TMLE. By incorporating robust\noptimization principles and differential geometry into the estimators, the\ndeveloped geometry-aware CNFs represent a significant advancement in the\npursuit of doubly robust causal inference.\n","authors":["Kaiwen Hou"],"pdf_url":"https://arxiv.org/pdf/2311.18826v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02959v1","updated":"2023-12-05T18:47:34Z","published":"2023-12-05T18:47:34Z","title":"Detecting algorithmic bias in medical AI-models","summary":" With the growing prevalence of machine learning and artificial\nintelligence-based medical decision support systems, it is equally important to\nensure that these systems provide patient outcomes in a fair and equitable\nfashion. 
This paper presents an innovative framework for detecting areas of\nalgorithmic bias in medical-AI decision support systems. Our approach\nefficiently identifies potential biases in medical-AI models, specifically in\nthe context of sepsis prediction, by employing the Classification and\nRegression Trees (CART) algorithm. We verify our methodology by conducting a\nseries of synthetic data experiments, showcasing its ability to estimate areas\nof bias in controlled settings precisely. The effectiveness of the concept is\nfurther validated by experiments using electronic medical records from Grady\nMemorial Hospital in Atlanta, Georgia. These tests demonstrate the practical\nimplementation of our strategy in a clinical environment, where it can function\nas a vital instrument for guaranteeing fairness and equity in AI-based medical\ndecisions.\n","authors":["Jeffrey Smith","Andre Holder","Rishikesan Kamaleswaran","Yao Xie"],"pdf_url":"https://arxiv.org/pdf/2312.02959v1.pdf","comment":"26 pages, 9 figures"},{"id":"http://arxiv.org/abs/2312.01203v2","updated":"2023-12-05T18:45:24Z","published":"2023-12-02T18:55:26Z","title":"Harnessing Discrete Representations For Continual Reinforcement Learning","summary":" Reinforcement learning (RL) agents make decisions using nothing but\nobservations from the environment, and consequently, heavily rely on the\nrepresentations of those observations. Though some recent breakthroughs have\nused vector-based categorical representations of observations, often referred\nto as discrete representations, there is little work explicitly assessing the\nsignificance of such a choice. In this work, we provide a thorough empirical\ninvestigation of the advantages of representing observations as vectors of\ncategorical values within the context of reinforcement learning. We perform\nevaluations on world-model learning, model-free RL, and ultimately continual RL\nproblems, where the benefits best align with the needs of the problem setting.\nWe find that, when compared to traditional continuous representations, world\nmodels learned over discrete representations accurately model more of the world\nwith less capacity, and that agents trained with discrete representations learn\nbetter policies with less data. In the context of continual RL, these benefits\ntranslate into faster adapting agents. Additionally, our analysis suggests that\nthe observed performance improvements can be attributed to the information\ncontained within the latent vectors and potentially the encoding of the\ndiscrete representation itself.\n","authors":["Edan Meyer","Adam White","Marlos C. Machado"],"pdf_url":"https://arxiv.org/pdf/2312.01203v2.pdf","comment":"23 pages, 16 figures, submitted to ICLR 2024"},{"id":"http://arxiv.org/abs/2312.02957v1","updated":"2023-12-05T18:41:03Z","published":"2023-12-05T18:41:03Z","title":"Classification for everyone : Building geography agnostic models for\n fairer recognition","summary":" In this paper, we analyze different methods to mitigate inherent geographical\nbiases present in state of the art image classification models. We first\nquantitatively present this bias in two datasets - The Dollar Street Dataset\nand ImageNet, using images with location information. We then present different\nmethods which can be employed to reduce this bias. 
Finally, we analyze the\neffectiveness of the different techniques on making these models more robust to\ngeographical locations of the images.\n","authors":["Akshat Jindal","Shreya Singh","Soham Gadgil"],"pdf_url":"https://arxiv.org/pdf/2312.02957v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02956v1","updated":"2023-12-05T18:40:40Z","published":"2023-12-05T18:40:40Z","title":"Choroidalyzer: An open-source, end-to-end pipeline for choroidal\n analysis in optical coherence tomography","summary":" Purpose: To develop Choroidalyzer, an open-source, end-to-end pipeline for\nsegmenting the choroid region, vessels, and fovea, and deriving choroidal\nthickness, area, and vascular index.\n Methods: We used 5,600 OCT B-scans (233 subjects, 6 systemic disease cohorts,\n3 device types, 2 manufacturers). To generate region and vessel ground-truths,\nwe used state-of-the-art automatic methods following manual correction of\ninaccurate segmentations, with foveal positions manually annotated. We trained\na U-Net deep-learning model to detect the region, vessels, and fovea to\ncalculate choroid thickness, area, and vascular index in a fovea-centred region\nof interest. We analysed segmentation agreement (AUC, Dice) and choroid metrics\nagreement (Pearson, Spearman, mean absolute error (MAE)) in internal and\nexternal test sets. We compared Choroidalyzer to two manual graders on a small\nsubset of external test images and examined cases of high error.\n Results: Choroidalyzer took 0.299 seconds per image on a standard laptop and\nachieved excellent region (Dice: internal 0.9789, external 0.9749), very good\nvessel segmentation performance (Dice: internal 0.8817, external 0.8703) and\nexcellent fovea location prediction (MAE: internal 3.9 pixels, external 3.4\npixels). For thickness, area, and vascular index, Pearson correlations were\n0.9754, 0.9815, and 0.8285 (internal) / 0.9831, 0.9779, 0.7948 (external),\nrespectively (all p<0.0001). Choroidalyzer's agreement with graders was\ncomparable to the inter-grader agreement across all metrics.\n Conclusions: Choroidalyzer is an open-source, end-to-end pipeline that\naccurately segments the choroid and reliably extracts thickness, area, and\nvascular index. Especially choroidal vessel segmentation is a difficult and\nsubjective task, and fully-automatic methods like Choroidalyzer could provide\nobjectivity and standardisation.\n","authors":["Justin Engelmann","Jamie Burke","Charlene Hamid","Megan Reid-Schachter","Dan Pugh","Neeraj Dhaun","Diana Moukaddem","Lyle Gray","Niall Strang","Paul McGraw","Amos Storkey","Paul J. Steptoe","Stuart King","Tom MacGillivray","Miguel O. Bernabeu","Ian J. C. MacCormick"],"pdf_url":"https://arxiv.org/pdf/2312.02956v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02941v1","updated":"2023-12-05T18:09:47Z","published":"2023-12-05T18:09:47Z","title":"Fast CT anatomic localization algorithm","summary":" Automatically determining the position of every slice in a CT scan is a basic\nyet powerful capability allowing fast retrieval of region of interest for\nvisual inspection and automated analysis. Unlike conventional localization\napproaches which work at the slice level, we directly localize only a fraction\nof the slices and and then fit a linear model which maps slice index to its\nestimated axial anatomical position based on those slices. The model is then\nused to assign axial position to every slices of the scan. 
This approach proves\nto be both computationally efficient, with a typical processing time of less\nthan a second per scan (regardless of its size), accurate, with a typical\nmedian localization error of 1 cm, and robust to different noise sources,\nimaging protocols, metal induced artifacts, anatomical deformations etc.\nAnother key element of our approach is the introduction of a mapping confidence\nscore. This score acts as a fail safe mechanism which allows a rejection of\nunreliable localization results in rare cases of anomalous scans. Our algorithm\nsets new State Of The Art results in terms of localization accuracy. It also\noffers a decrease of two orders of magnitude in processing time with respect to\nall published processing times. It was designed to be invariant to various scan\nresolutions, scan protocols, patient orientations, strong artifacts and various\ndeformations and abnormalities. Additionally, our algorithm is the first one to\nthe best of our knowledge which supports the entire body from head to feet and\nis not confined to specific anatomical region. This algorithm was tested on\nthousands of scans and proves to be very reliable and useful as a preprocessing\nstage for many applications.\n","authors":["Amit Oved"],"pdf_url":"https://arxiv.org/pdf/2312.02941v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01678v2","updated":"2023-12-05T18:02:46Z","published":"2023-12-04T07:01:54Z","title":"Jellyfish: A Large Language Model for Data Preprocessing","summary":" In this paper, we present Jellyfish, an open-source LLM as a universal task\nsolver for DP. Built on the Llama 2 13B model, Jellyfish is instruction-tuned\nwith the datasets of several typical DP tasks including error detection, data\nimputation, schema matching, and entity matching, and delivers generalizability\nto other tasks. Remarkably, Jellyfish can operate on a local, single, and\nlow-priced GPU with its 13 billion parameters, ensuring data security and\nenabling further tuning. Its proficiency in understanding natural language\nallows users to manually craft instructions for DP tasks. Unlike many existing\nmethods that heavily rely on prior knowledge, Jellyfish acquires domain\nknowledge during its tuning process and integrates optional knowledge injection\nduring inference. A distinctive feature of Jellyfish is its interpreter, which\nelucidates its output decisions. To construct Jellyfish, we develop a series of\npre-tuning and DP-tuning techniques. Jellyfish is equipped with an instance\nserializer, which automatically translates raw data into model prompts, and a\nknowledge injector, which optionally introduces task- and dataset-specific\nknowledge to enhance DP performance. Our evaluation of Jellyfish, using a range\nof real datasets, shows its competitiveness compared to state-of-the-art\nmethods and its strong generalizability to unseen tasks. Jellyfish's\nperformance rivals that of GPT series models, and its interpreter offers\nenhanced reasoning capabilities compared to GPT-3.5. Furthermore, our\nevaluation highlights the effectiveness of the techniques employed in\nconstructing Jellyfish. 
Our model is available at Hugging Face:\nhttps://huggingface.co/NECOUDBFM/Jellyfish .\n","authors":["Haochen Zhang","Yuyang Dong","Chuan Xiao","Masafumi Oyamada"],"pdf_url":"https://arxiv.org/pdf/2312.01678v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.02495v2","updated":"2023-12-05T18:00:59Z","published":"2023-11-04T19:40:16Z","title":"Uncertainty Quantification in Multivariable Regression for Material\n Property Prediction with Bayesian Neural Networks","summary":" With the increased use of data-driven approaches and machine learning-based\nmethods in material science, the importance of reliable uncertainty\nquantification (UQ) of the predicted variables for informed decision-making\ncannot be overstated. UQ in material property prediction poses unique\nchallenges, including the multi-scale and multi-physics nature of advanced\nmaterials, intricate interactions between numerous factors, limited\navailability of large curated datasets for model training, etc. Recently,\nBayesian Neural Networks (BNNs) have emerged as a promising approach for UQ,\noffering a probabilistic framework for capturing uncertainties within neural\nnetworks. In this work, we introduce an approach for UQ within physics-informed\nBNNs, which integrates knowledge from governing laws in material modeling to\nguide the models toward physically consistent predictions. To evaluate the\neffectiveness of this approach, we present case studies for predicting the\ncreep rupture life of steel alloys. Experimental validation with three datasets\nof collected measurements from creep tests demonstrates the ability of BNNs to\nproduce accurate point and uncertainty estimates that are competitive or exceed\nthe performance of the conventional method of Gaussian Process Regression.\nSimilarly, we evaluated the suitability of BNNs for UQ in an active learning\napplication and reported competitive performance. The most promising framework\nfor creep life prediction is BNNs based on Markov Chain Monte Carlo\napproximation of the posterior distribution of network parameters, as it\nprovided more reliable results in comparison to BNNs based on variational\ninference approximation or related NNs with probabilistic outputs. The codes\nare available at:\nhttps://github.com/avakanski/Creep-uncertainty-quantification.\n","authors":["Longze Li","Jiang Chang","Aleksandar Vakanski","Yachun Wang","Tiankai Yao","Min Xian"],"pdf_url":"https://arxiv.org/pdf/2311.02495v2.pdf","comment":"24 pages, 4 figures"},{"id":"http://arxiv.org/abs/2306.10797v3","updated":"2023-12-05T17:50:52Z","published":"2023-06-19T09:37:18Z","title":"Variability of echo state network prediction horizon for partially\n observed dynamical systems","summary":" Study of dynamical systems using partial state observation is an important\nproblem due to its applicability to many real-world systems. We address the\nproblem by studying an echo state network (ESN) framework with partial state\ninput with partial or full state output. Application to the Lorenz system and\nChua's oscillator (both numerically simulated and experimental systems)\ndemonstrate the effectiveness of our method. We show that the ESN, as an\nautonomous dynamical system, is capable of making short-term predictions up to\na few Lyapunov times. However, the prediction horizon has high variability\ndepending on the initial condition-an aspect that we explore in detail using\nthe distribution of the prediction horizon. 
Further, using a variety of\nstatistical metrics to compare the long-term dynamics of the ESN predictions\nwith numerically simulated or experimental dynamics and observed similar\nresults, we show that the ESN can effectively learn the system's dynamics even\nwhen trained with noisy numerical or experimental datasets. Thus, we\ndemonstrate the potential of ESNs to serve as cheap surrogate models for\nsimulating the dynamics of systems where complete observations are unavailable.\n","authors":["Ajit Mahata","Reetish Padhi","Amit Apte"],"pdf_url":"https://arxiv.org/pdf/2306.10797v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02916v1","updated":"2023-12-05T17:46:52Z","published":"2023-12-05T17:46:52Z","title":"MIND: Multi-Task Incremental Network Distillation","summary":" The recent surge in pervasive devices generating dynamic data streams has\nunderscored the necessity for learning systems to adapt to data distributional\nshifts continually. To tackle this challenge, the research community has put\nforth a spectrum of methodologies, including the demanding pursuit of\nclass-incremental learning without replay data. In this study, we present MIND,\na parameter isolation method that aims to significantly enhance the performance\nof replay-free solutions and achieve state-of-the-art results on several widely\nstudied datasets. Our approach introduces two main contributions: two\nalternative distillation procedures that significantly improve the efficiency\nof MIND increasing the accumulated knowledge of each sub-network, and the\noptimization of the BachNorm layers across tasks inside the sub-networks.\nOverall, MIND outperforms all the state-of-the-art methods for rehearsal-free\nClass-Incremental learning (with an increment in classification accuracy of\napprox. +6% on CIFAR-100/10 and +10% on TinyImageNet/10) reaching up to approx.\n+40% accuracy in Domain-Incremental scenarios. Moreover, we ablated each\ncontribution to demonstrate its impact on performance improvement. Our results\nshowcase the superior performance of MIND indicating its potential for\naddressing the challenges posed by Class-incremental and Domain-Incremental\nlearning in resource-constrained environments.\n","authors":["Jacopo Bonato","Francesco Pelosin","Luigi Sabetta","Alessandro Nicolosi"],"pdf_url":"https://arxiv.org/pdf/2312.02916v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02914v1","updated":"2023-12-05T17:39:19Z","published":"2023-12-05T17:39:19Z","title":"Unsupervised Video Domain Adaptation with Masked Pre-Training and\n Collaborative Self-Training","summary":" In this work, we tackle the problem of unsupervised domain adaptation (UDA)\nfor video action recognition. Our approach, which we call UNITE, uses an image\nteacher model to adapt a video student model to the target domain. UNITE first\nemploys self-supervised pre-training to promote discriminative feature learning\non target domain videos using a teacher-guided masked distillation objective.\nWe then perform self-training on masked target data, using the video student\nmodel and image teacher model together to generate improved pseudolabels for\nunlabeled target videos. Our self-training process successfully leverages the\nstrengths of both models to achieve strong transfer performance across domains.\nWe evaluate our approach on multiple video domain adaptation benchmarks and\nobserve significant improvements upon previously reported results.\n","authors":["Arun Reddy","William Paul","Corban Rivera","Ketul Shah","Celso M. 
de Melo","Rama Chellappa"],"pdf_url":"https://arxiv.org/pdf/2312.02914v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15930v3","updated":"2023-12-05T17:21:18Z","published":"2023-05-25T10:58:46Z","title":"End-to-End Meta-Bayesian Optimisation with Transformer Neural Processes","summary":" Meta-Bayesian optimisation (meta-BO) aims to improve the sample efficiency of\nBayesian optimisation by leveraging data from related tasks. While previous\nmethods successfully meta-learn either a surrogate model or an acquisition\nfunction independently, joint training of both components remains an open\nchallenge. This paper proposes the first end-to-end differentiable meta-BO\nframework that generalises neural processes to learn acquisition functions via\ntransformer architectures. We enable this end-to-end framework with\nreinforcement learning (RL) to tackle the lack of labelled acquisition data.\nEarly on, we notice that training transformer-based neural processes from\nscratch with RL is challenging due to insufficient supervision, especially when\nrewards are sparse. We formalise this claim with a combinatorial analysis\nshowing that the widely used notion of regret as a reward signal exhibits a\nlogarithmic sparsity pattern in trajectory lengths. To tackle this problem, we\naugment the RL objective with an auxiliary task that guides part of the\narchitecture to learn a valid probabilistic model as an inductive bias. We\ndemonstrate that our method achieves state-of-the-art regret results against\nvarious baselines in experiments on standard hyperparameter optimisation tasks\nand also outperforms others in the real-world problems of mixed-integer\nprogramming tuning, antibody design, and logic synthesis for electronic design\nautomation.\n","authors":["Alexandre Maraval","Matthieu Zimmer","Antoine Grosnit","Haitham Bou Ammar"],"pdf_url":"https://arxiv.org/pdf/2305.15930v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.01090v2","updated":"2023-12-05T17:16:57Z","published":"2023-05-01T21:14:47Z","title":"Autoencoders for discovering manifold dimension and coordinates in data\n from complex dynamical systems","summary":" While many phenomena in physics and engineering are formally\nhigh-dimensional, their long-time dynamics often live on a lower-dimensional\nmanifold. The present work introduces an autoencoder framework that combines\nimplicit regularization with internal linear layers and $L_2$ regularization\n(weight decay) to automatically estimate the underlying dimensionality of a\ndata set, produce an orthogonal manifold coordinate system, and provide the\nmapping functions between the ambient space and manifold space, allowing for\nout-of-sample projections. We validate our framework's ability to estimate the\nmanifold dimension for a series of datasets from dynamical systems of varying\ncomplexities and compare to other state-of-the-art estimators. We analyze the\ntraining dynamics of the network to glean insight into the mechanism of\nlow-rank learning and find that collectively each of the implicit regularizing\nlayers compound the low-rank representation and even self-correct during\ntraining. Analysis of gradient descent dynamics for this architecture in the\nlinear case reveals the role of the internal linear layers in leading to faster\ndecay of a \"collective weight variable\" incorporating all layers, and the role\nof weight decay in breaking degeneracies and thus driving convergence along\ndirections in which no decay would occur in its absence. 
We show that this\nframework can be naturally extended for applications of state-space modeling\nand forecasting by generating a data-driven dynamic model of a spatiotemporally\nchaotic partial differential equation using only the manifold coordinates.\nFinally, we demonstrate that our framework is robust to hyperparameter choices.\n","authors":["Kevin Zeng","Carlos E. Pérez De Jesús","Andrew J. Fox","Michael D. Graham"],"pdf_url":"https://arxiv.org/pdf/2305.01090v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02903v2","updated":"2023-12-05T17:16:36Z","published":"2023-10-04T15:42:23Z","title":"FroSSL: Frobenius Norm Minimization for Self-Supervised Learning","summary":" Self-supervised learning (SSL) is an increasingly popular paradigm for\nrepresentation learning. Recent methods can be classified as\nsample-contrastive, dimension-contrastive, or asymmetric network-based, with\neach family having its own approach to avoiding informational collapse. While\ndimension-contrastive methods converge to similar solutions as\nsample-contrastive methods, it can be empirically shown that some methods\nrequire more epochs of training to converge. Motivated by closing this divide,\nwe present the objective function FroSSL which is both sample- and\ndimension-contrastive up to embedding normalization. FroSSL works by minimizing\ncovariance Frobenius norms for avoiding collapse and minimizing mean-squared\nerror for augmentation invariance. We show that FroSSL converges more quickly\nthan a variety of other SSL methods and provide theoretical and empirical\nsupport that this faster convergence is due to how FroSSL affects the\neigenvalues of the embedding covariance matrices. We also show that FroSSL\nlearns competitive representations on linear probe evaluation when used to\ntrain a ResNet18 on the CIFAR-10, CIFAR-100, STL-10, and ImageNet datasets.\n","authors":["Oscar Skean","Aayush Dhakal","Nathan Jacobs","Luis Gonzalo Sanchez Giraldo"],"pdf_url":"https://arxiv.org/pdf/2310.02903v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02901v1","updated":"2023-12-05T17:15:16Z","published":"2023-12-05T17:15:16Z","title":"Concept Drift Adaptation in Text Stream Mining Settings: A Comprehensive\n Review","summary":" Due to the advent and increase in the popularity of the Internet, people have\nbeen producing and disseminating textual data in several ways, such as reviews,\nsocial media posts, and news articles. As a result, numerous researchers have\nbeen working on discovering patterns in textual data, especially because social\nmedia posts function as social sensors, indicating peoples' opinions,\ninterests, etc. However, most tasks regarding natural language processing are\naddressed using traditional machine learning methods and static datasets. This\nsetting can lead to several problems, such as an outdated dataset, which may\nnot correspond to reality, and an outdated model, which has its performance\ndegrading over time. Concept drift is another aspect that emphasizes these\nissues, which corresponds to data distribution and pattern changes. In a text\nstream scenario, it is even more challenging due to its characteristics, such\nas the high speed and data arriving sequentially. In addition, models for this\ntype of scenario must adhere to the constraints mentioned above while learning\nfrom the stream by storing texts for a limited time and consuming low memory.\nIn this study, we performed a systematic literature review regarding concept\ndrift adaptation in text stream scenarios. 
Considering well-defined criteria,\nwe selected 40 papers to unravel aspects such as text drift categories, types\nof text drift detection, model update mechanism, the addressed stream mining\ntasks, types of text representations, and text representation update mechanism.\nIn addition, we discussed drift visualization and simulation and listed\nreal-world datasets used in the selected papers. Therefore, this paper\ncomprehensively reviews the concept drift adaptation in text stream mining\nscenarios.\n","authors":["Cristiano Mesquita Garcia","Ramon Simoes Abilio","Alessandro Lameiras Koerich","Alceu de Souza Britto Jr.","Jean Paul Barddal"],"pdf_url":"https://arxiv.org/pdf/2312.02901v1.pdf","comment":"49 pages"},{"id":"http://arxiv.org/abs/2306.16334v2","updated":"2023-12-05T16:46:11Z","published":"2023-06-28T16:10:01Z","title":"On the Identifiability of Quantized Factors","summary":" Disentanglement aims to recover meaningful latent ground-truth factors from\nthe observed distribution solely, and is formalized through the theory of\nidentifiability. The identifiability of independent latent factors is proven to\nbe impossible in the unsupervised i.i.d. setting under a general nonlinear map\nfrom factors to observations. In this work, however, we demonstrate that it is\npossible to recover quantized latent factors under a generic nonlinear\ndiffeomorphism. We only assume that the latent factors have independent\ndiscontinuities in their density, without requiring the factors to be\nstatistically independent. We introduce this novel form of identifiability,\ntermed quantized factor identifiability, and provide a comprehensive proof of\nthe recovery of the quantized factors.\n","authors":["Vitória Barin-Pacela","Kartik Ahuja","Simon Lacoste-Julien","Pascal Vincent"],"pdf_url":"https://arxiv.org/pdf/2306.16334v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.05440v2","updated":"2023-12-05T16:46:09Z","published":"2023-11-09T15:24:44Z","title":"A Practical Approach to Novel Class Discovery in Tabular Data","summary":" The problem of Novel Class Discovery (NCD) consists in extracting knowledge\nfrom a labeled set of known classes to accurately partition an unlabeled set of\nnovel classes. While NCD has recently received a lot of attention from the\ncommunity, it is often solved on computer vision problems and under unrealistic\nconditions. In particular, the number of novel classes is usually assumed to be\nknown in advance, and their labels are sometimes used to tune hyperparameters.\nMethods that rely on these assumptions are not applicable in real-world\nscenarios. In this work, we focus on solving NCD in tabular data when no prior\nknowledge of the novel classes is available. To this end, we propose to tune\nthe hyperparameters of NCD methods by adapting the $k$-fold cross-validation\nprocess and hiding some of the known classes in each fold. Since we have found\nthat methods with too many hyperparameters are likely to overfit these hidden\nclasses, we define a simple deep NCD model. This method is composed of only the\nessential elements necessary for the NCD problem and performs impressively well\nunder realistic conditions. Furthermore, we find that the latent space of this\nmethod can be used to reliably estimate the number of novel classes.\nAdditionally, we adapt two unsupervised clustering algorithms ($k$-means and\nSpectral Clustering) to leverage the knowledge of the known classes. 
Extensive\nexperiments are conducted on 7 tabular datasets and demonstrate the\neffectiveness of the proposed method and hyperparameter tuning process, and\nshow that the NCD problem can be solved without relying on knowledge from the\nnovel classes.\n","authors":["Colin Troisemaine","Alexandre Reiffers-Masson","Stéphane Gosselin","Vincent Lemaire","Sandrine Vaton"],"pdf_url":"https://arxiv.org/pdf/2311.05440v2.pdf","comment":"25 pages, including 3 pages of annexes"},{"id":"http://arxiv.org/abs/2312.02873v1","updated":"2023-12-05T16:39:41Z","published":"2023-12-05T16:39:41Z","title":"Toward autocorrection of chemical process flowsheets using large\n language models","summary":" The process engineering domain widely uses Process Flow Diagrams (PFDs) and\nProcess and Instrumentation Diagrams (P&IDs) to represent process flows and\nequipment configurations. However, the P&IDs and PFDs, hereafter called\nflowsheets, can contain errors causing safety hazards, inefficient operation,\nand unnecessary expenses. Correcting and verifying flowsheets is a tedious,\nmanual process. We propose a novel generative AI methodology for automatically\nidentifying errors in flowsheets and suggesting corrections to the user, i.e.,\nautocorrecting flowsheets. Inspired by the breakthrough of Large Language\nModels (LLMs) for grammatical autocorrection of human language, we investigate\nLLMs for the autocorrection of flowsheets. The input to the model is a\npotentially erroneous flowsheet and the output of the model are suggestions for\na corrected flowsheet. We train our autocorrection model on a synthetic dataset\nin a supervised manner. The model achieves a top-1 accuracy of 80% and a top-5\naccuracy of 84% on an independent test dataset of synthetically generated\nflowsheets. The results suggest that the model can learn to autocorrect the\nsynthetic flowsheets. We envision that flowsheet autocorrection will become a\nuseful tool for chemical engineers.\n","authors":["Lukas Schulze Balhorn","Marc Caballero","Artur M. Schweidtmann"],"pdf_url":"https://arxiv.org/pdf/2312.02873v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02872v1","updated":"2023-12-05T16:39:32Z","published":"2023-12-05T16:39:32Z","title":"Experimental Insights Towards Explainable and Interpretable Pedestrian\n Crossing Prediction","summary":" In the context of autonomous driving, pedestrian crossing prediction is a key\ncomponent for improving road safety. Presently, the focus of these predictions\nextends beyond achieving trustworthy results; it is shifting towards the\nexplainability and interpretability of these predictions. This research\nintroduces a novel neuro-symbolic approach that combines deep learning and\nfuzzy logic for an explainable and interpretable pedestrian crossing\nprediction. We have developed an explainable predictor (ExPedCross), which\nutilizes a set of explainable features and employs a fuzzy inference system to\npredict whether the pedestrian will cross or not. Our approach was evaluated on\nboth the PIE and JAAD datasets. The results offer experimental insights into\nachieving explainability and interpretability in the pedestrian crossing\nprediction task. 
Furthermore, the testing results yield a set of guidelines and\nrecommendations regarding the process of dataset selection, feature selection,\nand explainability.\n","authors":["Angie Nataly Melo","Carlota Salinas","Miguel Angel Sotelo"],"pdf_url":"https://arxiv.org/pdf/2312.02872v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02871v1","updated":"2023-12-05T16:39:24Z","published":"2023-12-05T16:39:24Z","title":"Attention-enhanced neural differential equations for physics-informed\n deep learning of ion transport","summary":" Species transport models typically combine partial differential equations\n(PDEs) with relations from hindered transport theory to quantify\nelectromigrative, convective, and diffusive transport through complex\nnanoporous systems; however, these formulations are frequently substantial\nsimplifications of the governing dynamics, leading to the poor generalization\nperformance of PDE-based models. Given the growing interest in deep learning\nmethods for the physical sciences, we develop a machine learning-based approach\nto characterize ion transport across nanoporous membranes. Our proposed\nframework centers around attention-enhanced neural differential equations that\nincorporate electroneutrality-based inductive biases to improve generalization\nperformance relative to conventional PDE-based methods. In addition, we study\nthe role of the attention mechanism in illuminating physically-meaningful\nion-pairing relationships across diverse mixture compositions. Further, we\ninvestigate the importance of pre-training on simulated data from PDE-based\nmodels, as well as the performance benefits from hard vs. soft inductive\nbiases. Our results indicate that physics-informed deep learning solutions can\noutperform their classical PDE-based counterparts and provide promising avenues\nfor modelling complex transport phenomena across diverse applications.\n","authors":["Danyal Rehman","John H. Lienhard"],"pdf_url":"https://arxiv.org/pdf/2312.02871v1.pdf","comment":"8 pages, 2 figures. Accepted in the NeurIPS Machine Learning and the\n Physical Sciences Workshop"},{"id":"http://arxiv.org/abs/2312.02867v1","updated":"2023-12-05T16:27:51Z","published":"2023-12-05T16:27:51Z","title":"Semi-Supervised Health Index Monitoring with Feature Generation and\n Fusion","summary":" The Health Index (HI) is crucial for evaluating system health, aiding tasks\nlike anomaly detection and predicting remaining useful life for systems\ndemanding high safety and reliability. Tight monitoring is crucial for\nachieving high precision at a lower cost, with applications such as spray\ncoating. Obtaining HI labels in real-world applications is often\ncost-prohibitive, requiring continuous, precise health measurements. Therefore,\nit is more convenient to leverage run-to failure datasets that may provide\npotential indications of machine wear condition, making it necessary to apply\nsemi-supervised tools for HI construction. In this study, we adapt the Deep\nSemi-supervised Anomaly Detection (DeepSAD) method for HI construction. We use\nthe DeepSAD embedding as a condition indicators to address interpretability\nchallenges and sensitivity to system-specific factors. Then, we introduce a\ndiversity loss to enrich condition indicators. We employ an alternating\nprojection algorithm with isotonic constraints to transform the DeepSAD\nembedding into a normalized HI with an increasing trend. 
Validation on the PHME\n2010 milling dataset, a recognized benchmark with ground truth HIs demonstrates\nmeaningful HIs estimations. Our methodology is then applied to monitor wear\nstates of thermal spray coatings using high-frequency voltage. Our\ncontributions create opportunities for more accessible and reliable HI\nestimation, particularly in cases where obtaining ground truth HI labels is\nunfeasible.\n","authors":["Gaëtan Frusque","Ismail Nejjar","Majid Nabavi","Olga Fink"],"pdf_url":"https://arxiv.org/pdf/2312.02867v1.pdf","comment":"13 pages, 8 figures"},{"id":"http://arxiv.org/abs/2312.02859v1","updated":"2023-12-05T16:13:50Z","published":"2023-12-05T16:13:50Z","title":"Lessons from Usable ML Deployments and Application to Wind Turbine\n Monitoring","summary":" Through past experiences deploying what we call usable ML (one step beyond\nexplainable ML, including both explanations and other augmenting information)\nto real-world domains, we have learned three key lessons. First, many\norganizations are beginning to hire people who we call ``bridges'' because they\nbridge the gap between ML developers and domain experts, and these people fill\na valuable role in developing usable ML applications. Second, a configurable\nsystem that enables easily iterating on usable ML interfaces during\ncollaborations with bridges is key. Finally, there is a need for continuous,\nin-deployment evaluations to quantify the real-world impact of usable ML.\nThroughout this paper, we apply these lessons to the task of wind turbine\nmonitoring, an essential task in the renewable energy domain. Turbine engineers\nand data analysts must decide whether to perform costly in-person\ninvestigations on turbines to prevent potential cases of brakepad failure, and\nwell-tuned usable ML interfaces can aid with this decision-making process.\nThrough the applications of our lessons to this task, we hope to demonstrate\nthe potential real-world impact of usable ML in the renewable energy domain.\n","authors":["Alexandra Zytek","Wei-En Wang","Sofia Koukoura","Kalyan Veeramachaneni"],"pdf_url":"https://arxiv.org/pdf/2312.02859v1.pdf","comment":"Presented in XAI in Action: Past, Present, and Future Applications @\n NeurIPS 2023. 8 pages, 3 figures"},{"id":"http://arxiv.org/abs/2312.02858v1","updated":"2023-12-05T16:13:34Z","published":"2023-12-05T16:13:34Z","title":"Towards Causal Representations of Climate Model Data","summary":" Climate models, such as Earth system models (ESMs), are crucial for\nsimulating future climate change based on projected Shared Socioeconomic\nPathways (SSP) greenhouse gas emissions scenarios. While ESMs are sophisticated\nand invaluable, machine learning-based emulators trained on existing simulation\ndata can project additional climate scenarios much faster and are\ncomputationally efficient. However, they often lack generalizability and\ninterpretability. This work delves into the potential of causal representation\nlearning, specifically the \\emph{Causal Discovery with Single-parent Decoding}\n(CDSD) method, which could render climate model emulation efficient\n\\textit{and} interpretable. We evaluate CDSD on multiple climate datasets,\nfocusing on emissions, temperature, and precipitation. 
Our findings shed light\non the challenges, limitations, and promise of using CDSD as a stepping stone\ntowards more interpretable and robust climate model emulation.\n","authors":["Julien Boussard","Chandni Nagda","Julia Kaltenborn","Charlotte Emilie Elektra Lange","Philippe Brouillard","Yaniv Gurwicz","Peer Nowack","David Rolnick"],"pdf_url":"https://arxiv.org/pdf/2312.02858v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02855v1","updated":"2023-12-05T16:11:52Z","published":"2023-12-05T16:11:52Z","title":"Exploring Error Bits for Memory Failure Prediction: An In-Depth\n Correlative Study","summary":" In large-scale datacenters, memory failure is a common cause of server\ncrashes, with uncorrectable errors (UEs) being a major indicator of Dual Inline\nMemory Module (DIMM) defects. Existing approaches primarily focus on predicting\nUEs using correctable errors (CEs), without fully considering the information\nprovided by error bits. However, error bit patterns have a strong correlation\nwith the occurrence of uncorrectable errors (UEs). In this paper, we present a\ncomprehensive study on the correlation between CEs and UEs, specifically\nemphasizing the importance of spatio-temporal error bit information. Our\nanalysis reveals a strong correlation between spatio-temporal error bits and UE\noccurrence. Through evaluations using real-world datasets, we demonstrate that\nour approach significantly improves prediction performance by 15% in F1-score\ncompared to the state-of-the-art algorithms. Overall, our approach effectively\nreduces the number of virtual machine interruptions caused by UEs by\napproximately 59%.\n","authors":["Qiao Yu","Wengui Zhang","Jorge Cardoso","Odej Kao"],"pdf_url":"https://arxiv.org/pdf/2312.02855v1.pdf","comment":"Published at ICCAD 2023"},{"id":"http://arxiv.org/abs/2312.02852v1","updated":"2023-12-05T16:09:31Z","published":"2023-12-05T16:09:31Z","title":"Expert-guided Bayesian Optimisation for Human-in-the-loop Experimental\n Design of Known Systems","summary":" Domain experts often possess valuable physical insights that are overlooked\nin fully automated decision-making processes such as Bayesian optimisation. In\nthis article we apply high-throughput (batch) Bayesian optimisation alongside\nanthropological decision theory to enable domain experts to influence the\nselection of optimal experiments. Our methodology exploits the hypothesis that\nhumans are better at making discrete choices than continuous ones and enables\nexperts to influence critical early decisions. At each iteration we solve an\naugmented multi-objective optimisation problem across a number of alternate\nsolutions, maximising both the sum of their utility function values and the\ndeterminant of their covariance matrix, equivalent to their total variability.\nBy taking the solution at the knee point of the Pareto front, we return a set\nof alternate solutions at each iteration that have both high utility values and\nare reasonably distinct, from which the expert selects one for evaluation. We\ndemonstrate that even in the case of an uninformed practitioner, our algorithm\nrecovers the regret of standard Bayesian optimisation.\n","authors":["Tom Savage","Ehecatl Antonio del Rio Chanona"],"pdf_url":"https://arxiv.org/pdf/2312.02852v1.pdf","comment":"NeurIPS 2023 Workshop on Adaptive Experimental Design and Active\n Learning in the Real World. 
Main text: 6 pages"},{"id":"http://arxiv.org/abs/2301.03962v2","updated":"2023-12-05T16:09:24Z","published":"2023-01-10T13:51:07Z","title":"A Unified Theory of Diversity in Ensemble Learning","summary":" We present a theory of ensemble diversity, explaining the nature of diversity\nfor a wide range of supervised learning scenarios. This challenge, of\nunderstanding ensemble diversity, has been referred to as the \"holy grail\" of\nensemble learning, an open research issue for over 30 years. Our framework\nreveals that diversity is in fact a hidden dimension in the bias-variance\ndecomposition of the ensemble loss. We prove a family of exact\nbias-variance-diversity decompositions, for both regression and classification,\ne.g., squared, cross-entropy, and Poisson losses. For losses where an additive\nbias-variance decomposition is not available (e.g., 0/1 loss) we present an\nalternative approach, which precisely quantifies the effects of diversity,\nturning out to be dependent on the label distribution. Experiments show how we\ncan use our framework to understand the diversity-encouraging mechanisms of\npopular methods: Bagging, Boosting, and Random Forests.\n","authors":["Danny Wood","Tingting Mu","Andrew Webb","Henry Reeve","Mikel Lujan","Gavin Brown"],"pdf_url":"https://arxiv.org/pdf/2301.03962v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02850v1","updated":"2023-12-05T16:06:23Z","published":"2023-12-05T16:06:23Z","title":"A Kernel-Based Neural Network Test for High-dimensional Sequencing Data\n Analysis","summary":" The recent development of artificial intelligence (AI) technology, especially\nthe advance of deep neural network (DNN) technology, has revolutionized many\nfields. While DNN plays a central role in modern AI technology, it has been\nrarely used in sequencing data analysis due to challenges brought by\nhigh-dimensional sequencing data (e.g., overfitting). Moreover, due to the\ncomplexity of neural networks and their unknown limiting distributions,\nbuilding association tests on neural networks for genetic association analysis\nremains a great challenge. To address these challenges and fill the important\ngap of using AI in high-dimensional sequencing data analysis, we introduce a\nnew kernel-based neural network (KNN) test for complex association analysis of\nsequencing data. The test is built on our previously developed KNN framework,\nwhich uses random effects to model the overall effects of high-dimensional\ngenetic data and adopts kernel-based neural network structures to model complex\ngenotype-phenotype relationships. Based on KNN, a Wald-type test is then\nintroduced to evaluate the joint association of high-dimensional genetic data\nwith a disease phenotype of interest, considering non-linear and non-additive\neffects (e.g., interaction effects). Through simulations, we demonstrated that\nour proposed method attained higher power compared to the sequence kernel\nassociation test (SKAT), especially in the presence of non-linear and\ninteraction effects. 
Finally, we apply the methods to the whole genome\nsequencing (WGS) dataset from the Alzheimer's Disease Neuroimaging Initiative\n(ADNI) study, investigating new genes associated with the hippocampal volume\nchange over time.\n","authors":["Tingting Hou","Chang Jiang","Qing Lu"],"pdf_url":"https://arxiv.org/pdf/2312.02850v1.pdf","comment":"31 pages, 5 figures and 3 tabels"},{"id":"http://arxiv.org/abs/2301.03573v2","updated":"2023-12-05T16:05:00Z","published":"2023-01-09T18:50:03Z","title":"Balance is Essence: Accelerating Sparse Training via Adaptive Gradient\n Correction","summary":" Despite impressive performance, deep neural networks require significant\nmemory and computation costs, prohibiting their application in\nresource-constrained scenarios. Sparse training is one of the most common\ntechniques to reduce these costs, however, the sparsity constraints add\ndifficulty to the optimization, resulting in an increase in training time and\ninstability. In this work, we aim to overcome this problem and achieve\nspace-time co-efficiency. To accelerate and stabilize the convergence of sparse\ntraining, we analyze the gradient changes and develop an adaptive gradient\ncorrection method. Specifically, we approximate the correlation between the\ncurrent and previous gradients, which is used to balance the two gradients to\nobtain a corrected gradient. Our method can be used with the most popular\nsparse training pipelines under both standard and adversarial setups.\nTheoretically, we prove that our method can accelerate the convergence rate of\nsparse training. Extensive experiments on multiple datasets, model\narchitectures, and sparsities demonstrate that our method outperforms leading\nsparse training methods by up to \\textbf{5.0\\%} in accuracy given the same\nnumber of training epochs, and reduces the number of training epochs by up to\n\\textbf{52.1\\%} to achieve the same accuracy. Our code is available on:\n\\url{https://github.com/StevenBoys/AGENT}.\n","authors":["Bowen Lei","Dongkuan Xu","Ruqi Zhang","Shuren He","Bani K. Mallick"],"pdf_url":"https://arxiv.org/pdf/2301.03573v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18743v3","updated":"2023-12-05T16:04:15Z","published":"2023-11-30T17:41:30Z","title":"AlignBench: Benchmarking Chinese Alignment of Large Language Models","summary":" Alignment has become a critical step for instruction-tuned Large Language\nModels (LLMs) to become helpful assistants. However, effective evaluation of\nalignment for emerging Chinese LLMs is still significantly lacking, calling for\nreal-scenario grounded, open-ended, challenging and automatic evaluations\ntailored for alignment. To fill in this gap, we introduce AlignBench, a\ncomprehensive multi-dimensional benchmark for evaluating LLMs' alignment in\nChinese. Equipped with a human-in-the-loop data curation pipeline, our\nbenchmark employs a rule-calibrated multi-dimensional LLM-as-Judge with\nChain-of-Thought to generate explanations and final ratings as evaluations,\nensuring high reliability and interpretability. Furthermore, we report\nAlignBench evaluated by CritiqueLLM, a dedicated Chinese evaluator LLM that\nrecovers 95% of GPT-4's evaluation ability. We will provide public APIs for\nevaluating AlignBench with CritiqueLLM to facilitate the evaluation of LLMs'\nChinese alignment. 
All evaluation codes, data, and LLM generations are\navailable at \\url{https://github.com/THUDM/AlignBench}.\n","authors":["Xiao Liu","Xuanyu Lei","Shengyuan Wang","Yue Huang","Zhuoer Feng","Bosi Wen","Jiale Cheng","Pei Ke","Yifan Xu","Weng Lam Tam","Xiaohan Zhang","Lichao Sun","Hongning Wang","Jing Zhang","Minlie Huang","Yuxiao Dong","Jie Tang"],"pdf_url":"https://arxiv.org/pdf/2311.18743v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02849v1","updated":"2023-12-05T16:02:04Z","published":"2023-12-05T16:02:04Z","title":"Algorithms for mean-field variational inference via polyhedral\n optimization in the Wasserstein space","summary":" We develop a theory of finite-dimensional polyhedral subsets over the\nWasserstein space and optimization of functionals over them via first-order\nmethods. Our main application is to the problem of mean-field variational\ninference, which seeks to approximate a distribution $\\pi$ over $\\mathbb{R}^d$\nby a product measure $\\pi^\\star$. When $\\pi$ is strongly log-concave and\nlog-smooth, we provide (1) approximation rates certifying that $\\pi^\\star$ is\nclose to the minimizer $\\pi^\\star_\\diamond$ of the KL divergence over a\n\\emph{polyhedral} set $\\mathcal{P}_\\diamond$, and (2) an algorithm for\nminimizing $\\text{KL}(\\cdot\\|\\pi)$ over $\\mathcal{P}_\\diamond$ with accelerated\ncomplexity $O(\\sqrt \\kappa \\log(\\kappa d/\\varepsilon^2))$, where $\\kappa$ is\nthe condition number of $\\pi$.\n","authors":["Yiheng Jiang","Sinho Chewi","Aram-Alexandre Pooladian"],"pdf_url":"https://arxiv.org/pdf/2312.02849v1.pdf","comment":"40 pages"},{"id":"http://arxiv.org/abs/2312.02843v1","updated":"2023-12-05T15:53:24Z","published":"2023-12-05T15:53:24Z","title":"Are Vision Transformers More Data Hungry Than Newborn Visual Systems?","summary":" Vision transformers (ViTs) are top performing models on many computer vision\nbenchmarks and can accurately predict human behavior on object recognition\ntasks. However, researchers question the value of using ViTs as models of\nbiological learning because ViTs are thought to be more data hungry than\nbrains, with ViTs requiring more training data to reach similar levels of\nperformance. To test this assumption, we directly compared the learning\nabilities of ViTs and animals, by performing parallel controlled rearing\nexperiments on ViTs and newborn chicks. We first raised chicks in impoverished\nvisual environments containing a single object, then simulated the training\ndata available in those environments by building virtual animal chambers in a\nvideo game engine. We recorded the first-person images acquired by agents\nmoving through the virtual chambers and used those images to train self\nsupervised ViTs that leverage time as a teaching signal, akin to biological\nvisual systems. When ViTs were trained through the eyes of newborn chicks, the\nViTs solved the same view invariant object recognition tasks as the chicks.\nThus, ViTs were not more data hungry than newborn visual systems: both learned\nview invariant object representations in impoverished visual environments. The\nflexible and generic attention based learning mechanism in ViTs combined with\nthe embodied data streams available to newborn animals appears sufficient to\ndrive the development of animal-like object recognition.\n","authors":["Lalit Pandey","Samantha M. W. Wood","Justin N. 
Wood"],"pdf_url":"https://arxiv.org/pdf/2312.02843v1.pdf","comment":"Accepted in Thirty-seventh Conference on Neural Information\n Processing Systems (NeurIPS 2023)"},{"id":"http://arxiv.org/abs/2308.08410v2","updated":"2023-12-05T15:52:36Z","published":"2023-08-16T14:57:12Z","title":"Digital twinning of cardiac electrophysiology models from the surface\n ECG: a geodesic backpropagation approach","summary":" The eikonal equation has become an indispensable tool for modeling cardiac\nelectrical activation accurately and efficiently. In principle, by matching\nclinically recorded and eikonal-based electrocardiograms (ECGs), it is possible\nto build patient-specific models of cardiac electrophysiology in a purely\nnon-invasive manner. Nonetheless, the fitting procedure remains a challenging\ntask. The present study introduces a novel method, Geodesic-BP, to solve the\ninverse eikonal problem. Geodesic-BP is well-suited for GPU-accelerated machine\nlearning frameworks, allowing us to optimize the parameters of the eikonal\nequation to reproduce a given ECG. We show that Geodesic-BP can reconstruct a\nsimulated cardiac activation with high accuracy in a synthetic test case, even\nin the presence of modeling inaccuracies. Furthermore, we apply our algorithm\nto a publicly available dataset of a biventricular rabbit model, with promising\nresults. Given the future shift towards personalized medicine, Geodesic-BP has\nthe potential to help in future functionalizations of cardiac models meeting\nclinical time constraints while maintaining the physiological accuracy of\nstate-of-the-art cardiac models.\n","authors":["Thomas Grandits","Jan Verhülsdonk","Gundolf Haase","Alexander Effland","Simone Pezzuto"],"pdf_url":"https://arxiv.org/pdf/2308.08410v2.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.02829v1","updated":"2023-12-05T15:25:45Z","published":"2023-12-05T15:25:45Z","title":"MIMONets: Multiple-Input-Multiple-Output Neural Networks Exploiting\n Computation in Superposition","summary":" With the advent of deep learning, progressively larger neural networks have\nbeen designed to solve complex tasks. We take advantage of these capacity-rich\nmodels to lower the cost of inference by exploiting computation in\nsuperposition. To reduce the computational burden per input, we propose\nMultiple-Input-Multiple-Output Neural Networks (MIMONets) capable of handling\nmany inputs at once. MIMONets augment various deep neural network architectures\nwith variable binding mechanisms to represent an arbitrary number of inputs in\na compositional data structure via fixed-width distributed representations.\nAccordingly, MIMONets adapt nonlinear neural transformations to process the\ndata structure holistically, leading to a speedup nearly proportional to the\nnumber of superposed input items in the data structure. After processing in\nsuperposition, an unbinding mechanism recovers each transformed input of\ninterest. MIMONets also provide a dynamic trade-off between accuracy and\nthroughput by an instantaneous on-demand switching between a set of\naccuracy-throughput operating points, yet within a single set of fixed\nparameters. We apply the concept of MIMONets to both CNN and Transformer\narchitectures resulting in MIMOConv and MIMOFormer, respectively. Empirical\nevaluations show that MIMOConv achieves about 2-4 x speedup at an accuracy\ndelta within [+0.68, -3.18]% compared to WideResNet CNNs on CIFAR10 and\nCIFAR100. 
Similarly, MIMOFormer can handle 2-4 inputs at once while maintaining\na high average accuracy within a [-1.07, -3.43]% delta on the long range arena\nbenchmark. Finally, we provide mathematical bounds on the interference between\nsuperposition channels in MIMOFormer. Our code is available at\nhttps://github.com/IBM/multiple-input-multiple-output-nets.\n","authors":["Nicolas Menet","Michael Hersche","Geethan Karunaratne","Luca Benini","Abu Sebastian","Abbas Rahimi"],"pdf_url":"https://arxiv.org/pdf/2312.02829v1.pdf","comment":"accepted in NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.02828v1","updated":"2023-12-05T15:22:39Z","published":"2023-12-05T15:22:39Z","title":"Convergence Rates for Stochastic Approximation: Biased Noise with\n Unbounded Variance, and Applications","summary":" The Stochastic Approximation (SA) algorithm introduced by Robbins and Monro\nin 1951 has been a standard method for solving equations of the form\n$\\mathbf{f}({\\boldsymbol {\\theta}}) = \\mathbf{0}$, when only noisy measurements\nof $\\mathbf{f}(\\cdot)$ are available. If $\\mathbf{f}({\\boldsymbol {\\theta}}) =\n\\nabla J({\\boldsymbol {\\theta}})$ for some function $J(\\cdot)$, then SA can\nalso be used to find a stationary point of $J(\\cdot)$. In much of the\nliterature, it is assumed that the error term ${\\boldsymbol {xi}}_{t+1}$ has\nzero conditional mean, and that its conditional variance is bounded as a\nfunction of $t$ (though not necessarily with respect to ${\\boldsymbol\n{\\theta}}_t$). Also, for the most part, the emphasis has been on\n``synchronous'' SA, whereby, at each time $t$, \\textit{every} component of\n${\\boldsymbol {\\theta}}_t$ is updated. Over the years, SA has been applied to a\nvariety of areas, out of which two are the focus in this paper: Convex and\nnonconvex optimization, and Reinforcement Learning (RL). As it turns out, in\nthese applications, the above-mentioned assumptions do not always hold. In\nzero-order methods, the error neither has zero mean nor bounded conditional\nvariance. In the present paper, we extend SA theory to encompass errors with\nnonzero conditional mean and/or unbounded conditional variance, and also\nasynchronous SA. In addition, we derive estimates for the rate of convergence\nof the algorithm. Then we apply the new results to problems in nonconvex\noptimization, and to Markovian SA, a recently emerging area in RL. We prove\nthat SA converges in these situations, and compute the ``optimal step size\nsequences'' to maximize the estimated rate of convergence.\n","authors":["Rajeeva L. Karandikar","M. Vidyasagar"],"pdf_url":"https://arxiv.org/pdf/2312.02828v1.pdf","comment":"40 pages, 3 figures"},{"id":"http://arxiv.org/abs/2312.02826v1","updated":"2023-12-05T15:19:29Z","published":"2023-12-05T15:19:29Z","title":"Calibrated Adaptive Teacher for Domain Adaptive Intelligent Fault\n Diagnosis","summary":" Intelligent Fault Diagnosis (IFD) based on deep learning has proven to be an\neffective and flexible solution, attracting extensive research. Deep neural\nnetworks can learn rich representations from vast amounts of representative\nlabeled data for various applications. In IFD, they achieve high classification\nperformance from signals in an end-to-end manner, without requiring extensive\ndomain knowledge. However, deep learning models usually only perform well on\nthe data distribution they have been trained on. When applied to a different\ndistribution, they may experience performance drops. 
This is also observed in\nIFD, where assets are often operated in working conditions different from those\nin which labeled data have been collected. Unsupervised domain adaptation (UDA)\ndeals with the scenario where labeled data are available in a source domain,\nand only unlabeled data are available in a target domain, where domains may\ncorrespond to operating conditions. Recent methods rely on training with\nconfident pseudo-labels for target samples. However, the confidence-based\nselection of pseudo-labels is hindered by poorly calibrated confidence\nestimates in the target domain, primarily due to over-confident predictions,\nwhich limits the quality of pseudo-labels and leads to error accumulation. In\nthis paper, we propose a novel UDA method called Calibrated Adaptive Teacher\n(CAT), where we propose to calibrate the predictions of the teacher network\nthroughout the self-training process, leveraging post-hoc calibration\ntechniques. We evaluate CAT on domain-adaptive IFD and perform extensive\nexperiments on the Paderborn benchmark for bearing fault diagnosis under\nvarying operating conditions. Our proposed method achieves state-of-the-art\nperformance on most transfer tasks.\n","authors":["Florent Forest","Olga Fink"],"pdf_url":"https://arxiv.org/pdf/2312.02826v1.pdf","comment":"23 pages. Under review"},{"id":"http://arxiv.org/abs/2311.15218v2","updated":"2023-12-05T14:49:36Z","published":"2023-11-26T07:19:10Z","title":"Real-Time Online Stock Forecasting Utilizing Integrated Quantitative and\n Qualitative Analysis","summary":" The application of Machine learning to finance has become a familiar\napproach, even more so in stock market forecasting. The stock market is highly\nvolatile and huge amounts of data are generated every minute globally. The\nextraction of effective intelligence from this data is of critical importance.\nHowever, a collaboration of numerical stock data with qualitative text data can\nbe a challenging task. In this work, we accomplish this and provide an\nunprecedented, publicly available dataset with technical and fundamental data,\nsentiment that we gathered from News Archives, TV news captions, Radio\nTranscripts, Tweets, Daily financial newspapers, etc. The text data entries\nused for sentiment extraction total more than 1.4 Million. The dataset consists\nof daily entries from January 2018 to December 2022 for 8 companies\nrepresenting diverse industrial sectors and the Dow Jones Industrial Average\n(DJIA) as a whole. Holistic Fundamental and Technical data is provided training\nready for Model learning and deployment. The data generated could be used for\nIncremental online learning with real-time data points retrieved daily, since\nthere was no stagnant data utilized, all the data was retrieved from APIs or\nself-designed scripts. Moreover, the utilization of Spearman's rank correlation\nover real-time data, linking stock returns with sentiment analysis has produced\nnoteworthy results for the DJIA achieving accuracy levels surpassing 60\\%. 
The\ndataset is made available at https://github.com/batking24/Huge-Stock-Dataset\n","authors":["Sai Akash Bathini","Dagli Cihan"],"pdf_url":"https://arxiv.org/pdf/2311.15218v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02804v1","updated":"2023-12-05T14:44:58Z","published":"2023-12-05T14:44:58Z","title":"Score-Aware Policy-Gradient Methods and Performance Guarantees using\n Local Lyapunov Conditions: Applications to Product-Form Stochastic Networks\n and Queueing Systems","summary":" Stochastic networks and queueing systems often lead to Markov decision\nprocesses (MDPs) with large state and action spaces as well as nonconvex\nobjective functions, which hinders the convergence of many reinforcement\nlearning (RL) algorithms. Policy-gradient methods perform well on MDPs with\nlarge state and action spaces, but they sometimes experience slow convergence\ndue to the high variance of the gradient estimator. In this paper, we show that\nsome of these difficulties can be circumvented by exploiting the structure of\nthe underlying MDP. We first introduce a new family of gradient estimators\ncalled score-aware gradient estimators (SAGEs). When the stationary\ndistribution of the MDP belongs to an exponential family parametrized by the\npolicy parameters, SAGEs allow us to estimate the policy gradient without\nrelying on value-function estimation, contrary to classical policy-gradient\nmethods like actor-critic. To demonstrate their applicability, we examine two\ncommon control problems arising in stochastic networks and queueing systems\nwhose stationary distributions have a product-form, a special case of\nexponential families. As a second contribution, we show that, under appropriate\nassumptions, the policy under a SAGE-based policy-gradient method has a large\nprobability of converging to an optimal policy, provided that it starts\nsufficiently close to it, even with a nonconvex objective function and multiple\nmaximizers. Our key assumptions are that, locally around a maximizer, a\nnondegeneracy property of the Hessian of the objective function holds and a\nLyapunov function exists. Finally, we conduct a numerical comparison between a\nSAGE-based policy-gradient method and an actor-critic algorithm. The results\ndemonstrate that the SAGE-based method finds close-to-optimal policies more\nrapidly, highlighting its superior performance over the traditional\nactor-critic method.\n","authors":["Céline Comte","Matthieu Jonckheere","Jaron Sanders","Albert Senen-Cerda"],"pdf_url":"https://arxiv.org/pdf/2312.02804v1.pdf","comment":"45 pages, 5 figures"},{"id":"http://arxiv.org/abs/2202.06467v2","updated":"2023-12-05T14:42:31Z","published":"2022-02-14T03:01:05Z","title":"NeuroMixGDP: A Neural Collapse-Inspired Random Mixup for Private Data\n Release","summary":" Privacy-preserving data release algorithms have gained increasing attention\nfor their ability to protect user privacy while enabling downstream machine\nlearning tasks. However, the utility of current popular algorithms is not\nalways satisfactory. Mixup of raw data provides a new way of data augmentation,\nwhich can help improve utility. However, its performance drastically\ndeteriorates when differential privacy (DP) noise is added. To address this\nissue, this paper draws inspiration from the recently observed Neural Collapse\n(NC) phenomenon, which states that the last layer features of a neural network\nconcentrate on the vertices of a simplex as Equiangular Tight Frame (ETF). 
We\npropose a scheme to mixup the Neural Collapse features to exploit the ETF\nsimplex structure and release noisy mixed features to enhance the utility of\nthe released data. By using Gaussian Differential Privacy (GDP), we obtain an\nasymptotic rate for the optimal mixup degree. To further enhance the utility\nand address the label collapse issue when the mixup degree is large, we propose\na Hierarchical sampling method to stratify the mixup samples on a small number\nof classes. This method remarkably improves utility when the number of classes\nis large. Extensive experiments demonstrate the effectiveness of our proposed\nmethod in protecting against attacks and improving utility. In particular, our\napproach shows significantly improved utility compared to directly training\nclassification networks with DPSGD on CIFAR100 and MiniImagenet datasets,\nhighlighting the benefits of using privacy-preserving data release. We release\nreproducible code in https://github.com/Lidonghao1996/NeuroMixGDP.\n","authors":["Donghao Li","Yang Cao","Yuan Yao"],"pdf_url":"https://arxiv.org/pdf/2202.06467v2.pdf","comment":"28 pages, 9 figures"},{"id":"http://arxiv.org/abs/2312.02798v1","updated":"2023-12-05T14:35:11Z","published":"2023-12-05T14:35:11Z","title":"Weakly Supervised Detection of Hallucinations in LLM Activations","summary":" We propose an auditing method to identify whether a large language model\n(LLM) encodes patterns such as hallucinations in its internal states, which may\npropagate to downstream tasks. We introduce a weakly supervised auditing\ntechnique using a subset scanning approach to detect anomalous patterns in LLM\nactivations from pre-trained models. Importantly, our method does not need\nknowledge of the type of patterns a-priori. Instead, it relies on a reference\ndataset devoid of anomalies during testing. Further, our approach enables the\nidentification of pivotal nodes responsible for encoding these patterns, which\nmay offer crucial insights for fine-tuning specific sub-networks for bias\nmitigation. We introduce two new scanning methods to handle LLM activations for\nanomalous sentences that may deviate from the expected distribution in either\ndirection. Our results confirm prior findings of BERT's limited internal\ncapacity for encoding hallucinations, while OPT appears capable of encoding\nhallucination information internally. Importantly, our scanning approach,\nwithout prior exposure to false statements, performs comparably to a fully\nsupervised out-of-distribution classifier.\n","authors":["Miriam Rateike","Celia Cintas","John Wamburu","Tanya Akumu","Skyler Speakman"],"pdf_url":"https://arxiv.org/pdf/2312.02798v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02796v1","updated":"2023-12-05T14:29:18Z","published":"2023-12-05T14:29:18Z","title":"Materials Expert-Artificial Intelligence for Materials Discovery","summary":" The advent of material databases provides an unprecedented opportunity to\nuncover predictive descriptors for emergent material properties from vast data\nspace. However, common reliance on high-throughput ab initio data necessarily\ninherits limitations of such data: mismatch with experiments. On the other\nhand, experimental decisions are often guided by an expert's intuition honed\nfrom experiences that are rarely articulated. We propose using machine learning\nto \"bottle\" such operational intuition into quantifiable descriptors using\nexpertly curated measurement-based data. 
We introduce \"Materials\nExpert-Artificial Intelligence\" (ME-AI) to encapsulate and articulate this\nhuman intuition. As a first step towards such a program, we focus on the\ntopological semimetal (TSM) among square-net materials as the property inspired\nby the expert-identified descriptor based on structural information: the\ntolerance factor. We start by curating a dataset encompassing 12 primary\nfeatures of 879 square-net materials, using experimental data whenever\npossible. We then use Dirichlet-based Gaussian process regression using a\nspecialized kernel to reveal composite descriptors for square-net topological\nsemimetals. The ME-AI learned descriptors independently reproduce expert\nintuition and expand upon it. Specifically, new descriptors point to\nhypervalency as a critical chemical feature predicting TSM within square-net\ncompounds. Our success with a carefully defined problem points to the \"machine\nbottling human insight\" approach as promising for machine learning-aided\nmaterial discovery.\n","authors":["Yanjun Liu","Milena Jovanovic","Krishnanand Mallayya","Wesley J. Maddox","Andrew Gordon Wilson","Sebastian Klemenz","Leslie M. Schoop","Eun-Ah Kim"],"pdf_url":"https://arxiv.org/pdf/2312.02796v1.pdf","comment":"8 pages main text, 4 figs, 8 pages Supplementary material"},{"id":"http://arxiv.org/abs/2312.02786v1","updated":"2023-12-05T14:16:13Z","published":"2023-12-05T14:16:13Z","title":"Machine Learning Driven Sensitivity Analysis of E3SM Land Model\n Parameters for Wetland Methane Emissions","summary":" Methane (CH4) is the second most critical greenhouse gas after carbon\ndioxide, contributing to 16-25% of the observed atmospheric warming. Wetlands\nare the primary natural source of methane emissions globally. However, wetland\nmethane emission estimates from biogeochemistry models contain considerable\nuncertainty. One of the main sources of this uncertainty arises from the\nnumerous uncertain model parameters within various physical, biological, and\nchemical processes that influence methane production, oxidation, and transport.\nSensitivity Analysis (SA) can help identify critical parameters for methane\nemission and achieve reduced biases and uncertainties in future projections.\nThis study performs SA for 19 selected parameters responsible for critical\nbiogeochemical processes in the methane module of the Energy Exascale Earth\nSystem Model (E3SM) land model (ELM). The impact of these parameters on various\nCH4 fluxes is examined at 14 FLUXNET- CH4 sites with diverse vegetation types.\nGiven the extensive number of model simulations needed for global\nvariance-based SA, we employ a machine learning (ML) algorithm to emulate the\ncomplex behavior of ELM methane biogeochemistry. ML enables the computational\ntime to be shortened significantly from 6 CPU hours to 0.72 milliseconds,\nachieving reduced computational costs. We found that parameters linked to CH4\nproduction and diffusion generally present the highest sensitivities despite\napparent seasonal variation. Comparing simulated emissions from perturbed\nparameter sets against FLUXNET-CH4 observations revealed that better\nperformances can be achieved at each site compared to the default parameter\nvalues. 
This presents a scope for further improving simulated emissions using\nparameter calibration with advanced optimization techniques like Bayesian\noptimization.\n","authors":["Sandeep Chinta","Xiang Gao","Qing Zhu"],"pdf_url":"https://arxiv.org/pdf/2312.02786v1.pdf","comment":"24 pages, 9 figures and 2 tables"},{"id":"http://arxiv.org/abs/2312.02783v1","updated":"2023-12-05T14:14:27Z","published":"2023-12-05T14:14:27Z","title":"Large Language Models on Graphs: A Comprehensive Survey","summary":" Large language models (LLMs), such as ChatGPT and LLaMA, are creating\nsignificant advancements in natural language processing, due to their strong\ntext encoding/decoding ability and newly found emergent capability (e.g.,\nreasoning). While LLMs are mainly designed to process pure texts, there are\nmany real-world scenarios where text data are associated with rich structure\ninformation in the form of graphs (e.g., academic networks, and e-commerce\nnetworks) or scenarios where graph data are paired with rich textual\ninformation (e.g., molecules with descriptions). Besides, although LLMs have\nshown their pure text-based reasoning ability, it is underexplored whether such\nability can be generalized to graph scenarios (i.e., graph-based reasoning). In\nthis paper, we provide a systematic review of scenarios and techniques related\nto large language models on graphs. We first summarize potential scenarios of\nadopting LLMs on graphs into three categories, namely pure graphs, text-rich\ngraphs, and text-paired graphs. We then discuss detailed techniques for\nutilizing LLMs on graphs, including LLM as Predictor, LLM as Encoder, and LLM\nas Aligner, and compare the advantages and disadvantages of different schools\nof models. Furthermore, we mention the real-world applications of such methods\nand summarize open-source codes and benchmark datasets. Finally, we conclude\nwith potential future research directions in this fast-growing field. The\nrelated source can be found at\nhttps://github.com/PeterGriffinJin/Awesome-Language-Model-on-Graphs.\n","authors":["Bowen Jin","Gang Liu","Chi Han","Meng Jiang","Heng Ji","Jiawei Han"],"pdf_url":"https://arxiv.org/pdf/2312.02783v1.pdf","comment":"26 pages"},{"id":"http://arxiv.org/abs/2312.02780v1","updated":"2023-12-05T14:12:15Z","published":"2023-12-05T14:12:15Z","title":"Scaling Laws for Adversarial Attacks on Language Model Activations","summary":" We explore a class of adversarial attacks targeting the activations of\nlanguage models. By manipulating a relatively small subset of model\nactivations, $a$, we demonstrate the ability to control the exact prediction of\na significant number (in some cases up to 1000) of subsequent tokens $t$. We\nempirically verify a scaling law where the maximum number of target tokens\n$t_\\mathrm{max}$ predicted depends linearly on the number of tokens $a$ whose\nactivations the attacker controls as $t_\\mathrm{max} = \\kappa a$. We find that\nthe number of bits of control in the input space needed to control a single bit\nin the output space (what we call attack resistance $\\chi$) is remarkably\nconstant between $\\approx 16$ and $\\approx 25$ over 2 orders of magnitude of\nmodel sizes for different language models. Compared to attacks on tokens,\nattacks on activations are predictably much stronger, however, we identify a\nsurprising regularity where one bit of input steered either via activations or\nvia tokens is able to exert control over a similar amount of output bits. 
This\ngives support for the hypothesis that adversarial attacks are a consequence of\ndimensionality mismatch between the input and output spaces. A practical\nimplication of the ease of attacking language model activations instead of\ntokens is for multi-modal and selected retrieval models, where additional data\nsources are added as activations directly, sidestepping the tokenized input.\nThis opens up a new, broad attack surface. By using language models as a\ncontrollable test-bed to study adversarial attacks, we were able to experiment\nwith input-output dimensions that are inaccessible in computer vision,\nespecially where the output dimension dominates.\n","authors":["Stanislav Fort"],"pdf_url":"https://arxiv.org/pdf/2312.02780v1.pdf","comment":"15 pages, 9 figures"},{"id":"http://arxiv.org/abs/2312.02770v1","updated":"2023-12-05T14:00:32Z","published":"2023-12-05T14:00:32Z","title":"Learning \"Look-Ahead\" Nonlocal Traffic Dynamics in a Ring Road","summary":" The macroscopic traffic flow model is widely used for traffic control and\nmanagement. To incorporate drivers' anticipative behaviors and to remove\nimpractical speed discontinuity inherent in the classic\nLighthill-Whitham-Richards (LWR) traffic model, nonlocal partial differential\nequation (PDE) models with ``look-ahead\" dynamics have been proposed, which\nassume that the speed is a function of weighted downstream traffic density.\nHowever, it lacks data validation on two important questions: whether there\nexist nonlocal dynamics, and how the length and weight of the ``look-ahead\"\nwindow affect the spatial temporal propagation of traffic densities. In this\npaper, we adopt traffic trajectory data from a ring-road experiment and design\na physics-informed neural network to learn the fundamental diagram and\nlook-ahead kernel that best fit the data, and reinvent a data-enhanced nonlocal\nLWR model via minimizing the loss function combining the data discrepancy and\nthe nonlocal model discrepancy. Results show that the learned nonlocal LWR\nyields a more accurate prediction of traffic wave propagation in three\ndifferent scenarios: stop-and-go oscillations, congested, and free traffic. We\nfirst demonstrate the existence of ``look-ahead\" effect with real traffic data.\nThe optimal nonlocal kernel is found out to take a length of around 35 to 50\nmeters, and the kernel weight within 5 meters accounts for the majority of the\nnonlocal effect. Our results also underscore the importance of choosing a\npriori physics in machine learning models.\n","authors":["Chenguang Zhao","Huan Yu"],"pdf_url":"https://arxiv.org/pdf/2312.02770v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.02016v4","updated":"2023-12-05T13:44:04Z","published":"2022-07-05T12:56:08Z","title":"Robust Reinforcement Learning in Continuous Control Tasks with\n Uncertainty Set Regularization","summary":" Reinforcement learning (RL) is recognized as lacking generalization and\nrobustness under environmental perturbations, which excessively restricts its\napplication for real-world robotics. Prior work claimed that adding\nregularization to the value function is equivalent to learning a robust policy\nwith uncertain transitions. Although the regularization-robustness\ntransformation is appealing for its simplicity and efficiency, it is still\nlacking in continuous control tasks. 
In this paper, we propose a new\nregularizer named $\\textbf{U}$ncertainty $\\textbf{S}$et $\\textbf{R}$egularizer\n(USR), by formulating the uncertainty set on the parameter space of the\ntransition function. In particular, USR is flexible enough to be plugged into\nany existing RL framework. To deal with unknown uncertainty sets, we further\npropose a novel adversarial approach to generate them based on the value\nfunction. We evaluate USR on the Real-world Reinforcement Learning (RWRL)\nbenchmark, demonstrating improvements in the robust performance for perturbed\ntesting environments.\n","authors":["Yuan Zhang","Jianhong Wang","Joschka Boedecker"],"pdf_url":"https://arxiv.org/pdf/2207.02016v4.pdf","comment":"Accepted at CoRL 2023"},{"id":"http://arxiv.org/abs/2308.11068v2","updated":"2023-12-05T13:42:53Z","published":"2023-08-21T22:26:21Z","title":"Topological Graph Signal Compression","summary":" Recently emerged Topological Deep Learning (TDL) methods aim to extend\ncurrent Graph Neural Networks (GNN) by naturally processing higher-order\ninteractions, going beyond the pairwise relations and local neighborhoods\ndefined by graph representations. In this paper we propose a novel TDL-based\nmethod for compressing signals over graphs, consisting in two main steps:\nfirst, disjoint sets of higher-order structures are inferred based on the\noriginal signal --by clustering $N$ datapoints into $K\\ll N$ collections; then,\na topological-inspired message passing gets a compressed representation of the\nsignal within those multi-element sets. Our results show that our framework\nimproves both standard GNN and feed-forward architectures in compressing\ntemporal link-based signals from two real-word Internet Service Provider\nNetworks' datasets --from $30\\%$ up to $90\\%$ better reconstruction errors\nacross all evaluation scenarios--, suggesting that it better captures and\nexploits spatial and temporal correlations over the whole graph-based network\nstructure.\n","authors":["Guillermo Bernárdez","Lev Telyatnikov","Eduard Alarcón","Albert Cabellos-Aparicio","Pere Barlet-Ros","Pietro Liò"],"pdf_url":"https://arxiv.org/pdf/2308.11068v2.pdf","comment":"Accepted as Oral at the Second Learning on Graphs Conference (LoG\n 2023). The recording of the talk can be found in\n https://www.youtube.com/watch?v=OcruIkiRkiU"},{"id":"http://arxiv.org/abs/2312.02753v1","updated":"2023-12-05T13:28:59Z","published":"2023-12-05T13:28:59Z","title":"C3: High-performance and low-complexity neural compression from a single\n image or video","summary":" Most neural compression models are trained on large datasets of images or\nvideos in order to generalize to unseen data. Such generalization typically\nrequires large and expressive architectures with a high decoding complexity.\nHere we introduce C3, a neural compression method with strong rate-distortion\n(RD) performance that instead overfits a small model to each image or video\nseparately. The resulting decoding complexity of C3 can be an order of\nmagnitude lower than neural baselines with similar RD performance. C3 builds on\nCOOL-CHIC (Ladune et al.) and makes several simple and effective improvements\nfor images. We further develop new methodology to apply C3 to videos. 
On the\nCLIC2020 image benchmark, we match the RD performance of VTM, the reference\nimplementation of the H.266 codec, with less than 3k MACs/pixel for decoding.\nOn the UVG video benchmark, we match the RD performance of the Video\nCompression Transformer (Mentzer et al.), a well-established neural video\ncodec, with less than 5k MACs/pixel for decoding.\n","authors":["Hyunjik Kim","Matthias Bauer","Lucas Theis","Jonathan Richard Schwarz","Emilien Dupont"],"pdf_url":"https://arxiv.org/pdf/2312.02753v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.10988v2","updated":"2023-12-05T13:23:55Z","published":"2023-07-20T16:18:33Z","title":"On minimizing the training set fill distance in machine learning\n regression","summary":" For regression tasks one often leverages large datasets for training\npredictive machine learning models. However, using large datasets may not be\nfeasible due to computational limitations or high data labelling costs.\nTherefore, suitably selecting small training sets from large pools of\nunlabelled data points is essential to maximize model performance while\nmaintaining efficiency. In this work, we study Farthest Point Sampling (FPS), a\ndata selection approach that aims to minimize the fill distance of the selected\nset. We derive an upper bound for the maximum expected prediction error,\nconditional to the location of the unlabelled data points, that linearly\ndepends on the training set fill distance. For empirical validation, we perform\nexperiments using two regression models on three datasets. We empirically show\nthat selecting a training set by aiming to minimize the fill distance, thereby\nminimizing our derived bound, significantly reduces the maximum prediction\nerror of various regression models, outperforming alternative sampling\napproaches by a large margin. Furthermore, we show that selecting training sets\nwith the FPS can also increase model stability for the specific case of\nGaussian kernel regression approaches.\n","authors":["Paolo Climaco","Jochen Garcke"],"pdf_url":"https://arxiv.org/pdf/2307.10988v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02748v1","updated":"2023-12-05T13:23:15Z","published":"2023-12-05T13:23:15Z","title":"Compositional Generalization for Data-to-Text Generation","summary":" Data-to-text generation involves transforming structured data, often\nrepresented as predicate-argument tuples, into coherent textual descriptions.\nDespite recent advances, systems still struggle when confronted with unseen\ncombinations of predicates, producing unfaithful descriptions (e.g.\nhallucinations or omissions). We refer to this issue as compositional\ngeneralisation, and it encouraged us to create a benchmark for assessing the\nperformance of different approaches on this specific problem. Furthermore, we\npropose a novel model that addresses compositional generalization by clustering\npredicates into groups. Our model generates text in a sentence-by-sentence\nmanner, relying on one cluster of predicates at a time. 
This approach\nsignificantly outperforms T5 baselines across all evaluation metrics. Notably,\nit achieved a 31% improvement over T5 in terms of a metric focused on\nmaintaining faithfulness to the input.\n","authors":["Xinnuo Xu","Ivan Titov","Mirella Lapata"],"pdf_url":"https://arxiv.org/pdf/2312.02748v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.13616v2","updated":"2023-12-05T13:16:23Z","published":"2023-04-26T15:19:02Z","title":"CROP: Towards Distributional-Shift Robust Reinforcement Learning using\n Compact Reshaped Observation Processing","summary":" The safe application of reinforcement learning (RL) requires generalization\nfrom limited training data to unseen scenarios. Yet, fulfilling tasks under\nchanging circumstances is a key challenge in RL. Current state-of-the-art\napproaches for generalization apply data augmentation techniques to increase\nthe diversity of training data. Even though this prevents overfitting to the\ntraining environment(s), it hinders policy optimization. Crafting a suitable\nobservation, only containing crucial information, has been shown to be a\nchallenging task itself. To improve data efficiency and generalization\ncapabilities, we propose Compact Reshaped Observation Processing (CROP) to\nreduce the state information used for policy optimization. By providing only\nrelevant information, overfitting to a specific training layout is precluded\nand generalization to unseen environments is improved. We formulate three CROPs\nthat can be applied to fully observable observation- and action-spaces and\nprovide a methodical foundation. We empirically show the improvements of CROP in\na distributionally shifted safety gridworld. We furthermore provide benchmark\ncomparisons to full observability and data-augmentation in two different-sized\nprocedurally generated mazes.\n","authors":["Philipp Altmann","Fabian Ritz","Leonard Feuchtinger","Jonas Nüßlein","Claudia Linnhoff-Popien","Thomy Phan"],"pdf_url":"https://arxiv.org/pdf/2304.13616v2.pdf","comment":"9 pages, 5 figures, published at IJCAI 2023"},{"id":"http://arxiv.org/abs/2312.02739v1","updated":"2023-12-05T13:06:25Z","published":"2023-12-05T13:06:25Z","title":"LExCI: A Framework for Reinforcement Learning with Embedded Systems","summary":" Advances in artificial intelligence (AI) have led to its application in many\nareas of everyday life. In the context of control engineering, reinforcement\nlearning (RL) represents a particularly promising approach as it is centred\naround the idea of allowing an agent to freely interact with its environment to\nfind an optimal strategy. One of the challenges professionals face when\ntraining and deploying RL agents is that the latter often have to run on\ndedicated embedded devices. This could be to integrate them into an existing\ntoolchain or to satisfy certain performance criteria like real-time\nconstraints. Conventional RL libraries, however, cannot be easily utilised in\nconjunction with that kind of hardware. In this paper, we present a framework\nnamed LExCI, the Learning and Experiencing Cycle Interface, which bridges this\ngap and provides end-users with a free and open-source tool for training agents\non embedded systems using the open-source library RLlib. 
Its operability is\ndemonstrated with two state-of-the-art RL-algorithms and a rapid control\nprototyping system.\n","authors":["Kevin Badalian","Lucas Koch","Tobias Brinkmann","Mario Picerno","Marius Wegener","Sung-Yong Lee","Jakob Andert"],"pdf_url":"https://arxiv.org/pdf/2312.02739v1.pdf","comment":"The code, models, and data used for this work are available in a\n separate branch of LExCI's GitHub repository\n (https://github.com/mechatronics-RWTH/lexci-2/tree/lexci_paper). This paper\n has been submitted to Applied Intelligence\n (https://link.springer.com/journal/10489)"},{"id":"http://arxiv.org/abs/2211.15498v4","updated":"2023-12-05T13:05:05Z","published":"2022-11-28T16:17:47Z","title":"Physics-informed neural networks with unknown measurement noise","summary":" Physics-informed neural networks (PINNs) constitute a flexible approach to\nboth finding solutions and identifying parameters of partial differential\nequations. Most works on the topic assume noiseless data, or data contaminated\nwith weak Gaussian noise. We show that the standard PINN framework breaks down\nin case of non-Gaussian noise. We give a way of resolving this fundamental\nissue and we propose to jointly train an energy-based model (EBM) to learn the\ncorrect noise distribution. We illustrate the improved performance of our\napproach using multiple examples.\n","authors":["Philipp Pilar","Niklas Wahlström"],"pdf_url":"https://arxiv.org/pdf/2211.15498v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2107.01347v4","updated":"2023-12-05T13:00:33Z","published":"2021-07-03T05:12:03Z","title":"Traffic Signal Control with Communicative Deep Reinforcement Learning\n Agents: a Case Study","summary":" In this work we analyze Multi-Agent Advantage Actor-Critic (MA2C) a recently\nproposed multi-agent reinforcement learning algorithm that can be applied to\nadaptive traffic signal control (ATSC) problems. To evaluate its potential we\ncompare MA2C with Independent Advantage Actor-Critic (IA2C) and other\nReinforcement Learning or heuristic based algorithms. Specifically, we analyze\nMA2C theoretically with the framework provided by non-Markov decision\nprocesses, which allows a deeper insight of the algorithm, and we critically\nexamine the effectiveness and the robustness of the method by testing it in two\ntraffic areas located in Bologna (Italy) simulated in SUMO, a software modeling\ntool for ATSC problems. Our results indicate that MA2C, trained with\npseudo-random vehicle flows, is a promising technique able to outperform the\nalternative methods.\n","authors":["Paolo Fazzini","Isaac Wheeler","Francesco Petracchini"],"pdf_url":"https://arxiv.org/pdf/2107.01347v4.pdf","comment":"Obsolete version"},{"id":"http://arxiv.org/abs/2303.00438v2","updated":"2023-12-05T12:51:15Z","published":"2023-03-01T11:54:22Z","title":"A Framework for Neurosymbolic Robot Action Planning using Large Language\n Models","summary":" Symbolic task planning is a widely used approach to enforce robot autonomy\ndue to its ease of understanding and deployment. However, symbolic task\nplanning is difficult to scale in real-world when frequent re-planning is\nneeded, for example, due to human-robot interactions or unforeseen events. Plan\nlength and planning time can hinder the robot's efficiency and negatively\naffect the overall human-robot interaction's fluency. 
We present a framework,\nTeriyaki, designed to bridge the gap between symbolic task planning and machine\nlearning approaches, by training Large Language Models (LLMs), namely GPT-3,\ninto neurosymbolic task planners compatible with the Planning Domain Definition\nLanguage (PDDL). Potential benefits include: (i) better scalability in so far\nas the planning domain complexity increases, since LLMs' response time linearly\nscales with the combined length of the input and the output, instead of\nsuper-linearly as in the case of symbolic task planners, and (ii) the ability\nto synthesize a plan action-by-action instead of end-to-end, and to make each\naction available for execution as soon as it is generated, which in turn\nenables concurrent planning and execution. In the past year, significant\nefforts have been devoted by the research community to evaluate the overall\ncognitive abilities of LLMs, with alternate successes. Instead, with Teriyaki\nwe aim to provide an overall planning performance comparable to traditional\nplanners in specific planning domains, while leveraging LLMs' capabilities in\nother metrics which are used to build a look-ahead predictive planning model.\nPreliminary results in selected domains show that our method can: (i) solve\n95.5% of problems in a test data set of 1000 samples; (ii) produce plans up to\n13.5% shorter than a traditional symbolic planner; (iii) reduce average overall\nwaiting times for a plan availability by up to 61.4%.\n","authors":["Alessio Capitanelli","Fulvio Mastrogiovanni"],"pdf_url":"https://arxiv.org/pdf/2303.00438v2.pdf","comment":"Submitted to journal. 28 pages, 6 figures, 2 tables. Updated\n according to reviewers' comments. Previous title: A Framework to Generate\n Neurosymbolic PDDL-compliant Planners"},{"id":"http://arxiv.org/abs/2312.02730v1","updated":"2023-12-05T12:48:04Z","published":"2023-12-05T12:48:04Z","title":"Towards Measuring Representational Similarity of Large Language Models","summary":" Understanding the similarity of the numerous released large language models\n(LLMs) has many uses, e.g., simplifying model selection, detecting illegal\nmodel reuse, and advancing our understanding of what makes LLMs perform well.\nIn this work, we measure the similarity of representations of a set of LLMs\nwith 7B parameters. Our results suggest that some LLMs are substantially\ndifferent from others. We identify challenges of using representational\nsimilarity measures that suggest the need for careful study of similarity scores\nto avoid false conclusions.\n","authors":["Max Klabunde","Mehdi Ben Amor","Michael Granitzer","Florian Lemmerich"],"pdf_url":"https://arxiv.org/pdf/2312.02730v1.pdf","comment":"Extended abstract in UniReps Workshop @ NeurIPS 2023"},{"id":"http://arxiv.org/abs/2310.05063v3","updated":"2023-12-05T12:44:36Z","published":"2023-10-08T08:09:51Z","title":"Pushing the Limits of Pre-training for Time Series Forecasting in the\n CloudOps Domain","summary":" Time series has been left behind in the era of pre-training and transfer\nlearning. While research in the fields of natural language processing and\ncomputer vision are enjoying progressively larger datasets to train massive\nmodels, the most popular time series datasets consist of only tens of thousands\nof time steps, limiting our ability to study the effectiveness of pre-training\nand scaling. Recent studies have also cast doubt on the need for expressive\nmodels and scale. 
To alleviate these issues, we introduce three large-scale\ntime series forecasting datasets from the cloud operations (CloudOps) domain,\nthe largest having billions of observations, enabling further study into\npre-training and scaling of time series models. We build the empirical\ngroundwork for studying pre-training and scaling of time series models and pave\nthe way for future research by identifying a promising candidate architecture.\nWe show that it is a strong zero-shot baseline and benefits from further\nscaling, both in model and dataset size. Accompanying these datasets and\nresults is a suite of comprehensive benchmark results comparing classical and\ndeep learning baselines to our pre-trained method - achieving a 27% reduction\nin error on the largest dataset. Code and datasets can be found\nhttps://github.com/SalesforceAIResearch/pretrain-time-series-cloudops.\n","authors":["Gerald Woo","Chenghao Liu","Akshat Kumar","Doyen Sahoo"],"pdf_url":"https://arxiv.org/pdf/2310.05063v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02720v1","updated":"2023-12-05T12:34:51Z","published":"2023-12-05T12:34:51Z","title":"Towards the Inferrence of Structural Similarity of Combinatorial\n Landscapes","summary":" One of the most common problem-solving heuristics is by analogy. For a given\nproblem, a solver can be viewed as a strategic walk on its fitness landscape.\nThus if a solver works for one problem instance, we expect it will also be\neffective for other instances whose fitness landscapes essentially share\nstructural similarities with each other. However, due to the black-box nature\nof combinatorial optimization, it is far from trivial to infer such similarity\nin real-world scenarios. To bridge this gap, by using local optima network as a\nproxy of fitness landscapes, this paper proposed to leverage graph data mining\ntechniques to conduct qualitative and quantitative analyses to explore the\nlatent topological structural information embedded in those landscapes. By\nconducting large-scale empirical experiments on three classic combinatorial\noptimization problems, we gain concrete evidence to support the existence of\nstructural similarity between landscapes of the same classes within neighboring\ndimensions. We also interrogated the relationship between landscapes of\ndifferent problem classes.\n","authors":["Mingyu Huang","Ke Li"],"pdf_url":"https://arxiv.org/pdf/2312.02720v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.08272v3","updated":"2023-12-05T12:18:54Z","published":"2023-02-16T13:04:59Z","title":"Revisiting Hidden Representations in Transfer Learning for Medical\n Imaging","summary":" While a key component to the success of deep learning is the availability of\nmassive amounts of training data, medical image datasets are often limited in\ndiversity and size. Transfer learning has the potential to bridge the gap\nbetween related yet different domains. For medical applications, however, it\nremains unclear whether it is more beneficial to pre-train on natural or\nmedical images. We aim to shed light on this problem by comparing\ninitialization on ImageNet and RadImageNet on seven medical classification\ntasks. Our work includes a replication study, which yields results contrary to\npreviously published findings. In our experiments, ResNet50 models pre-trained\non ImageNet tend to outperform those trained on RadImageNet. 
To gain further\ninsights, we investigate the learned representations using Canonical\nCorrelation Analysis (CCA) and compare the predictions of the different models.\nOur results indicate that, contrary to intuition, ImageNet and RadImageNet may\nconverge to distinct intermediate representations, which appear to diverge\nfurther during fine-tuning. Despite these distinct representations, the\npredictions of the models remain similar. Our findings show that the similarity\nbetween networks before and after fine-tuning does not correlate with\nperformance gains, suggesting that the advantages of transfer learning might\nnot solely originate from the reuse of features in the early layers of a\nconvolutional neural network.\n","authors":["Dovile Juodelyte","Amelia Jiménez-Sánchez","Veronika Cheplygina"],"pdf_url":"https://arxiv.org/pdf/2302.08272v3.pdf","comment":"Published in TMLR"},{"id":"http://arxiv.org/abs/2312.02111v2","updated":"2023-12-05T12:18:25Z","published":"2023-12-04T18:43:45Z","title":"TriDeNT: Triple Deep Network Training for Privileged Knowledge\n Distillation in Histopathology","summary":" Computational pathology models rarely utilise data that will not be available\nfor inference. This means most models cannot learn from highly informative data\nsuch as additional immunohistochemical (IHC) stains and spatial\ntranscriptomics. We present TriDeNT, a novel self-supervised method for\nutilising privileged data that is not available during inference to improve\nperformance. We demonstrate the efficacy of this method for a range of\ndifferent paired data including immunohistochemistry, spatial transcriptomics\nand expert nuclei annotations. In all settings, TriDeNT outperforms other\nstate-of-the-art methods in downstream tasks, with observed improvements of up\nto 101%. Furthermore, we provide qualitative and quantitative measurements of\nthe features learned by these models and how they differ from baselines.\nTriDeNT offers a novel method to distil knowledge from scarce or costly data\nduring training, to create significantly better models for routine inputs.\n","authors":["Lucas Farndale","Robert Insall","Ke Yuan"],"pdf_url":"https://arxiv.org/pdf/2312.02111v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02708v1","updated":"2023-12-05T12:09:45Z","published":"2023-12-05T12:09:45Z","title":"(Provable) Adversarial Robustness for Group Equivariant Tasks: Graphs,\n Point Clouds, Molecules, and More","summary":" A machine learning model is traditionally considered robust if its prediction\nremains (almost) constant under input perturbations with small norm. However,\nreal-world tasks like molecular property prediction or point cloud segmentation\nhave inherent equivariances, such as rotation or permutation equivariance. In\nsuch tasks, even perturbations with large norm do not necessarily change an\ninput's semantic content. Furthermore, there are perturbations for which a\nmodel's prediction explicitly needs to change. For the first time, we propose a\nsound notion of adversarial robustness that accounts for task equivariance. We\nthen demonstrate that provable robustness can be achieved by (1) choosing a\nmodel that matches the task's equivariances (2) certifying traditional\nadversarial robustness. Certification methods are, however, unavailable for\nmany models, such as those with continuous equivariances. We close this gap by\ndeveloping the framework of equivariance-preserving randomized smoothing, which\nenables architecture-agnostic certification. 
We additionally derive the first\narchitecture-specific graph edit distance certificates, i.e. sound robustness\nguarantees for isomorphism equivariant tasks like node classification. Overall,\na sound notion of robustness is an important prerequisite for future work at\nthe intersection of robust and geometric machine learning.\n","authors":["Jan Schuchardt","Yan Scholten","Stephan Günnemann"],"pdf_url":"https://arxiv.org/pdf/2312.02708v1.pdf","comment":"Accepted at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2301.10250v2","updated":"2023-12-05T12:06:18Z","published":"2023-01-24T19:00:00Z","title":"Solving Inverse Physics Problems with Score Matching","summary":" We propose to solve inverse problems involving the temporal evolution of\nphysics systems by leveraging recent advances from diffusion models. Our method\nmoves the system's current state backward in time step by step by combining an\napproximate inverse physics simulator and a learned correction function. A\ncentral insight of our work is that training the learned correction with a\nsingle-step loss is equivalent to a score matching objective, while recursively\npredicting longer parts of the trajectory during training relates to maximum\nlikelihood training of a corresponding probability flow. We highlight the\nadvantages of our algorithm compared to standard denoising score matching and\nimplicit score matching, as well as fully learned baselines for a wide range of\ninverse physics problems. The resulting inverse solver has excellent accuracy\nand temporal stability and, in contrast to other learned inverse solvers,\nallows for sampling the posterior of the solutions.\n","authors":["Benjamin J. Holzschuh","Simona Vegetti","Nils Thuerey"],"pdf_url":"https://arxiv.org/pdf/2301.10250v2.pdf","comment":"Accepted at NeurIPS 2023; code available at\n https://github.com/tum-pbs/SMDP"},{"id":"http://arxiv.org/abs/2312.02696v1","updated":"2023-12-05T11:55:47Z","published":"2023-12-05T11:55:47Z","title":"Analyzing and Improving the Training Dynamics of Diffusion Models","summary":" Diffusion models currently dominate the field of data-driven image synthesis\nwith their unparalleled scaling to large datasets. In this paper, we identify\nand rectify several causes for uneven and ineffective training in the popular\nADM diffusion model architecture, without altering its high-level structure.\nObserving uncontrolled magnitude changes and imbalances in both the network\nactivations and weights over the course of training, we redesign the network\nlayers to preserve activation, weight, and update magnitudes on expectation. We\nfind that systematic application of this philosophy eliminates the observed\ndrifts and imbalances, resulting in considerably better networks at equal\ncomputational complexity. Our modifications improve the previous record FID of\n2.41 in ImageNet-512 synthesis to 1.81, achieved using fast deterministic\nsampling.\n As an independent contribution, we present a method for setting the\nexponential moving average (EMA) parameters post-hoc, i.e., after completing\nthe training run. 
This allows precise tuning of EMA length without the cost of\nperforming several training runs, and reveals its surprising interactions with\nnetwork architecture, training time, and guidance.\n","authors":["Tero Karras","Miika Aittala","Jaakko Lehtinen","Janne Hellsten","Timo Aila","Samuli Laine"],"pdf_url":"https://arxiv.org/pdf/2312.02696v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.13572v2","updated":"2023-12-05T11:53:43Z","published":"2023-10-20T15:10:16Z","title":"Unraveling the Enigma of Double Descent: An In-depth Analysis through\n the Lens of Learned Feature Space","summary":" Double descent presents a counter-intuitive aspect within the machine\nlearning domain, and researchers have observed its manifestation in various\nmodels and tasks. While some theoretical explanations have been proposed for\nthis phenomenon in specific contexts, an accepted theory to account for its\noccurrence in deep learning remains yet to be established. In this study, we\nrevisit the phenomenon of double descent and demonstrate that its occurrence is\nstrongly influenced by the presence of noisy data. Through conducting a\ncomprehensive analysis of the feature space of learned representations, we\nunveil that double descent arises in imperfect models trained with noisy data.\nWe argue that double descent is a consequence of the model first learning the\nnoisy data until interpolation and then adding implicit regularization via\nover-parameterization acquiring therefore capability to separate the\ninformation from the noise.\n","authors":["Yufei Gu","Xiaoqing Zheng","Tomaso Aste"],"pdf_url":"https://arxiv.org/pdf/2310.13572v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.14768v2","updated":"2023-12-05T11:43:54Z","published":"2023-10-23T10:12:23Z","title":"Policy Gradient with Kernel Quadrature","summary":" Reward evaluation of episodes becomes a bottleneck in a broad range of\nreinforcement learning tasks. Our aim in this paper is to select a small but\nrepresentative subset of a large batch of episodes, only on which we actually\ncompute rewards for more efficient policy gradient iterations. We build a\nGaussian process modeling of discounted returns or rewards to derive a positive\ndefinite kernel on the space of episodes, run an ``episodic\" kernel quadrature\nmethod to compress the information of sample episodes, and pass the reduced\nepisodes to the policy network for gradient updates. We present the theoretical\nbackground of this procedure as well as its numerical illustrations in MuJoCo\ntasks.\n","authors":["Satoshi Hayakawa","Tetsuro Morimura"],"pdf_url":"https://arxiv.org/pdf/2310.14768v2.pdf","comment":"18 pages, 2 figures"},{"id":"http://arxiv.org/abs/2312.02684v1","updated":"2023-12-05T11:40:41Z","published":"2023-12-05T11:40:41Z","title":"DeepPointMap: Advancing LiDAR SLAM with Unified Neural Descriptors","summary":" Point clouds have shown significant potential in various domains, including\nSimultaneous Localization and Mapping (SLAM). However, existing approaches\neither rely on dense point clouds to achieve high localization accuracy or use\ngeneralized descriptors to reduce map size. Unfortunately, these two aspects\nseem to conflict with each other. To address this limitation, we propose a\nunified architecture, DeepPointMap, achieving excellent preference on both\naspects. 
We utilize neural network to extract highly representative and sparse\nneural descriptors from point clouds, enabling memory-efficient map\nrepresentation and accurate multi-scale localization tasks (e.g., odometry and\nloop-closure). Moreover, we showcase the versatility of our framework by\nextending it to more challenging multi-agent collaborative SLAM. The promising\nresults obtained in these scenarios further emphasize the effectiveness and\npotential of our approach.\n","authors":["Xiaze Zhang","Ziheng Ding","Qi Jing","Yuejie Zhang","Wenchao Ding","Rui Feng"],"pdf_url":"https://arxiv.org/pdf/2312.02684v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02683v1","updated":"2023-12-05T11:40:38Z","published":"2023-12-05T11:40:38Z","title":"Diffusion-Based Speech Enhancement in Matched and Mismatched Conditions\n Using a Heun-Based Sampler","summary":" Diffusion models are a new class of generative models that have recently been\napplied to speech enhancement successfully. Previous works have demonstrated\ntheir superior performance in mismatched conditions compared to state-of-the\nart discriminative models. However, this was investigated with a single\ndatabase for training and another one for testing, which makes the results\nhighly dependent on the particular databases. Moreover, recent developments\nfrom the image generation literature remain largely unexplored for speech\nenhancement. These include several design aspects of diffusion models, such as\nthe noise schedule or the reverse sampler. In this work, we systematically\nassess the generalization performance of a diffusion-based speech enhancement\nmodel by using multiple speech, noise and binaural room impulse response (BRIR)\ndatabases to simulate mismatched acoustic conditions. We also experiment with a\nnoise schedule and a sampler that have not been applied to speech enhancement\nbefore. We show that the proposed system substantially benefits from using\nmultiple databases for training, and achieves superior performance compared to\nstate-of-the-art discriminative models in both matched and mismatched\nconditions. We also show that a Heun-based sampler achieves superior\nperformance at a smaller computational cost compared to a sampler commonly used\nfor speech enhancement.\n","authors":["Philippe Gonzalez","Zheng-Hua Tan","Jan Østergaard","Jesper Jensen","Tommy Sonne Alstrøm","Tobias May"],"pdf_url":"https://arxiv.org/pdf/2312.02683v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02682v1","updated":"2023-12-05T11:40:24Z","published":"2023-12-05T11:40:24Z","title":"H-GAP: Humanoid Control with a Generalist Planner","summary":" Humanoid control is an important research challenge offering avenues for\nintegration into human-centric infrastructures and enabling physics-driven\nhumanoid animations. The daunting challenges in this field stem from the\ndifficulty of optimizing in high-dimensional action spaces and the instability\nintroduced by the bipedal morphology of humanoids. However, the extensive\ncollection of human motion-captured data and the derived datasets of humanoid\ntrajectories, such as MoCapAct, paves the way to tackle these challenges. In\nthis context, we present Humanoid Generalist Autoencoding Planner (H-GAP), a\nstate-action trajectory generative model trained on humanoid trajectories\nderived from human motion-captured data, capable of adeptly handling downstream\ncontrol tasks with Model Predictive Control (MPC). 
For 56 degrees of freedom\nhumanoid, we empirically demonstrate that H-GAP learns to represent and\ngenerate a wide range of motor behaviours. Further, without any learning from\nonline interactions, it can also flexibly transfer these behaviors to solve\nnovel downstream control tasks via planning. Notably, H-GAP excels established\nMPC baselines that have access to the ground truth dynamics model, and is\nsuperior or comparable to offline RL methods trained for individual tasks.\nFinally, we do a series of empirical studies on the scaling properties of\nH-GAP, showing the potential for performance gains via additional data but not\ncomputing. Code and videos are available at\nhttps://ycxuyingchen.github.io/hgap/.\n","authors":["Zhengyao Jiang","Yingchen Xu","Nolan Wagener","Yicheng Luo","Michael Janner","Edward Grefenstette","Tim Rocktäschel","Yuandong Tian"],"pdf_url":"https://arxiv.org/pdf/2312.02682v1.pdf","comment":"18 pages including appendix, 4 figures"},{"id":"http://arxiv.org/abs/2312.02674v1","updated":"2023-12-05T11:29:54Z","published":"2023-12-05T11:29:54Z","title":"Amortized Bayesian Decision Making for simulation-based models","summary":" Simulation-based inference (SBI) provides a powerful framework for inferring\nposterior distributions of stochastic simulators in a wide range of domains. In\nmany settings, however, the posterior distribution is not the end goal itself\n-- rather, the derived parameter values and their uncertainties are used as a\nbasis for deciding what actions to take. Unfortunately, because posterior\ndistributions provided by SBI are (potentially crude) approximations of the\ntrue posterior, the resulting decisions can be suboptimal. Here, we address the\nquestion of how to perform Bayesian decision making on stochastic simulators,\nand how one can circumvent the need to compute an explicit approximation to the\nposterior. Our method trains a neural network on simulated data and can predict\nthe expected cost given any data and action, and can, thus, be directly used to\ninfer the action with lowest cost. We apply our method to several benchmark\nproblems and demonstrate that it induces similar cost as the true posterior\ndistribution. We then apply the method to infer optimal actions in a real-world\nsimulator in the medical neurosciences, the Bayesian Virtual Epileptic Patient,\nand demonstrate that it allows to infer actions associated with low cost after\nfew simulations.\n","authors":["Mila Gorecki","Jakob H. Macke","Michael Deistler"],"pdf_url":"https://arxiv.org/pdf/2312.02674v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.00908v2","updated":"2023-12-05T11:26:19Z","published":"2023-07-03T10:12:34Z","title":"Quantum Machine Learning on Near-Term Quantum Devices: Current State of\n Supervised and Unsupervised Techniques for Real-World Applications","summary":" The past decade has witnessed significant advancements in quantum hardware,\nencompassing improvements in speed, qubit quantity, and quantum volume-a metric\ndefining the maximum size of a quantum circuit effectively implementable on\nnear-term quantum devices. This progress has led to a surge in Quantum Machine\nLearning (QML) applications on real hardware, aiming to achieve quantum\nadvantage over classical approaches. This survey focuses on selected supervised\nand unsupervised learning applications executed on quantum hardware,\nspecifically tailored for real-world scenarios. 
The exploration includes a\nthorough analysis of current QML implementation limitations on quantum\nhardware, covering techniques like encoding, ansatz structure, error\nmitigation, and gradient methods to address these challenges. Furthermore, the\nsurvey evaluates the performance of QML implementations in comparison to\nclassical counterparts. In conclusion, we discuss existing bottlenecks related\nto applying QML on real quantum devices and propose potential solutions to\novercome these challenges in the future.\n","authors":["Yaswitha Gujju","Atsushi Matsuo","Rudy Raymond"],"pdf_url":"https://arxiv.org/pdf/2307.00908v2.pdf","comment":"40 pages, 8 figures"},{"id":"http://arxiv.org/abs/2312.02671v1","updated":"2023-12-05T11:26:02Z","published":"2023-12-05T11:26:02Z","title":"Learning a Sparse Representation of Barron Functions with the Inverse\n Scale Space Flow","summary":" This paper presents a method for finding a sparse representation of Barron\nfunctions. Specifically, given an $L^2$ function $f$, the inverse scale space\nflow is used to find a sparse measure $\\mu$ minimising the $L^2$ loss between\nthe Barron function associated to the measure $\\mu$ and the function $f$. The\nconvergence properties of this method are analysed in an ideal setting and in\nthe cases of measurement noise and sampling bias. In an ideal setting the\nobjective decreases strictly monotone in time to a minimizer with\n$\\mathcal{O}(1/t)$, and in the case of measurement noise or sampling bias the\noptimum is achieved up to a multiplicative or additive constant. This\nconvergence is preserved on discretization of the parameter space, and the\nminimizers on increasingly fine discretizations converge to the optimum on the\nfull parameter space.\n","authors":["Tjeerd Jan Heeringa","Tim Roith","Christoph Brune","Martin Burger"],"pdf_url":"https://arxiv.org/pdf/2312.02671v1.pdf","comment":"30 pages, 0 figures"},{"id":"http://arxiv.org/abs/2312.02665v1","updated":"2023-12-05T11:10:05Z","published":"2023-12-05T11:10:05Z","title":"Lights out: training RL agents robust to temporary blindness","summary":" Agents trained with DQN rely on an observation at each timestep to decide\nwhat action to take next. However, in real world applications observations can\nchange or be missing entirely. Examples of this could be a light bulb breaking\ndown, or the wallpaper in a certain room changing. While these situations\nchange the actual observation, the underlying optimal policy does not change.\nBecause of this we want our agent to continue taking actions until it receives\na (recognized) observation again. To achieve this we introduce a combination of\na neural network architecture that uses hidden representations of the\nobservations and a novel n-step loss function. Our implementation is able to\nwithstand location based blindness stretches longer than the ones it was\ntrained on, and therefore shows robustness to temporary blindness. For access\nto our implementation, please email Nathan, Marije, or Pau.\n","authors":["N. Ordonez","M. Tromp","P. M. Julbe","W. 
Böhmer"],"pdf_url":"https://arxiv.org/pdf/2312.02665v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00886v2","updated":"2023-12-05T11:05:06Z","published":"2023-12-01T19:26:23Z","title":"Nash Learning from Human Feedback","summary":" Reinforcement learning from human feedback (RLHF) has emerged as the main\nparadigm for aligning large language models (LLMs) with human preferences.\nTypically, RLHF involves the initial step of learning a reward model from human\nfeedback, often expressed as preferences between pairs of text generations\nproduced by a pre-trained LLM. Subsequently, the LLM's policy is fine-tuned by\noptimizing it to maximize the reward model through a reinforcement learning\nalgorithm. However, an inherent limitation of current reward models is their\ninability to fully represent the richness of human preferences and their\ndependency on the sampling distribution.\n In this study, we introduce an alternative pipeline for the fine-tuning of\nLLMs using pairwise human feedback. Our approach entails the initial learning\nof a preference model, which is conditioned on two inputs given a prompt,\nfollowed by the pursuit of a policy that consistently generates responses\npreferred over those generated by any competing policy, thus defining the Nash\nequilibrium of this preference model. We term this approach Nash learning from\nhuman feedback (NLHF).\n In the context of a tabular policy representation, we present a novel\nalgorithmic solution, Nash-MD, founded on the principles of mirror descent.\nThis algorithm produces a sequence of policies, with the last iteration\nconverging to the regularized Nash equilibrium. Additionally, we explore\nparametric representations of policies and introduce gradient descent\nalgorithms for deep-learning architectures. To demonstrate the effectiveness of\nour approach, we present experimental results involving the fine-tuning of a\nLLM for a text summarization task. We believe NLHF offers a compelling avenue\nfor preference learning and policy optimization with the potential of advancing\nthe field of aligning LLMs with human preferences.\n","authors":["Rémi Munos","Michal Valko","Daniele Calandriello","Mohammad Gheshlaghi Azar","Mark Rowland","Zhaohan Daniel Guo","Yunhao Tang","Matthieu Geist","Thomas Mesnard","Andrea Michi","Marco Selvi","Sertan Girgin","Nikola Momchev","Olivier Bachem","Daniel J. Mankowitz","Doina Precup","Bilal Piot"],"pdf_url":"https://arxiv.org/pdf/2312.00886v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02661v1","updated":"2023-12-05T10:56:25Z","published":"2023-12-05T10:56:25Z","title":"A Self-Commissioning Edge Computing Method for Data-Driven Anomaly\n Detection in Power Electronic Systems","summary":" Ensuring the reliability of power electronic converters is a matter of great\nimportance, and data-driven condition monitoring techniques are cementing\nthemselves as an important tool for this purpose. However, translating methods\nthat work well in controlled lab environments to field applications presents\nsignificant challenges, notably because of the limited diversity and accuracy\nof the lab training data. By enabling the use of field data, online machine\nlearning can be a powerful tool to overcome this problem, but it introduces\nadditional challenges in ensuring the stability and predictability of the\ntraining processes. 
This work presents an edge computing method that mitigates\nthese shortcomings with minimal additional memory usage, by employing an\nautonomous algorithm that prioritizes the storage of training samples with\nlarger prediction errors. The method is demonstrated on the use case of a\nself-commissioning condition monitoring system, in the form of a thermal\nanomaly detection scheme for a variable frequency motor drive, where the\nalgorithm self-learned to distinguish normal and anomalous operation with\nminimal prior knowledge. The obtained results, based on experimental data, show\na significant improvement in prediction accuracy and training speed, when\ncompared to equivalent models trained online without the proposed data\nselection process.\n","authors":["Pere Izquierdo Gomez","Miguel E. Lopez Gajardo","Nenad Mijatovic","Tomislav Dragicevic"],"pdf_url":"https://arxiv.org/pdf/2312.02661v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02658v1","updated":"2023-12-05T10:52:33Z","published":"2023-12-05T10:52:33Z","title":"Do AI models produce better weather forecasts than physics-based models?\n A quantitative evaluation case study of Storm Ciarán","summary":" There has been huge recent interest in the potential of making operational\nweather forecasts using machine learning techniques. As they become a part of\nthe weather forecasting toolbox, there is a pressing need to understand how\nwell current machine learning models can simulate high-impactweather events. We\ncompare forecasts of Storm Ciar\\'an, a European windstorm that caused sixteen\ndeaths and extensive damage in Northern Europe, made by machine learning and\nnumericalweather prediction models. The four machine learning models considered\n(FourCastNet, Pangu-Weather, GraphCast and FourCastNet-v2) produce forecasts\nthat accurately capture the synoptic-scale structure of the cyclone including\nthe position of the cloud head, shape of the warm sector and location of warm\nconveyor belt jet, and the large-scale dynamical drivers important for the\nrapid storm development such as the position of the storm relative to the\nupper-level jet exit. However, their ability to resolve the more detailed\nstructures important for issuing weather warnings is more mixed. All of the\nmachine learning models underestimate the peak amplitude of winds associated\nwith the storm, only some machine learning models resolve the warm core\nseclusion and none of the machine learning models capture the sharp bent-back\nwarm frontal gradient. Our study shows there is a great deal about the\nperformance and properties of machine learning weather forecasts that can be\nderived from case studies of high-impact weather events such as Storm Ciar\\'an.\n","authors":["Andrew J. Charlton-Perez","Helen F. Dacre","Simon Driscoll","Suzanne L. Gray","Ben Harvey","Natalie J. Harvey","Kieran M. R. Hunt","Robert W. Lee","Ranjini Swaminathan","Remy Vandaele","Ambrogio Volonté"],"pdf_url":"https://arxiv.org/pdf/2312.02658v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.04026v4","updated":"2023-12-05T10:47:31Z","published":"2023-06-06T21:41:31Z","title":"Value Functions are Control Barrier Functions: Verification of Safe\n Policies using Control Theory","summary":" Guaranteeing safe behaviour of reinforcement learning (RL) policies poses\nsignificant challenges for safety-critical applications, despite RL's\ngenerality and scalability. To address this, we propose a new approach to apply\nverification methods from control theory to learned value functions. 
By\nanalyzing task structures for safety preservation, we formalize original\ntheorems that establish links between value functions and control barrier\nfunctions. Further, we propose novel metrics for verifying value functions in\nsafe control tasks and practical implementation details to improve learning.\nOur work presents a novel method for certificate learning, which unlocks a\ndiversity of verification techniques from control theory for RL policies, and\nmarks a significant step towards a formal framework for the general, scalable,\nand verifiable design of RL-based control systems. Code and videos are\navailable at this https url: https://rl-cbf.github.io/\n","authors":["Daniel C. H. Tan","Fernando Acero","Robert McCarthy","Dimitrios Kanoulas","Zhibin Li"],"pdf_url":"https://arxiv.org/pdf/2306.04026v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02652v1","updated":"2023-12-05T10:46:16Z","published":"2023-12-05T10:46:16Z","title":"What Machine Learning Can Do for Focusing Aerogel Detectors","summary":" Particle identification at the Super Charm-Tau factory experiment will be\nprovided by a Focusing Aerogel Ring Imaging CHerenkov detector (FARICH). The\nspecifics of detector location make proper cooling difficult, therefore a\nsignificant number of ambient background hits are captured. They must be\nmitigated to reduce the data flow and improve particle velocity resolution. In\nthis work we present several approaches to filtering signal hits, inspired by\nmachine learning techniques from computer vision.\n","authors":["Foma Shipilov","Alexander Barnyakov","Vladimir Bobrovnikov","Sergey Kononov","Fedor Ratnikov"],"pdf_url":"https://arxiv.org/pdf/2312.02652v1.pdf","comment":"5 pages, 4 figures, to be published in 26th International Conference\n on Computing in High Energy & Nuclear Physics (CHEP2023) proceedings"},{"id":"http://arxiv.org/abs/2312.02649v1","updated":"2023-12-05T10:40:48Z","published":"2023-12-05T10:40:48Z","title":"A Q-learning approach to the continuous control problem of robot\n inverted pendulum balancing","summary":" This study evaluates the application of a discrete action space reinforcement\nlearning method (Q-learning) to the continuous control problem of robot\ninverted pendulum balancing. To speed up the learning process and to overcome\ntechnical difficulties related to the direct learning on the real robotic\nsystem, the learning phase is performed in simulation environment. A\nmathematical model of the system dynamics is implemented, deduced by curve\nfitting on data acquired from the real system. The proposed approach\ndemonstrated feasible, featuring its application on a real world robot that\nlearned to balance an inverted pendulum. 
This study also reinforces and\ndemonstrates the importance of an accurate representation of the physical world\nin simulation to achieve a more efficient implementation of reinforcement\nlearning algorithms in real world, even when using a discrete action space\nalgorithm to control a continuous action.\n","authors":["Mohammad Safeea","Pedro Neto"],"pdf_url":"https://arxiv.org/pdf/2312.02649v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02646v1","updated":"2023-12-05T10:37:54Z","published":"2023-12-05T10:37:54Z","title":"SAMSGL: Series-Aligned Multi-Scale Graph Learning for Spatio-Temporal\n Forecasting","summary":" Spatio-temporal forecasting in various domains, like traffic prediction and\nweather forecasting, is a challenging endeavor, primarily due to the\ndifficulties in modeling propagation dynamics and capturing high-dimensional\ninteractions among nodes. Despite the significant strides made by graph-based\nnetworks in spatio-temporal forecasting, there remain two pivotal factors\nclosely related to forecasting performance that need further consideration:\ntime delays in propagation dynamics and multi-scale high-dimensional\ninteractions. In this work, we present a Series-Aligned Multi-Scale Graph\nLearning (SAMSGL) framework, aiming to enhance forecasting performance. In\norder to handle time delays in spatial interactions, we propose a\nseries-aligned graph convolution layer to facilitate the aggregation of\nnon-delayed graph signals, thereby mitigating the influence of time delays for\nthe improvement in accuracy. To understand global and local spatio-temporal\ninteractions, we develop a spatio-temporal architecture via multi-scale graph\nlearning, which encompasses two essential components: multi-scale graph\nstructure learning and graph-fully connected (Graph-FC) blocks. The multi-scale\ngraph structure learning includes a global graph structure to learn both\ndelayed and non-delayed node embeddings, as well as a local one to learn node\nvariations influenced by neighboring factors. The Graph-FC blocks\nsynergistically fuse spatial and temporal information to boost prediction\naccuracy. To evaluate the performance of SAMSGL, we conduct experiments on\nmeteorological and traffic forecasting datasets, which demonstrate its\neffectiveness and superiority.\n","authors":["Xiaobei Zou","Luolin Xiong","Yang Tang","Jurgen Kurths"],"pdf_url":"https://arxiv.org/pdf/2312.02646v1.pdf","comment":"13 pages, 7figures"},{"id":"http://arxiv.org/abs/2311.00613v2","updated":"2023-12-05T10:32:03Z","published":"2023-11-01T16:01:01Z","title":"Controllable Music Production with Diffusion Models and Guidance\n Gradients","summary":" We demonstrate how conditional generation from diffusion models can be used\nto tackle a variety of realistic tasks in the production of music in 44.1kHz\nstereo audio with sampling-time guidance. The scenarios we consider include\ncontinuation, inpainting and regeneration of musical audio, the creation of\nsmooth transitions between two different music tracks, and the transfer of\ndesired stylistic characteristics to existing audio clips. We achieve this by\napplying guidance at sampling time in a simple framework that supports both\nreconstruction and classification losses, or any combination of the two. This\napproach ensures that generated audio can match its surrounding context, or\nconform to a class distribution or latent representation specified relative to\nany suitable pre-trained classifier or embedding model. 
Audio samples are\navailable at https://machinelearning.apple.com/research/controllable-music\n","authors":["Mark Levy","Bruno Di Giorgi","Floris Weers","Angelos Katharopoulos","Tom Nickson"],"pdf_url":"https://arxiv.org/pdf/2311.00613v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.20204v2","updated":"2023-12-05T10:20:11Z","published":"2023-10-31T06:04:18Z","title":"General-Purpose Retrieval-Enhanced Medical Prediction Model Using\n Near-Infinite History","summary":" Developing clinical prediction models (e.g., mortality prediction) based on\nelectronic health records (EHRs) typically relies on expert opinion for feature\nselection and adjusting observation window size. This burdens experts and\ncreates a bottleneck in the development process. We propose Retrieval-Enhanced\nMedical prediction model (REMed) to address such challenges. REMed can\nessentially evaluate an unlimited number of clinical events, select the\nrelevant ones, and make predictions. This approach effectively eliminates the\nneed for manual feature selection and enables an unrestricted observation\nwindow. We verified these properties through experiments on 27 clinical tasks\nand two independent cohorts from publicly available EHR datasets, where REMed\noutperformed other contemporary architectures that aim to handle as many events\nas possible. Notably, we found that the preferences of REMed align closely with\nthose of medical experts. We expect our approach to significantly expedite the\ndevelopment of EHR prediction models by minimizing clinicians' need for manual\ninvolvement.\n","authors":["Junu Kim","Chaeeun Shim","Bosco Seong Kyu Yang","Chami Im","Sung Yoon Lim","Han-Gil Jeong","Edward Choi"],"pdf_url":"https://arxiv.org/pdf/2310.20204v2.pdf","comment":"The source codes corresponding to this paper are available at:\n https://github.com/starmpcc/REMed"},{"id":"http://arxiv.org/abs/2312.02622v1","updated":"2023-12-05T09:55:49Z","published":"2023-12-05T09:55:49Z","title":"On the Initialization of Graph Neural Networks","summary":" Graph Neural Networks (GNNs) have displayed considerable promise in graph\nrepresentation learning across various applications. The core learning process\nrequires the initialization of model weight matrices within each GNN layer,\nwhich is typically accomplished via classic initialization methods such as\nXavier initialization. However, these methods were originally motivated to\nstabilize the variance of hidden embeddings and gradients across layers of\nFeedforward Neural Networks (FNNs) and Convolutional Neural Networks (CNNs) to\navoid vanishing gradients and maintain steady information flow. In contrast,\nwithin the GNN context classical initializations disregard the impact of the\ninput graph structure and message passing on variance. In this paper, we\nanalyze the variance of forward and backward propagation across GNN layers and\nshow that the variance instability of GNN initializations comes from the\ncombined effect of the activation function, hidden dimension, graph structure\nand message passing. To better account for these influence factors, we propose\na new initialization method for Variance Instability Reduction within GNN\nOptimization (Virgo), which naturally tends to equate forward and backward\nvariances across successive layers. We conduct comprehensive experiments on 15\ndatasets to show that Virgo can lead to superior model performance and more\nstable variance at initialization on node classification, link prediction and\ngraph classification tasks. 
Codes are in\nhttps://github.com/LspongebobJH/virgo_icml2023.\n","authors":["Jiahang Li","Yakun Song","Xiang Song","David Paul Wipf"],"pdf_url":"https://arxiv.org/pdf/2312.02622v1.pdf","comment":"Accepted by ICML 2023"},{"id":"http://arxiv.org/abs/2310.18428v2","updated":"2023-12-05T09:50:00Z","published":"2023-10-27T18:59:31Z","title":"The Bayesian Stability Zoo","summary":" We show that many definitions of stability found in the learning theory\nliterature are equivalent to one another. We distinguish between two families\nof definitions of stability: distribution-dependent and\ndistribution-independent Bayesian stability. Within each family, we establish\nequivalences between various definitions, encompassing approximate differential\nprivacy, pure differential privacy, replicability, global stability, perfect\ngeneralization, TV stability, mutual information stability, KL-divergence\nstability, and R\\'enyi-divergence stability. Along the way, we prove boosting\nresults that enable the amplification of the stability of a learning rule. This\nwork is a step towards a more systematic taxonomy of stability notions in\nlearning theory, which can promote clarity and an improved understanding of an\narray of stability concepts that have emerged in recent years.\n","authors":["Shay Moran","Hilla Schefler","Jonathan Shafer"],"pdf_url":"https://arxiv.org/pdf/2310.18428v2.pdf","comment":"v2, minor typo fix"},{"id":"http://arxiv.org/abs/2312.02619v1","updated":"2023-12-05T09:49:50Z","published":"2023-12-05T09:49:50Z","title":"Rethinking and Simplifying Bootstrapped Graph Latents","summary":" Graph contrastive learning (GCL) has emerged as a representative paradigm in\ngraph self-supervised learning, where negative samples are commonly regarded as\nthe key to preventing model collapse and producing distinguishable\nrepresentations. Recent studies have shown that GCL without negative samples\ncan achieve state-of-the-art performance as well as scalability improvement,\nwith bootstrapped graph latent (BGRL) as a prominent step forward. However,\nBGRL relies on a complex architecture to maintain the ability to scatter\nrepresentations, and the underlying mechanisms enabling the success remain\nlargely unexplored. In this paper, we introduce an instance-level decorrelation\nperspective to tackle the aforementioned issue and leverage it as a springboard\nto reveal the potential unnecessary model complexity within BGRL. Based on our\nfindings, we present SGCL, a simple yet effective GCL framework that utilizes\nthe outputs from two consecutive iterations as positive pairs, eliminating the\nnegative samples. SGCL only requires a single graph augmentation and a single\ngraph encoder without additional parameters. Extensive experiments conducted on\nvarious graph benchmarks demonstrate that SGCL can achieve competitive\nperformance with fewer parameters, lower time and space costs, and significant\nconvergence speedup.\n","authors":["Wangbin Sun","Jintang Li","Liang Chen","Bingzhe Wu","Yatao Bian","Zibin Zheng"],"pdf_url":"https://arxiv.org/pdf/2312.02619v1.pdf","comment":"Accepted by WSDM 2024"},{"id":"http://arxiv.org/abs/2312.02615v1","updated":"2023-12-05T09:44:47Z","published":"2023-12-05T09:44:47Z","title":"Projection Regret: Reducing Background Bias for Novelty Detection via\n Diffusion Models","summary":" Novelty detection is a fundamental task of machine learning which aims to\ndetect abnormal ($\\textit{i.e.}$ out-of-distribution (OOD)) samples. 
Since\ndiffusion models have recently emerged as the de facto standard generative\nframework with surprising generation results, novelty detection via diffusion\nmodels has also gained much attention. Recent methods have mainly utilized the\nreconstruction property of in-distribution samples. However, they often suffer\nfrom detecting OOD samples that share similar background information to the\nin-distribution data. Based on our observation that diffusion models can\n\\emph{project} any sample to an in-distribution sample with similar background\ninformation, we propose \\emph{Projection Regret (PR)}, an efficient novelty\ndetection method that mitigates the bias of non-semantic information. To be\nspecific, PR computes the perceptual distance between the test image and its\ndiffusion-based projection to detect abnormality. Since the perceptual distance\noften fails to capture semantic changes when the background information is\ndominant, we cancel out the background bias by comparing it against recursive\nprojections. Extensive experiments demonstrate that PR outperforms the prior\nart of generative-model-based novelty detection methods by a significant\nmargin.\n","authors":["Sungik Choi","Hankook Lee","Honglak Lee","Moontae Lee"],"pdf_url":"https://arxiv.org/pdf/2312.02615v1.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.02614v1","updated":"2023-12-05T09:44:45Z","published":"2023-12-05T09:44:45Z","title":"Prompt Optimization via Adversarial In-Context Learning","summary":" We propose a new method, Adversarial In-Context Learning (adv-ICL), to\noptimize prompt for in-context learning (ICL) by employing one LLM as a\ngenerator, another as a discriminator, and a third as a prompt modifier. As in\ntraditional adversarial learning, adv-ICL is implemented as a two-player game\nbetween the generator and discriminator, where the generator tries to generate\nrealistic enough output to fool the discriminator. In each round, given an\ninput prefixed by task instructions and several exemplars, the generator\nproduces an output. The discriminator is then tasked with classifying the\ngenerator input-output pair as model-generated or real data. Based on the\ndiscriminator loss, the prompt modifier proposes possible edits to the\ngenerator and discriminator prompts, and the edits that most improve the\nadversarial loss are selected. We show that adv-ICL results in significant\nimprovements over state-of-the-art prompt optimization techniques for both open\nand closed-source models on 11 generation and classification tasks including\nsummarization, arithmetic reasoning, machine translation, data-to-text\ngeneration, and the MMLU and big-bench hard benchmarks. In addition, because\nour method uses pre-trained models and updates only prompts rather than model\nparameters, it is computationally efficient, easy to extend to any LLM and\ntask, and effective in low-resource settings.\n","authors":["Xuan Long Do","Yiran Zhao","Hannah Brown","Yuxi Xie","James Xu Zhao","Nancy F. Chen","Kenji Kawaguchi","Michael Qizhe Xie","Junxian He"],"pdf_url":"https://arxiv.org/pdf/2312.02614v1.pdf","comment":"21 pages"},{"id":"http://arxiv.org/abs/2312.02611v1","updated":"2023-12-05T09:39:04Z","published":"2023-12-05T09:39:04Z","title":"Privacy-Aware Data Acquisition under Data Similarity in Regression\n Markets","summary":" Data markets facilitate decentralized data exchange for applications such as\nprediction, learning, or inference. 
The design of these markets is challenged\nby varying privacy preferences as well as data similarity among data owners.\nRelated works have often overlooked how data similarity impacts pricing and\ndata value through statistical information leakage. We demonstrate that data\nsimilarity and privacy preferences are integral to market design and propose a\nquery-response protocol using local differential privacy for a two-party data\nacquisition mechanism. In our regression data market model, we analyze\nstrategic interactions between privacy-aware owners and the learner as a\nStackelberg game over the asked price and privacy factor. Finally, we\nnumerically evaluate how data similarity affects market participation and\ntraded data value.\n","authors":["Shashi Raj Pandey","Pierre Pinson","Petar Popovski"],"pdf_url":"https://arxiv.org/pdf/2312.02611v1.pdf","comment":"Submitted to IEEE Transactions on Neural Networks and Learning\n Systems (submission version)"},{"id":"http://arxiv.org/abs/2311.17431v4","updated":"2023-12-05T09:35:03Z","published":"2023-11-29T08:21:42Z","title":"Grounding Foundation Models through Federated Transfer Learning: A\n General Framework","summary":" Foundation Models (FMs) such as GPT-4 encoded with vast knowledge and\npowerful emergent abilities have achieved remarkable success in various natural\nlanguage processing and computer vision tasks. Grounding FMs by adapting them\nto domain-specific tasks or augmenting them with domain-specific knowledge\nenables us to exploit the full potential of FMs. However, grounding FMs faces\nseveral challenges, stemming primarily from constrained computing resources,\ndata privacy, model heterogeneity, and model ownership. Federated Transfer\nLearning (FTL), the combination of federated learning and transfer learning,\nprovides promising solutions to address these challenges. In recent years, the\nneed for grounding FMs leveraging FTL, coined FTL-FM, has arisen strongly in\nboth academia and industry. Motivated by the strong growth in FTL-FM research\nand the potential impact of FTL-FM on industrial applications, we propose an\nFTL-FM framework that formulates problems of grounding FMs in the federated\nlearning setting, construct a detailed taxonomy based on the FTL-FM framework\nto categorize state-of-the-art FTL-FM works, and comprehensively overview\nFTL-FM works based on the proposed taxonomy. We also establish correspondences\nbetween FTL-FM and conventional phases of adapting FM so that FM practitioners\ncan align their research works with FTL-FM. In addition, we overview advanced\nefficiency-improving and privacy-preserving techniques because efficiency and\nprivacy are critical concerns in FTL-FM. Last, we discuss opportunities and\nfuture research directions of FTL-FM.\n","authors":["Yan Kang","Tao Fan","Hanlin Gu","Lixin Fan","Qiang Yang"],"pdf_url":"https://arxiv.org/pdf/2311.17431v4.pdf","comment":"In progress. fixed some typos, errors, and revised the text a little\n bit"},{"id":"http://arxiv.org/abs/2312.02608v1","updated":"2023-12-05T09:34:56Z","published":"2023-12-05T09:34:56Z","title":"Panoptica -- instance-wise evaluation of 3D semantic and instance\n segmentation maps","summary":" This paper introduces panoptica, a versatile and performance-optimized\npackage designed for computing instance-wise segmentation quality metrics from\n2D and 3D segmentation maps. 
panoptica addresses the limitations of existing\nmetrics and provides a modular framework that complements the original\nintersection over union-based panoptic quality with other metrics, such as the\ndistance metric Average Symmetric Surface Distance. The package is open-source,\nimplemented in Python, and accompanied by comprehensive documentation and\ntutorials. panoptica employs a three-step metrics computation process to cover\ndiverse use cases. The efficacy of panoptica is demonstrated on various\nreal-world biomedical datasets, where an instance-wise evaluation is\ninstrumental for an accurate representation of the underlying clinical task.\nOverall, we envision panoptica as a valuable tool facilitating in-depth\nevaluation of segmentation methods.\n","authors":["Florian Kofler","Hendrik Möller","Josef A. Buchner","Ezequiel de la Rosa","Ivan Ezhov","Marcel Rosier","Isra Mekki","Suprosanna Shit","Moritz Negwer","Rami Al-Maskari","Ali Ertürk","Shankeeth Vinayahalingam","Fabian Isensee","Sarthak Pati","Daniel Rueckert","Jan S. Kirschke","Stefan K. Ehrlich","Annika Reinke","Bjoern Menze","Benedikt Wiestler","Marie Piraud"],"pdf_url":"https://arxiv.org/pdf/2312.02608v1.pdf","comment":"15 pages, 6 figures, 3 tables"},{"id":"http://arxiv.org/abs/2305.19798v2","updated":"2023-12-05T09:26:05Z","published":"2023-05-31T12:38:24Z","title":"Primal-Attention: Self-attention through Asymmetric Kernel SVD in Primal\n Representation","summary":" Recently, a new line of works has emerged to understand and improve\nself-attention in Transformers by treating it as a kernel machine. However,\nexisting works apply the methods for symmetric kernels to the asymmetric\nself-attention, resulting in a nontrivial gap between the analytical\nunderstanding and numerical implementation. In this paper, we provide a new\nperspective to represent and optimize self-attention through asymmetric Kernel\nSingular Value Decomposition (KSVD), which is also motivated by the low-rank\nproperty of self-attention normally observed in deep layers. Through asymmetric\nKSVD, $i$) a primal-dual representation of self-attention is formulated, where\nthe optimization objective is cast to maximize the projection variances in the\nattention outputs; $ii$) a novel attention mechanism, i.e., Primal-Attention,\nis proposed via the primal representation of KSVD, avoiding explicit\ncomputation of the kernel matrix in the dual; $iii$) with KKT conditions, we\nprove that the stationary solution to the KSVD optimization in Primal-Attention\nyields a zero-value objective. In this manner, KSVD optimization can be\nimplemented by simply minimizing a regularization loss, so that low-rank\nproperty is promoted without extra decomposition. Numerical experiments show\nstate-of-the-art performance of our Primal-Attention with improved efficiency.\nMoreover, we demonstrate that the deployed KSVD optimization regularizes\nPrimal-Attention with a sharper singular value decay than that of the canonical\nself-attention, further verifying the great potential of our method. To the\nbest of our knowledge, this is the first work that provides a primal-dual\nrepresentation for the asymmetric kernel in self-attention and successfully\napplies it to modeling and optimization.\n","authors":["Yingyi Chen","Qinghua Tao","Francesco Tonin","Johan A. K. Suykens"],"pdf_url":"https://arxiv.org/pdf/2305.19798v2.pdf","comment":"NeurIPS 2023. 
We provide a primal-dual representation for the\n asymmetric self-attention in transformer that allows to avoid explicit\n computation of the kernel matrix"},{"id":"http://arxiv.org/abs/2304.06059v2","updated":"2023-12-05T09:16:40Z","published":"2023-04-12T15:29:28Z","title":"Efficient Deep Learning Models for Privacy-preserving People Counting on\n Low-resolution Infrared Arrays","summary":" Ultra-low-resolution Infrared (IR) array sensors offer a low-cost,\nenergy-efficient, and privacy-preserving solution for people counting, with\napplications such as occupancy monitoring. Previous work has shown that Deep\nLearning (DL) can yield superior performance on this task. However, the\nliterature was missing an extensive comparative analysis of various efficient\nDL architectures for IR array-based people counting, that considers not only\ntheir accuracy, but also the cost of deploying them on memory- and\nenergy-constrained Internet of Things (IoT) edge nodes. In this work, we\naddress this need by comparing 6 different DL architectures on a novel dataset\ncomposed of IR images collected from a commercial 8x8 array, which we made\nopenly available. With a wide architectural exploration of each model type, we\nobtain a rich set of Pareto-optimal solutions, spanning cross-validated\nbalanced accuracy scores in the 55.70-82.70% range. When deployed on a\ncommercial Microcontroller (MCU) by STMicroelectronics, the STM32L4A6ZG, these\nmodels occupy 0.41-9.28kB of memory, and require 1.10-7.74ms per inference,\nwhile consuming 17.18-120.43 $\\mu$J of energy. Our models are significantly\nmore accurate than a previous deterministic method (up to +39.9%), while being\nup to 3.53x faster and more energy efficient. Further, our models' accuracy is\ncomparable to state-of-the-art DL solutions on similar resolution sensors,\ndespite a much lower complexity. All our models enable continuous, real-time\ninference on a MCU-based IoT node, with years of autonomous operation without\nbattery recharging.\n","authors":["Chen Xie","Francesco Daghero","Yukai Chen","Marco Castellano","Luca Gandolfi","Andrea Calimera","Enrico Macii","Massimo Poncino","Daniele Jahier Pagliari"],"pdf_url":"https://arxiv.org/pdf/2304.06059v2.pdf","comment":"This article has been accepted for publication in IEEE Internet of\n Things Journal; Fixed typos"},{"id":"http://arxiv.org/abs/2312.02596v1","updated":"2023-12-05T09:15:10Z","published":"2023-12-05T09:15:10Z","title":"TSVR+: Twin support vector regression with privileged information","summary":" In the realm of machine learning, the data may contain additional attributes,\nknown as privileged information (PI). The main purpose of PI is to assist in\nthe training of the model and then utilize the acquired knowledge to make\npredictions for unseen samples. Support vector regression (SVR) is an effective\nregression model, however, it has a low learning speed due to solving a convex\nquadratic problem (QP) subject to a pair of constraints. In contrast, twin\nsupport vector regression (TSVR) is more efficient than SVR as it solves two\nQPs each subject to one set of constraints. However, TSVR and its variants are\ntrained only on regular features and do not use privileged features for\ntraining. To fill this gap, we introduce a fusion of TSVR with learning using\nprivileged information (LUPI) and propose a novel approach called twin support\nvector regression with privileged information (TSVR+). 
The regularization terms\nin the proposed TSVR+ capture the essence of statistical learning theory and\nimplement the structural risk minimization principle. We use the successive\noverrelaxation (SOR) technique to solve the optimization problem of the\nproposed TSVR+, which enhances the training efficiency. As far as our knowledge\nextends, the integration of the LUPI concept into twin variants of regression\nmodels is a novel advancement. The numerical experiments conducted on UCI,\nstock and time series data collectively demonstrate the superiority of the\nproposed model.\n","authors":["Anuradha Kumari","M. Tanveer"],"pdf_url":"https://arxiv.org/pdf/2312.02596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02592v1","updated":"2023-12-05T09:09:21Z","published":"2023-12-05T09:09:21Z","title":"FRAPPÉ: A Post-Processing Framework for Group Fairness Regularization","summary":" Post-processing mitigation techniques for group fairness generally adjust the\ndecision threshold of a base model in order to improve fairness. Methods in\nthis family exhibit several advantages that make them appealing in practice:\npost-processing requires no access to the model training pipeline, is agnostic\nto the base model architecture, and offers a reduced computation cost compared\nto in-processing. Despite these benefits, existing methods face other\nchallenges that limit their applicability: they require knowledge of the\nsensitive attributes at inference time and are oftentimes outperformed by\nin-processing. In this paper, we propose a general framework to transform any\nin-processing method with a penalized objective into a post-processing\nprocedure. The resulting method is specifically designed to overcome the\naforementioned shortcomings of prior post-processing approaches. Furthermore,\nwe show theoretically and through extensive experiments on real-world data that\nthe resulting post-processing method matches or even surpasses the\nfairness-error trade-off offered by the in-processing counterpart.\n","authors":["Alexandru Ţifrea","Preethi Lahoti","Ben Packer","Yoni Halpern","Ahmad Beirami","Flavien Prost"],"pdf_url":"https://arxiv.org/pdf/2312.02592v1.pdf","comment":"Presubmission"},{"id":"http://arxiv.org/abs/2311.03236v2","updated":"2023-12-05T09:08:30Z","published":"2023-11-06T16:26:52Z","title":"Out-of-distribution Detection Learning with Unreliable\n Out-of-distribution Sources","summary":" Out-of-distribution (OOD) detection discerns OOD data where the predictor\ncannot make valid predictions as in-distribution (ID) data, thereby increasing\nthe reliability of open-world classification. However, it is typically hard to\ncollect real out-of-distribution (OOD) data for training a predictor capable of\ndiscerning ID and OOD patterns. This obstacle gives rise to data\ngeneration-based learning methods, synthesizing OOD data via data generators\nfor predictor training without requiring any real OOD data. Related methods\ntypically pre-train a generator on ID data and adopt various selection\nprocedures to find those data likely to be the OOD cases. However, generated\ndata may still coincide with ID semantics, i.e., mistaken OOD generation\nremains, confusing the predictor between ID and OOD data. 
To this end, we\nsuggest that generated data (with mistaken OOD generation) can be used to\ndevise an auxiliary OOD detection task to facilitate real OOD detection.\nSpecifically, we can ensure that learning from such an auxiliary task is\nbeneficial if the ID and the OOD parts have disjoint supports, with the help of\na well-designed training procedure for the predictor. Accordingly, we propose a\npowerful data generation-based learning method named Auxiliary Task-based OOD\nLearning (ATOL) that can relieve the mistaken OOD generation. We conduct\nextensive experiments under various OOD detection setups, demonstrating the\neffectiveness of our method against its advanced counterparts.\n","authors":["Haotian Zheng","Qizhou Wang","Zhen Fang","Xiaobo Xia","Feng Liu","Tongliang Liu","Bo Han"],"pdf_url":"https://arxiv.org/pdf/2311.03236v2.pdf","comment":"Accepted by NeurIPS 2023"},{"id":"http://arxiv.org/abs/2310.06609v2","updated":"2023-12-05T09:01:30Z","published":"2023-10-10T13:23:05Z","title":"Discovering Interpretable Physical Models using Symbolic Regression and\n Discrete Exterior Calculus","summary":" Computational modeling is a key resource to gather insight into physical\nsystems in modern scientific research and engineering. While access to large\namount of data has fueled the use of Machine Learning (ML) to recover physical\nmodels from experiments and increase the accuracy of physical simulations,\npurely data-driven models have limited generalization and interpretability. To\novercome these limitations, we propose a framework that combines Symbolic\nRegression (SR) and Discrete Exterior Calculus (DEC) for the automated\ndiscovery of physical models starting from experimental data. Since these\nmodels consist of mathematical expressions, they are interpretable and amenable\nto analysis, and the use of a natural, general-purpose discrete mathematical\nlanguage for physics favors generalization with limited input data.\nImportantly, DEC provides building blocks for the discrete analogue of field\ntheories, which are beyond the state-of-the-art applications of SR to physical\nproblems. Further, we show that DEC allows to implement a strongly-typed SR\nprocedure that guarantees the mathematical consistency of the recovered models\nand reduces the search space of symbolic expressions. Finally, we prove the\neffectiveness of our methodology by re-discovering three models of Continuum\nPhysics from synthetic experimental data: Poisson equation, the Euler's\nElastica and the equations of Linear Elasticity. Thanks to their\ngeneral-purpose nature, the methods developed in this paper may be applied to\ndiverse contexts of physical modeling.\n","authors":["Simone Manti","Alessandro Lucantonio"],"pdf_url":"https://arxiv.org/pdf/2310.06609v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01529v2","updated":"2023-12-05T09:01:07Z","published":"2023-12-03T23:03:22Z","title":"T3D: Towards 3D Medical Image Understanding through Vision-Language\n Pre-training","summary":" Expert annotation of 3D medical image for downstream analysis is\nresource-intensive, posing challenges in clinical applications. Visual\nself-supervised learning (vSSL), though effective for learning visual\ninvariance, neglects the incorporation of domain knowledge from medicine. 
To\nincorporate medical knowledge into visual representation learning,\nvision-language pre-training (VLP) has shown promising results in 2D image.\nHowever, existing VLP approaches become generally impractical when applied to\nhigh-resolution 3D medical images due to GPU hardware constraints and the\npotential loss of critical details caused by downsampling, which is the\nintuitive solution to hardware constraints. To address the above limitations,\nwe introduce T3D, the first VLP framework designed for high-resolution 3D\nmedical images. T3D incorporates two text-informed pretext tasks:\n(\\lowerromannumeral{1}) text-informed contrastive learning;\n(\\lowerromannumeral{2}) text-informed image restoration. These tasks focus on\nlearning 3D visual representations from high-resolution 3D medical images and\nintegrating clinical knowledge from radiology reports, without distorting\ninformation through forced alignment of downsampled volumes with detailed\nanatomical text. Trained on a newly curated large-scale dataset of 3D medical\nimages and radiology reports, T3D significantly outperforms current vSSL\nmethods in tasks like organ and tumor segmentation, as well as disease\nclassification. This underlines T3D's potential in representation learning for\n3D medical image analysis. All data and code will be available upon acceptance.\n","authors":["Che Liu","Cheng Ouyang","Yinda Chen","Cesar César Quilodrán-Casas","Lei Ma","Jie Fu","Yike Guo","Anand Shah","Wenjia Bai","Rossella Arcucci"],"pdf_url":"https://arxiv.org/pdf/2312.01529v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.10313v4","updated":"2023-12-05T08:59:33Z","published":"2023-09-19T04:51:13Z","title":"Investigating the Catastrophic Forgetting in Multimodal Large Language\n Models","summary":" Following the success of GPT4, there has been a surge in interest in\nmultimodal large language model (MLLM) research. This line of research focuses\non developing general-purpose LLMs through fine-tuning pre-trained LLMs and\nvision models. However, catastrophic forgetting, a notorious phenomenon where\nthe fine-tuned model fails to retain similar performance compared to the\npre-trained model, still remains an inherent problem in multimodal LLMs (MLLM).\nIn this paper, we introduce EMT: Evaluating MulTimodality for evaluating the\ncatastrophic forgetting in MLLMs, by treating each MLLM as an image classifier.\nWe first apply EMT to evaluate several open-source fine-tuned MLLMs and we\ndiscover that almost all evaluated MLLMs fail to retain the same performance\nlevels as their vision encoders on standard image classification tasks.\nMoreover, we continue fine-tuning LLaVA, an MLLM and utilize EMT to assess\nperformance throughout the fine-tuning. Interestingly, our results suggest that\nearly-stage fine-tuning on an image dataset improves performance across other\nimage datasets, by enhancing the alignment of text and visual features.\nHowever, as fine-tuning proceeds, the MLLMs begin to hallucinate, resulting in\na significant loss of generalizability, even when the image encoder remains\nfrozen. 
Our results suggest that MLLMs have yet to demonstrate performance on\npar with their vision models on standard image classification tasks and the\ncurrent MLLM fine-tuning procedure still has room for improvement.\n","authors":["Yuexiang Zhai","Shengbang Tong","Xiao Li","Mu Cai","Qing Qu","Yong Jae Lee","Yi Ma"],"pdf_url":"https://arxiv.org/pdf/2309.10313v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12334v2","updated":"2023-12-05T08:52:09Z","published":"2023-08-18T09:32:49Z","title":"Deep Knowledge Tracing is an implicit dynamic multidimensional item\n response theory model","summary":" Knowledge tracing consists in predicting the performance of some students on\nnew questions given their performance on previous questions, and can be a prior\nstep to optimizing assessment and learning. Deep knowledge tracing (DKT) is a\ncompetitive model for knowledge tracing relying on recurrent neural networks,\neven if some simpler models may match its performance. However, little is known\nabout why DKT works so well. In this paper, we frame deep knowledge tracing as\na encoderdecoder architecture. This viewpoint not only allows us to propose\nbetter models in terms of performance, simplicity or expressivity but also\nopens up promising avenues for future research directions. In particular, we\nshow on several small and large datasets that a simpler decoder, with possibly\nfewer parameters than the one used by DKT, can predict student performance\nbetter.\n","authors":["Jill-Jênn Vie","Hisashi Kashima"],"pdf_url":"https://arxiv.org/pdf/2309.12334v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.06053v4","updated":"2023-12-05T08:51:52Z","published":"2023-08-11T10:05:53Z","title":"Cost-effective On-device Continual Learning over Memory Hierarchy with\n Miro","summary":" Continual learning (CL) trains NN models incrementally from a continuous\nstream of tasks. To remember previously learned knowledge, prior studies store\nold samples over a memory hierarchy and replay them when new tasks arrive. Edge\ndevices that adopt CL to preserve data privacy are typically energy-sensitive\nand thus require high model accuracy while not compromising energy efficiency,\ni.e., cost-effectiveness. Our work is the first to explore the design space of\nhierarchical memory replay-based CL to gain insights into achieving\ncost-effectiveness on edge devices. We present Miro, a novel system runtime\nthat carefully integrates our insights into the CL framework by enabling it to\ndynamically configure the CL system based on resource states for the best\ncost-effectiveness. To reach this goal, Miro also performs online profiling on\nparameters with clear accuracy-energy trade-offs and adapts to optimal values\nwith low overhead. 
Extensive evaluations show that Miro significantly\noutperforms baseline systems we build for comparison, consistently achieving\nhigher cost-effectiveness.\n","authors":["Xinyue Ma","Suyeon Jeong","Minjia Zhang","Di Wang","Jonghyun Choi","Myeongjae Jeon"],"pdf_url":"https://arxiv.org/pdf/2308.06053v4.pdf","comment":"This paper is published in the 29th Annual International Conference\n on Mobile Computing and Networking (ACM MobiCom '23)"},{"id":"http://arxiv.org/abs/2312.02573v1","updated":"2023-12-05T08:41:23Z","published":"2023-12-05T08:41:23Z","title":"UTBoost: A Tree-boosting based System for Uplift Modeling","summary":" Uplift modeling refers to the set of machine learning techniques that a\nmanager may use to estimate customer uplift, that is, the net effect of an\naction on some customer outcome. By identifying the subset of customers for\nwhom a treatment will have the greatest effect, uplift models assist\ndecision-makers in optimizing resource allocations and maximizing overall\nreturns. Accurately estimating customer uplift poses practical challenges, as\nit requires assessing the difference between two mutually exclusive outcomes\nfor each individual. In this paper, we propose two innovative adaptations of\nthe well-established Gradient Boosting Decision Trees (GBDT) algorithm, which\nlearn the causal effect in a sequential way and overcome the counter-factual\nnature. Both approaches innovate existing techniques in terms of ensemble\nlearning method and learning objectives, respectively. Experiments on\nlarge-scale datasets demonstrate the usefulness of the proposed methods, which\noften yielding remarkable improvements over base models. To facilitate the\napplication, we develop the UTBoost, an end-to-end tree boosting system\nspecifically designed for uplift modeling. The package is open source and has\nbeen optimized for training speed to meet the needs of real industrial\napplications.\n","authors":["Junjie Gao","Xiangyu Zheng","DongDong Wang","Zhixiang Huang","Bangqi Zheng","Kai Yang"],"pdf_url":"https://arxiv.org/pdf/2312.02573v1.pdf","comment":"11 pages, 3 figures"},{"id":"http://arxiv.org/abs/2312.02566v1","updated":"2023-12-05T08:24:26Z","published":"2023-12-05T08:24:26Z","title":"Structured World Representations in Maze-Solving Transformers","summary":" Transformer models underpin many recent advances in practical machine\nlearning applications, yet understanding their internal behavior continues to\nelude researchers. Given the size and complexity of these models, forming a\ncomprehensive picture of their inner workings remains a significant challenge.\nTo this end, we set out to understand small transformer models in a more\ntractable setting: that of solving mazes. In this work, we focus on the\nabstractions formed by these models and find evidence for the consistent\nemergence of structured internal representations of maze topology and valid\npaths. We demonstrate this by showing that the residual stream of only a single\ntoken can be linearly decoded to faithfully reconstruct the entire maze. We\nalso find that the learned embeddings of individual tokens have spatial\nstructure. Furthermore, we take steps towards deciphering the circuity of\npath-following by identifying attention heads (dubbed $\\textit{adjacency\nheads}$), which are implicated in finding valid subsequent tokens.\n","authors":["Michael Igorevich Ivanitskiy","Alex F. 
Spies","Tilman Räuker","Guillaume Corlouer","Chris Mathwin","Lucia Quirke","Can Rager","Rusheb Shah","Dan Valentine","Cecilia Diniz Behn","Katsumi Inoue","Samy Wu Fung"],"pdf_url":"https://arxiv.org/pdf/2312.02566v1.pdf","comment":"15 pages, 18 figures, 15 tables. Corresponding author: Michael\n Ivanitskiy (mivanits@mines.edu). Code available at\n https://github.com/understanding-search/structured-representations-maze-transformers"},{"id":"http://arxiv.org/abs/2212.00433v2","updated":"2023-12-05T08:12:53Z","published":"2022-12-01T11:11:14Z","title":"Regularization Trade-offs with Fake Features","summary":" Recent successes of massively overparameterized models have inspired a new\nline of work investigating the underlying conditions that enable\noverparameterized models to generalize well. This paper considers a framework\nwhere the possibly overparametrized model includes fake features, i.e.,\nfeatures that are present in the model but not in the data. We present a\nnon-asymptotic high-probability bound on the generalization error of the ridge\nregression problem under the model misspecification of having fake features.\nOur highprobability results provide insights into the interplay between the\nimplicit regularization provided by the fake features and the explicit\nregularization provided by the ridge parameter. Numerical results illustrate\nthe trade-off between the number of fake features and how the optimal ridge\nparameter may heavily depend on the number of fake features.\n","authors":["Martin Hellkvist","Ayça Özçelikkale","Anders Ahlén"],"pdf_url":"https://arxiv.org/pdf/2212.00433v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.12307v2","updated":"2023-12-05T08:12:50Z","published":"2023-09-21T17:59:11Z","title":"LongLoRA: Efficient Fine-tuning of Long-Context Large Language Models","summary":" We present LongLoRA, an efficient fine-tuning approach that extends the\ncontext sizes of pre-trained large language models (LLMs), with limited\ncomputation cost. Typically, training LLMs with long context sizes is\ncomputationally expensive, requiring extensive training hours and GPU\nresources. For example, training on the context length of 8192 needs 16x\ncomputational costs in self-attention layers as that of 2048. In this paper, we\nspeed up the context extension of LLMs in two aspects. On the one hand,\nalthough dense global attention is needed during inference, fine-tuning the\nmodel can be effectively and efficiently done by sparse local attention. The\nproposed shifted sparse attention (S$^2$-Attn) effectively enables context\nextension, leading to non-trivial computation saving with similar performance\nto fine-tuning with vanilla attention. Particularly, it can be implemented with\nonly two lines of code in training, while being optional in inference. On the\nother hand, we revisit the parameter-efficient fine-tuning regime for context\nexpansion. Notably, we find that LoRA for context extension works well under\nthe premise of trainable embedding and normalization. LongLoRA combines this\nimproved LoRA with S$^2$-Attn. LongLoRA demonstrates strong empirical results\non various tasks on Llama2 models from 7B/13B to 70B. LongLoRA adopts Llama2 7B\nfrom 4k context to 100k, or Llama2 70B to 32k on a single 8x A100 machine.\nLongLoRA extends models' context while retaining their original architectures,\nand is compatible with most existing techniques, like Flash-Attention2. 
In\naddition, we further conduct supervised fine-tuning with LongLoRA and our long\ninstruction-following LongAlpaca dataset.\n","authors":["Yukang Chen","Shengju Qian","Haotian Tang","Xin Lai","Zhijian Liu","Song Han","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2309.12307v2.pdf","comment":"Code, models, dataset, and demo are available at\n https://github.com/dvlab-research/LongLoRA"},{"id":"http://arxiv.org/abs/2312.02554v1","updated":"2023-12-05T07:52:12Z","published":"2023-12-05T07:52:12Z","title":"ULMA: Unified Language Model Alignment with Demonstration and Point-wise\n Human Preference","summary":" Language model alignment is a cutting-edge technique in large language model\ntraining to align the model output to user's intent, e.g., being helpful and\nharmless. Recent alignment framework consists of two steps: supervised\nfine-tuning with demonstration data and preference learning with human\npreference data. Previous preference learning methods, such as RLHF and DPO,\nmainly focus on pair-wise preference data. However, in many real-world\nscenarios where human feedbacks are intrinsically point-wise, these methods\nwill suffer from information loss or even fail. To fill this gap, in this\npaper, we first develop a preference learning method called point-wise DPO to\ntackle point-wise preference data. Further revelation on the connection between\nsupervised fine-tuning and point-wise preference learning enables us to develop\na unified framework for both human demonstration and point-wise preference\ndata, which sheds new light on the construction of preference dataset.\nExtensive experiments on point-wise datasets with binary or continuous labels\ndemonstrate the superior performance and efficiency of our proposed methods. A\nnew dataset with high-quality demonstration samples on harmlessness is\nconstructed and made publicly available.\n","authors":["Tianchi Cai","Xierui Song","Jiyan Jiang","Fei Teng","Jinjie Gu","Guannan Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.02554v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.14235v3","updated":"2023-12-05T07:41:23Z","published":"2023-09-25T15:47:07Z","title":"Stackelberg Driver Model for Continual Policy Improvement in\n Scenario-Based Closed-Loop Autonomous Driving","summary":" The deployment of autonomous vehicles (AVs) has faced hurdles due to the\ndominance of rare but critical corner cases within the long-tail distribution\nof driving scenarios, which negatively affects their overall performance. To\naddress this challenge, adversarial generation methods have emerged as a class\nof efficient approaches to synthesize safety-critical scenarios for AV testing.\nHowever, these generated scenarios are often underutilized for AV training,\nresulting in the potential for continual AV policy improvement remaining\nuntapped, along with a deficiency in the closed-loop design needed to achieve\nit. Therefore, we tailor the Stackelberg Driver Model (SDM) to accurately\ncharacterize the hierarchical nature of vehicle interaction dynamics,\nfacilitating iterative improvement by engaging background vehicles (BVs) and AV\nin a sequential game-like interaction paradigm. With AV acting as the leader\nand BVs as followers, this leader-follower modeling ensures that AV would\nconsistently refine its policy, always taking into account the additional\ninformation that BVs play the best response to challenge AV. 
Extensive\nexperiments have shown that our algorithm exhibits superior performance\ncompared to several baselines especially in higher dimensional scenarios,\nleading to substantial advancements in AV capabilities while continually\ngenerating progressively challenging scenarios. Code is available at\nhttps://github.com/BlueCat-de/SDM.\n","authors":["Haoyi Niu","Qimao Chen","Yingyue Li","Yi Zhang","Jianming Hu"],"pdf_url":"https://arxiv.org/pdf/2309.14235v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.03191v2","updated":"2023-12-05T07:35:24Z","published":"2023-11-06T15:29:30Z","title":"DeepInception: Hypnotize Large Language Model to Be Jailbreaker","summary":" Despite remarkable success in various applications, large language models\n(LLMs) are vulnerable to adversarial jailbreaks that make the safety guardrails\nvoid. However, previous studies for jailbreaks usually resort to brute-force\noptimization or extrapolations of a high computation cost, which might not be\npractical or effective. In this paper, inspired by the Milgram experiment that\nindividuals can harm another person if they are told to do so by an\nauthoritative figure, we disclose a lightweight method, termed as\nDeepInception, which can easily hypnotize LLM to be a jailbreaker and unlock\nits misusing risks. Specifically, DeepInception leverages the personification\nability of LLM to construct a novel nested scene to behave, which realizes an\nadaptive way to escape the usage control in a normal scenario and provides the\npossibility for further direct jailbreaks. Empirically, we conduct\ncomprehensive experiments to show its efficacy. Our DeepInception can achieve\ncompetitive jailbreak success rates with previous counterparts and realize a\ncontinuous jailbreak in subsequent interactions, which reveals the critical\nweakness of self-losing on both open/closed-source LLMs like Falcon, Vicuna,\nLlama-2, and GPT-3.5/4/4V. Our investigation appeals that people should pay\nmore attention to the safety aspects of LLMs and a stronger defense against\ntheir misuse risks. The code is publicly available at:\nhttps://github.com/tmlr-group/DeepInception.\n","authors":["Xuan Li","Zhanke Zhou","Jianing Zhu","Jiangchao Yao","Tongliang Liu","Bo Han"],"pdf_url":"https://arxiv.org/pdf/2311.03191v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02547v1","updated":"2023-12-05T07:33:51Z","published":"2023-12-05T07:33:51Z","title":"On Optimal Consistency-Robustness Trade-Off for Learning-Augmented\n Multi-Option Ski Rental","summary":" The learning-augmented multi-option ski rental problem generalizes the\nclassical ski rental problem in two ways: the algorithm is provided with a\nprediction on the number of days we can ski, and the ski rental options now\ncome with a variety of rental periods and prices to choose from, unlike the\nclassical two-option setting. Subsequent to the initial study of the\nmulti-option ski rental problem (without learning augmentation) due to Zhang,\nPoon, and Xu, significant progress has been made for this problem recently in\nparticular. The problem is very well understood when we relinquish one of the\ntwo generalizations -- for the learning-augmented classical ski rental problem,\nalgorithms giving best-possible trade-off between consistency and robustness\nexist; for the multi-option ski rental problem without learning augmentation,\ndeterministic/randomized algorithms giving the best-possible competitiveness\nhave been found. 
However, in presence of both generalizations, there remained a\nhuge gap between the algorithmic and impossibility results. In fact, for\nrandomized algorithms, we did not have any nontrivial lower bounds on the\nconsistency-robustness trade-off before.\n This paper bridges this gap for both deterministic and randomized algorithms.\nFor deterministic algorithms, we present a best-possible algorithm that\ncompletely matches the known lower bound. For randomized algorithms, we show\nthe first nontrivial lower bound on the consistency-robustness trade-off, and\nalso present an improved randomized algorithm. Our algorithm matches our lower\nbound on robustness within a factor of e/2 when the consistency is at most\n1.086.\n","authors":["Yongho Shin","Changyeol Lee","Hyung-Chan An"],"pdf_url":"https://arxiv.org/pdf/2312.02547v1.pdf","comment":"16 pages, 2 figures"},{"id":"http://arxiv.org/abs/2312.02544v1","updated":"2023-12-05T07:21:00Z","published":"2023-12-05T07:21:00Z","title":"Characterization of Locality in Spin States and Forced Moves for\n Optimizations","summary":" Ising formulations are widely utilized to solve combinatorial optimization\nproblems, and a variety of quantum or semiconductor-based hardware has recently\nbeen made available. In combinatorial optimization problems, the existence of\nlocal minima in energy landscapes is problematic to use to seek the global\nminimum. We note that the aim of the optimization is not to obtain exact\nsamplings from the Boltzmann distribution, and there is thus no need to satisfy\ndetailed balance conditions. In light of this fact, we develop an algorithm to\nget out of the local minima efficiently while it does not yield the exact\nsamplings. For this purpose, we utilize a feature that characterizes locality\nin the current state, which is easy to obtain with a type of specialized\nhardware. Furthermore, as the proposed algorithm is based on a rejection-free\nalgorithm, the computational cost is low. In this work, after presenting the\ndetails of the proposed algorithm, we report the results of numerical\nexperiments that demonstrate the effectiveness of the proposed feature and\nalgorithm.\n","authors":["Yoshiki Sato","Makiko Konoshima","Hirotaka Tamura","Jun Ohkubo"],"pdf_url":"https://arxiv.org/pdf/2312.02544v1.pdf","comment":"7 pages, 2 figures"},{"id":"http://arxiv.org/abs/2305.16573v5","updated":"2023-12-05T07:19:21Z","published":"2023-05-26T01:45:19Z","title":"Exploring Weight Balancing on Long-Tailed Recognition Problem","summary":" Recognition problems in long-tailed data, in which the sample size per class\nis heavily skewed, have gained importance because the distribution of the\nsample size per class in a dataset is generally exponential unless the sample\nsize is intentionally adjusted. Various methods have been devised to address\nthese problems. Recently, weight balancing, which combines well-known classical\nregularization techniques with two-stage training, has been proposed. Despite\nits simplicity, it is known for its high performance compared with existing\nmethods devised in various ways. However, there is a lack of understanding as\nto why this method is effective for long-tailed data. 
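For the multi-option ski rental entry above (arXiv:2312.02547), the classical two-option, learning-augmented rule the abstract refers to (best-possible consistency-robustness trade-off for the classical problem) is easy to state; the sketch below shows that background algorithm only, not the paper's multi-option one. With buying cost b, prediction y of the number of ski days, and trust parameter lam in (0, 1], this rule gives roughly (1 + lam)-consistency and (1 + 1/lam)-robustness.

import math

def ski_rental_with_prediction(b, y, lam, true_days):
    # Decide in advance on which day to buy, trusting the prediction more as lam -> 1.
    buy_day = math.ceil(lam * b) if y >= b else math.ceil(b / lam)
    if true_days < buy_day:
        return true_days              # rented every day, never bought
    return (buy_day - 1) + b          # rented until buying on day `buy_day`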
In this study, we analyze\nweight balancing by focusing on neural collapse and the cone effect at each\ntraining stage and found that it can be decomposed into an increase in Fisher's\ndiscriminant ratio of the feature extractor caused by weight decay and cross\nentropy loss and implicit logit adjustment caused by weight decay and\nclass-balanced loss. Our analysis enables the training method to be further\nsimplified by reducing the number of training stages to one while increasing\naccuracy.\n","authors":["Naoya Hasegawa","Issei Sato"],"pdf_url":"https://arxiv.org/pdf/2305.16573v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.16994v4","updated":"2023-12-05T07:18:56Z","published":"2022-11-30T13:49:43Z","title":"Continual Learning with Distributed Optimization: Does CoCoA Forget?","summary":" We focus on the continual learning problem where the tasks arrive\nsequentially and the aim is to perform well on the newly arrived task without\nperformance degradation on the previously seen tasks. In contrast to the\ncontinual learning literature focusing on the centralized setting, we\ninvestigate the distributed estimation framework. We consider the\nwell-established distributed learning algorithm COCOA. We derive closed form\nexpressions for the iterations for the overparametrized case. We illustrate the\nconvergence and the error performance of the algorithm based on the\nover/under-parameterization of the problem. Our results show that depending on\nthe problem dimensions and data generation assumptions, COCOA can perform\ncontinual learning over a sequence of tasks, i.e., it can learn a new task\nwithout forgetting previously learned tasks, with access only to one task at a\ntime.\n","authors":["Martin Hellkvist","Ayça Özçelikkale","Anders Ahlén"],"pdf_url":"https://arxiv.org/pdf/2211.16994v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00839v2","updated":"2023-12-05T07:16:55Z","published":"2023-12-01T01:52:38Z","title":"PipeOptim: Ensuring Effective 1F1B Schedule with Optimizer-Dependent\n Weight Prediction","summary":" Asynchronous pipeline model parallelism with a \"1F1B\" (one forward, one\nbackward) schedule generates little bubble overhead and always provides quite a\nhigh throughput. However, the \"1F1B\" schedule inevitably leads to weight\ninconsistency and weight staleness issues due to the cross-training of\ndifferent mini-batches across GPUs. To simultaneously address these two\nproblems, in this paper, we propose an optimizer-dependent weight prediction\nstrategy (a.k.a PipeOptim) for asynchronous pipeline training. The key insight\nof our proposal is that we employ a weight prediction strategy in the forward\npass to ensure that each mini-batch uses consistent and staleness-free weights\nto compute the forward pass. To be concrete, we first construct the weight\nprediction scheme based on the update rule of the used optimizer when training\nthe deep neural network models. Then throughout the \"1F1B\" pipelined training,\neach mini-batch is mandated to execute weight prediction ahead of the forward\npass, subsequently employing the predicted weights to perform the forward pass.\nAs a result, PipeOptim 1) inherits the advantage of the \"1F1B\" schedule and\ngenerates pretty high throughput, and 2) can ensure effective parameter\nlearning regardless of the type of the used optimizer. 
To verify the\neffectiveness of our proposal, we conducted extensive experimental evaluations\nusing eight different deep-learning models spanning three machine-learning\ntasks including image classification, sentiment analysis, and machine\ntranslation. The experiment results demonstrate that PipeOptim outperforms the\npopular pipelined approaches including GPipe, PipeDream, PipeDream-2BW, and\nSpecTrain. The code of PipeOptim can be accessible at\nhttps://github.com/guanleics/PipeOptim.\n","authors":["Lei Guan","Dongsheng Li","Jiye Liang","Wenjian Wang","Xicheng Lu"],"pdf_url":"https://arxiv.org/pdf/2312.00839v2.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2312.02537v1","updated":"2023-12-05T07:04:21Z","published":"2023-12-05T07:04:21Z","title":"Asymmetric leader-laggard cluster synchronization for collective\n decision-making with laser network","summary":" Photonic accelerators have recently attracted soaring interest, harnessing\nthe ultimate nature of light for information processing. Collective\ndecision-making with a laser network, employing the chaotic and synchronous\ndynamics of optically interconnected lasers to address the competitive\nmulti-armed bandit (CMAB) problem, is a highly compelling approach due to its\nscalability and experimental feasibility. We investigated essential network\nstructures for collective decision-making through quantitative stability\nanalysis. Moreover, we demonstrated the asymmetric preferences of players in\nthe CMAB problem, extending its functionality to more practical applications.\nOur study highlights the capability and significance of machine learning built\nupon chaotic lasers and photonic devices.\n","authors":["Shun Kotoku","Takatomo Mihana","André Röhm","Ryoichi Horisaki","Makoto Naruse"],"pdf_url":"https://arxiv.org/pdf/2312.02537v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02530v1","updated":"2023-12-05T06:28:19Z","published":"2023-12-05T06:28:19Z","title":"MEMTO: Memory-guided Transformer for Multivariate Time Series Anomaly\n Detection","summary":" Detecting anomalies in real-world multivariate time series data is\nchallenging due to complex temporal dependencies and inter-variable\ncorrelations. Recently, reconstruction-based deep models have been widely used\nto solve the problem. However, these methods still suffer from an\nover-generalization issue and fail to deliver consistently high performance. To\naddress this issue, we propose the MEMTO, a memory-guided Transformer using a\nreconstruction-based approach. It is designed to incorporate a novel memory\nmodule that can learn the degree to which each memory item should be updated in\nresponse to the input data. To stabilize the training procedure, we use a\ntwo-phase training paradigm which involves using K-means clustering for\ninitializing memory items. Additionally, we introduce a bi-dimensional\ndeviation-based detection criterion that calculates anomaly scores considering\nboth input space and latent space. We evaluate our proposed method on five\nreal-world datasets from diverse domains, and it achieves an average anomaly\ndetection F1-score of 95.74%, significantly outperforming the previous\nstate-of-the-art methods. 
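For the PipeOptim entry above (arXiv:2312.00839), the idea of running each mini-batch's forward pass on optimizer-predicted, staleness-compensated weights can be sketched as follows; the SGD-with-momentum prediction rule, the way staleness is counted, and the use of torch.func.functional_call are assumptions for illustration rather than the paper's exact scheme.

import torch
from torch.func import functional_call

def predict_weights(module, opt_state, lr, staleness):
    # Approximate the next `staleness` pending updates by reusing the current
    # momentum (velocity) buffer: W_hat = W - lr * staleness * v (SGD with momentum).
    # Gradients taken through W_hat flow back to the live parameters unchanged.
    predicted = {}
    for name, p in module.named_parameters():
        v = opt_state[p].get("momentum_buffer") if p in opt_state else None
        predicted[name] = p if v is None else p - lr * staleness * v
    return predicted

def forward_with_predicted_weights(module, x, opt_state, lr, staleness):
    # run this mini-batch on the predicted weights instead of the live ones
    predicted = predict_weights(module, opt_state, lr, staleness)
    return functional_call(module, predicted, (x,))

A pipeline stage would then call forward_with_predicted_weights(stage, batch, optimizer.state, lr, staleness) in place of stage(batch), with staleness set to the number of updates still pending for that stage under the 1F1B schedule.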
We also conduct extensive experiments to empirically\nvalidate the effectiveness of our proposed model's key components.\n","authors":["Junho Song","Keonwoo Kim","Jeonglyul Oh","Sungzoon Cho"],"pdf_url":"https://arxiv.org/pdf/2312.02530v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.02521v2","updated":"2023-12-05T06:12:04Z","published":"2023-09-05T18:22:11Z","title":"Comparative Analysis of CPU and GPU Profiling for Deep Learning Models","summary":" Deep Learning(DL) and Machine Learning(ML) applications are rapidly\nincreasing in recent days. Massive amounts of data are being generated over the\ninternet which can derive meaningful results by the use of ML and DL\nalgorithms. Hardware resources and open-source libraries have made it easy to\nimplement these algorithms. Tensorflow and Pytorch are one of the leading\nframeworks for implementing ML projects. By using those frameworks, we can\ntrace the operations executed on both GPU and CPU to analyze the resource\nallocations and consumption. This paper presents the time and memory allocation\nof CPU and GPU while training deep neural networks using Pytorch. This paper\nanalysis shows that GPU has a lower running time as compared to CPU for deep\nneural networks. For a simpler network, there are not many significant\nimprovements in GPU over the CPU.\n","authors":["Dipesh Gyawali"],"pdf_url":"https://arxiv.org/pdf/2309.02521v2.pdf","comment":"6 pages, 11 figures"},{"id":"http://arxiv.org/abs/2307.07980v3","updated":"2023-12-05T06:06:49Z","published":"2023-07-16T08:46:46Z","title":"Byzantine-Robust Distributed Online Learning: Taming Adversarial\n Participants in An Adversarial Environment","summary":" This paper studies distributed online learning under Byzantine attacks. The\nperformance of an online learning algorithm is often characterized by\n(adversarial) regret, which evaluates the quality of one-step-ahead\ndecision-making when an environment provides adversarial losses, and a\nsublinear bound is preferred. But we prove that, even with a class of\nstate-of-the-art robust aggregation rules, in an adversarial environment and in\nthe presence of Byzantine participants, distributed online gradient descent can\nonly achieve a linear adversarial regret bound, which is tight. This is the\ninevitable consequence of Byzantine attacks, even though we can control the\nconstant of the linear adversarial regret to a reasonable level. Interestingly,\nwhen the environment is not fully adversarial so that the losses of the honest\nparticipants are i.i.d. (independent and identically distributed), we show that\nsublinear stochastic regret, in contrast to the aforementioned adversarial\nregret, is possible. We develop a Byzantine-robust distributed online momentum\nalgorithm to attain such a sublinear stochastic regret bound. Extensive\nnumerical experiments corroborate our theoretical analysis.\n","authors":["Xingrong Dong","Zhaoxian Wu","Qing Ling","Zhi Tian"],"pdf_url":"https://arxiv.org/pdf/2307.07980v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02522v1","updated":"2023-12-05T06:05:04Z","published":"2023-12-05T06:05:04Z","title":"MASP: Scalable GNN-based Planning for Multi-Agent Navigation","summary":" We investigate the problem of decentralized multi-agent navigation tasks,\nwhere multiple agents need to reach initially unassigned targets in a limited\ntime. Classical planning-based methods suffer from expensive computation\noverhead at each step and offer limited expressiveness for complex cooperation\nstrategies. 
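For the CPU/GPU profiling entry above (arXiv:2309.02521), a minimal version of the measurement the abstract describes can be done with PyTorch's built-in profiler; the toy model, batch size, and step count below are placeholders, not the paper's setup.

import torch
from torch import nn
from torch.profiler import profile, ProfilerActivity

device = "cuda" if torch.cuda.is_available() else "cpu"
model = nn.Sequential(nn.Linear(512, 512), nn.ReLU(), nn.Linear(512, 10)).to(device)
opt = torch.optim.SGD(model.parameters(), lr=0.01)
loss_fn = nn.CrossEntropyLoss()

activities = [ProfilerActivity.CPU] + ([ProfilerActivity.CUDA] if device == "cuda" else [])
with profile(activities=activities, profile_memory=True, record_shapes=True) as prof:
    for _ in range(10):  # a few training steps are enough to populate the trace
        x = torch.randn(64, 512, device=device)
        y = torch.randint(0, 10, (64,), device=device)
        opt.zero_grad()
        loss_fn(model(x), y).backward()
        opt.step()

print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=10))

Sorting by self_cuda_time_total instead gives the GPU-side view of the same trace.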
In contrast, reinforcement learning (RL) has recently become a\npopular paradigm for addressing this issue. However, RL struggles with low data\nefficiency and cooperation when directly exploring (nearly) optimal policies in\nthe large search space, especially with an increased agent number (e.g., 10+\nagents) or in complex environments (e.g., 3D simulators). In this paper, we\npropose Multi-Agent Scalable GNN-based P lanner (MASP), a goal-conditioned\nhierarchical planner for navigation tasks with a substantial number of agents.\nMASP adopts a hierarchical framework to divide a large search space into\nmultiple smaller spaces, thereby reducing the space complexity and accelerating\ntraining convergence. We also leverage graph neural networks (GNN) to model the\ninteraction between agents and goals, improving goal achievement. Besides, to\nenhance generalization capabilities in scenarios with unseen team sizes, we\ndivide agents into multiple groups, each with a previously trained number of\nagents. The results demonstrate that MASP outperforms classical planning-based\ncompetitors and RL baselines, achieving a nearly 100% success rate with minimal\ntraining data in both multi-agent particle environments (MPE) with 50 agents\nand a quadrotor 3-dimensional environment (OmniDrones) with 20 agents.\nFurthermore, the learned policy showcases zero-shot generalization across\nunseen team sizes.\n","authors":["Xinyi Yang","Xinting Yang","Chao Yu","Jiayu Chen","Huazhong Yang","Yu Wang"],"pdf_url":"https://arxiv.org/pdf/2312.02522v1.pdf","comment":"Submitted to IEEE RA-L"},{"id":"http://arxiv.org/abs/2312.02519v1","updated":"2023-12-05T06:00:52Z","published":"2023-12-05T06:00:52Z","title":"Creative Agents: Empowering Agents with Imagination for Creative Tasks","summary":" We study building embodied agents for open-ended creative tasks. While\nexisting methods build instruction-following agents that can perform diverse\nopen-ended tasks, none of them demonstrates creativity -- the ability to give\nnovel and diverse task solutions implicit in the language instructions. This\nlimitation comes from their inability to convert abstract language instructions\ninto concrete task goals in the environment and perform long-horizon planning\nfor such complicated goals. Given the observation that humans perform creative\ntasks with the help of imagination, we propose a class of solutions for\ncreative agents, where the controller is enhanced with an imaginator that\ngenerates detailed imaginations of task outcomes conditioned on language\ninstructions. We introduce several approaches to implementing the components of\ncreative agents. We implement the imaginator with either a large language model\nfor textual imagination or a diffusion model for visual imagination. The\ncontroller can either be a behavior-cloning policy learned from data or a\npre-trained foundation model generating executable codes in the environment. We\nbenchmark creative tasks with the challenging open-world game Minecraft, where\nthe agents are asked to create diverse buildings given free-form language\ninstructions. In addition, we propose novel evaluation metrics for open-ended\ncreative tasks utilizing GPT-4V, which holds many advantages over existing\nmetrics. We perform a detailed experimental analysis of creative agents,\nshowing that creative agents are the first AI agents accomplishing diverse\nbuilding creation in the survival mode of Minecraft. 
Our benchmark and models\nare open-source for future research on creative agents\n(https://github.com/PKU-RL/Creative-Agents).\n","authors":["Chi Zhang","Penglin Cai","Yuhui Fu","Haoqi Yuan","Zongqing Lu"],"pdf_url":"https://arxiv.org/pdf/2312.02519v1.pdf","comment":"The first two authors contribute equally"},{"id":"http://arxiv.org/abs/2312.02517v1","updated":"2023-12-05T05:52:44Z","published":"2023-12-05T05:52:44Z","title":"Simplifying Neural Network Training Under Class Imbalance","summary":" Real-world datasets are often highly class-imbalanced, which can adversely\nimpact the performance of deep learning models. The majority of research on\ntraining neural networks under class imbalance has focused on specialized loss\nfunctions, sampling techniques, or two-stage training procedures. Notably, we\ndemonstrate that simply tuning existing components of standard deep learning\npipelines, such as the batch size, data augmentation, optimizer, and label\nsmoothing, can achieve state-of-the-art performance without any such\nspecialized class imbalance methods. We also provide key prescriptions and\nconsiderations for training under class imbalance, and an understanding of why\nimbalance methods succeed or fail.\n","authors":["Ravid Shwartz-Ziv","Micah Goldblum","Yucen Lily Li","C. Bayan Bruss","Andrew Gordon Wilson"],"pdf_url":"https://arxiv.org/pdf/2312.02517v1.pdf","comment":"NeurIPS 2023. Code available at\n https://github.com/ravidziv/SimplifyingImbalancedTraining"},{"id":"http://arxiv.org/abs/2311.17932v2","updated":"2023-12-05T05:48:16Z","published":"2023-11-27T22:53:41Z","title":"Generating Molecular Conformer Fields","summary":" In this paper we tackle the problem of generating conformers of a molecule in\n3D space given its molecular graph. We parameterize these conformers as\ncontinuous functions that map elements from the molecular graph to points in 3D\nspace. We then formulate the problem of learning to generate conformers as\nlearning a distribution over these functions using a diffusion generative\nmodel, called Molecular Conformer Fields (MCF). Our approach is simple and\nscalable, and achieves state-of-the-art performance on challenging molecular\nconformer generation benchmarks while making no assumptions about the explicit\nstructure of molecules (e.g. modeling torsional angles). MCF represents an\nadvance in extending diffusion models to handle complex scientific problems in\na conceptually simple, scalable and effective manner.\n","authors":["Yuyang Wang","Ahmed A. Elhag","Navdeep Jaitly","Joshua M. Susskind","Miguel Angel Bautista"],"pdf_url":"https://arxiv.org/pdf/2311.17932v2.pdf","comment":"20 pages, 11 figures. arXiv admin note: text overlap with\n arXiv:2305.15586"},{"id":"http://arxiv.org/abs/2312.01606v2","updated":"2023-12-05T05:43:31Z","published":"2023-12-04T03:38:17Z","title":"Deep Learning-Driven Enhancement of Welding Quality Control: Predicting\n Welding Depth and Pore Volume in Hairpin Welding","summary":" To advance quality assurance in the welding process, this study presents a\nrobust deep learning model that enables the prediction of two critical welds\nKey Performance Characteristics (KPCs): welding depth and average pore volume.\nIn the proposed approach, a comprehensive range of laser welding Key Input\nCharacteristics (KICs) is utilized, including welding beam geometries, welding\nfeed rates, path repetitions for weld beam geometries, and bright light weld\nratios for all paths, all of which were obtained from hairpin welding\nexperiments. 
Two deep learning networks are employed with multiple hidden dense\nlayers and linear activation functions to showcase the capabilities of deep\nneural networks in capturing the intricate nonlinear connections inherent\nwithin welding KPCs and KICs. Applying deep learning networks to the small\nnumerical experimental hairpin welding dataset has shown promising results,\nachieving Mean Absolute Error (MAE) values as low as 0.1079 for predicting\nwelding depth and 0.0641 for average pore volume. Additionally, the validity\nverification demonstrates the reliability of the proposed method. This, in\nturn, promises significant advantages in controlling welding outcomes, moving\nbeyond the current trend of relying merely on monitoring for defect\nclassification.\n","authors":["Amena Darwish","Stefan Ericson","Rohollah Ghasemi","Tobias Andersson","Dan Lönn","Andreas Andersson Lassila","Kent Salomonsson"],"pdf_url":"https://arxiv.org/pdf/2312.01606v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02515v1","updated":"2023-12-05T05:38:38Z","published":"2023-12-05T05:38:38Z","title":"ASPEN: High-Throughput LoRA Fine-Tuning of Large Language Models with a\n Single GPU","summary":" Transformer-based large language models (LLMs) have demonstrated outstanding\nperformance across diverse domains, particularly when fine-turned for specific\ndomains. Recent studies suggest that the resources required for fine-tuning\nLLMs can be economized through parameter-efficient methods such as Low-Rank\nAdaptation (LoRA). While LoRA effectively reduces computational burdens and\nresource demands, it currently supports only a single-job fine-tuning setup.\n In this paper, we present ASPEN, a high-throughput framework for fine-tuning\nLLMs. ASPEN efficiently trains multiple jobs on a single GPU using the LoRA\nmethod, leveraging shared pre-trained model and adaptive scheduling. ASPEN is\ncompatible with transformer-based language models like LLaMA and ChatGLM, etc.\nExperiments show that ASPEN saves 53% of GPU memory when training multiple\nLLaMA-7B models on NVIDIA A100 80GB GPU and boosts training throughput by about\n17% compared to existing methods when training with various pre-trained models\non different GPUs. The adaptive scheduling algorithm reduces turnaround time by\n24%, end-to-end training latency by 12%, prioritizing jobs and preventing\nout-of-memory issues.\n","authors":["Zhengmao Ye","Dengchun Li","Jingqi Tian","Tingfeng Lan","Jie Zuo","Lei Duan","Hui Lu","Yexi Jiang","Jian Sha","Ke Zhang","Mingjie Tang"],"pdf_url":"https://arxiv.org/pdf/2312.02515v1.pdf","comment":"14 pages, 14 figures"},{"id":"http://arxiv.org/abs/2310.04406v2","updated":"2023-12-05T05:25:55Z","published":"2023-10-06T17:55:11Z","title":"Language Agent Tree Search Unifies Reasoning Acting and Planning in\n Language Models","summary":" While large language models (LLMs) have demonstrated impressive performance\non a range of decision-making tasks, they rely on simple acting processes and\nfall short of broad deployment as autonomous agents. We introduce LATS\n(Language Agent Tree Search), a general framework that synergizes the\ncapabilities of LLMs in planning, acting, and reasoning. Drawing inspiration\nfrom Monte Carlo tree search in model-based reinforcement learning, LATS\nemploys LLMs as agents, value functions, and optimizers, repurposing their\nlatent strengths for enhanced decision-making. 
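For the ASPEN entry above (arXiv:2312.02515), the memory saving of multi-job LoRA fine-tuning comes from sharing one frozen pre-trained weight matrix across jobs while each job trains only its own low-rank factors; the generic per-job LoRA layer below illustrates that sharing, while ASPEN's batching, scheduling, and memory management are not shown. The rank, scaling, and initialization are conventional LoRA defaults assumed for illustration.

import torch
from torch import nn

class SharedLoRALinear(nn.Module):
    def __init__(self, base: nn.Linear, num_jobs: int, rank: int = 8, alpha: int = 16):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad_(False)          # shared pre-trained weights stay frozen
        self.scale = alpha / rank
        self.lora_a = nn.ParameterList(
            [nn.Parameter(torch.randn(rank, base.in_features) * 0.01) for _ in range(num_jobs)])
        self.lora_b = nn.ParameterList(
            [nn.Parameter(torch.zeros(base.out_features, rank)) for _ in range(num_jobs)])

    def forward(self, x, job: int):
        # y = W0 x + (alpha / r) * B_job A_job x ; only A_job and B_job receive gradients
        delta = x @ self.lora_a[job].T @ self.lora_b[job].T
        return self.base(x) + self.scale * delta

Each job then owns only its small (rank x in) and (out x rank) matrices, which is what lets many fine-tuning jobs share a single copy of the frozen base model on one GPU.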
What is crucial in this method\nis the use of an environment for external feedback, which offers a more\ndeliberate and adaptive problem-solving mechanism that moves beyond the\nlimitations of existing techniques. Our experimental evaluation across diverse\ndomains, such as programming, HotPotQA, and WebShop, illustrates the\napplicability of LATS for both reasoning and acting. In particular, LATS\nachieves 94.4% for programming on HumanEval with GPT-4 and an average score of\n75.9 for web browsing on WebShop with GPT-3.5, demonstrating the effectiveness\nand generality of our method.\n","authors":["Andy Zhou","Kai Yan","Michal Shlapentokh-Rothman","Haohan Wang","Yu-Xiong Wang"],"pdf_url":"https://arxiv.org/pdf/2310.04406v2.pdf","comment":"Website and code can be found at\n https://andyz245.github.io/LanguageAgentTreeSearch"},{"id":"http://arxiv.org/abs/2310.16546v3","updated":"2023-12-05T05:14:37Z","published":"2023-10-25T10:53:04Z","title":"Pitfall of Optimism: Distributional Reinforcement Learning by\n Randomizing Risk Criterion","summary":" Distributional reinforcement learning algorithms have attempted to utilize\nestimated uncertainty for exploration, such as optimism in the face of\nuncertainty. However, using the estimated variance for optimistic exploration\nmay cause biased data collection and hinder convergence or performance. In this\npaper, we present a novel distributional reinforcement learning algorithm that\nselects actions by randomizing risk criterion to avoid one-sided tendency on\nrisk. We provide a perturbed distributional Bellman optimality operator by\ndistorting the risk measure and prove the convergence and optimality of the\nproposed method with the weaker contraction property. Our theoretical results\nsupport that the proposed method does not fall into biased exploration and is\nguaranteed to converge to an optimal return. Finally, we empirically show that\nour method outperforms other existing distribution-based algorithms in various\nenvironments including Atari 55 games.\n","authors":["Taehyun Cho","Seungyub Han","Heesoo Lee","Kyungjae Lee","Jungwoo Lee"],"pdf_url":"https://arxiv.org/pdf/2310.16546v3.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.02494v1","updated":"2023-12-05T04:51:42Z","published":"2023-12-05T04:51:42Z","title":"ReconU-Net: a direct PET image reconstruction using U-Net architecture\n with back projection-induced skip connection","summary":" [Objective] This study aims to introduce a novel back projection-induced\nU-Net-shaped architecture, called ReconU-Net, for deep learning-based direct\npositron emission tomography (PET) image reconstruction. Additionally, our\nobjective is to analyze the behavior of direct PET image reconstruction and\ngain deeper insights by comparing the proposed ReconU-Net architecture with\nother encoder-decoder architectures without skip connections. [Approach] The\nproposed ReconU-Net architecture uniquely integrates the physical model of the\nback projection operation into the skip connection. This distinctive feature\nfacilitates the effective transfer of intrinsic spatial information from the\ninput sinogram to the reconstructed image via an embedded physical model. The\nproposed ReconU-Net was trained using Monte Carlo simulation data from the\nBrainweb phantom and tested on both simulated and real Hoffman brain phantom\ndata. 
[Main results] The proposed ReconU-Net method generated a reconstructed\nimage with a more accurate structure compared to other deep learning-based\ndirect reconstruction methods. Further analysis showed that the proposed\nReconU-Net architecture has the ability to transfer features of multiple\nresolutions, especially non-abstract high-resolution information, through skip\nconnections. Despite limited training on simulated data, the proposed\nReconU-Net successfully reconstructed the real Hoffman brain phantom, unlike\nother deep learning-based direct reconstruction methods, which failed to\nproduce a reconstructed image. [Significance] The proposed ReconU-Net can\nimprove the fidelity of direct PET image reconstruction, even when dealing with\nsmall training datasets, by leveraging the synergistic relationship between\ndata-driven modeling and the physics model of the imaging process.\n","authors":["Fumio Hashimoto","Kibo Ote"],"pdf_url":"https://arxiv.org/pdf/2312.02494v1.pdf","comment":"8 pages, 6 figures"},{"id":"http://arxiv.org/abs/2312.02491v1","updated":"2023-12-05T04:43:23Z","published":"2023-12-05T04:43:23Z","title":"Pseudo Replay-based Class Continual Learning for Online New Category\n Anomaly Detection in Additive Manufacturing","summary":" The incorporation of advanced sensors and machine learning techniques has\nenabled modern manufacturing enterprises to perform data-driven in-situ quality\nmonitoring based on the sensor data collected in manufacturing processes.\nHowever, one critical challenge is that newly presented defect category may\nmanifest as the manufacturing process continues, resulting in monitoring\nperformance deterioration of previously trained machine learning models. Hence,\nthere is an increasing need for empowering machine learning model to learn\ncontinually. Among all continual learning methods, memory-based continual\nlearning has the best performance but faces the constraints of data storage\ncapacity. To address this issue, this paper develops a novel pseudo\nreplay-based continual learning by integrating class incremental learning and\noversampling-based data generation. Without storing all the data, the developed\nframework could generate high-quality data representing previous classes to\ntrain machine learning model incrementally when new category anomaly occurs. In\naddition, it could even enhance the monitoring performance since it also\neffectively improves the data quality. The effectiveness of the proposed\nframework is validated in an additive manufacturing process, which leverages\nsupervised classification problem for anomaly detection. The experimental\nresults show that the developed method is very promising in detecting novel\nanomaly while maintaining a good performance on the previous task and brings up\nmore flexibility in model architecture.\n","authors":["Zhangyue Shi","Tianxin Xie","Chenang Liu","Yuxuan Li"],"pdf_url":"https://arxiv.org/pdf/2312.02491v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02490v1","updated":"2023-12-05T04:42:04Z","published":"2023-12-05T04:42:04Z","title":"Constrained Twin Variational Auto-Encoder for Intrusion Detection in IoT\n Systems","summary":" Intrusion detection systems (IDSs) play a critical role in protecting\nbillions of IoT devices from malicious attacks. However, the IDSs for IoT\ndevices face inherent challenges of IoT systems, including the heterogeneity of\nIoT data/devices, the high dimensionality of training data, and the imbalanced\ndata. 
Moreover, the deployment of IDSs on IoT systems is challenging, and\nsometimes impossible, due to the limited resources such as memory/storage and\ncomputing capability of typical IoT devices. To tackle these challenges, this\narticle proposes a novel deep neural network/architecture called Constrained\nTwin Variational Auto-Encoder (CTVAE) that can feed classifiers of IDSs with\nmore separable/distinguishable and lower-dimensional representation data.\nAdditionally, in comparison to the state-of-the-art neural networks used in\nIDSs, CTVAE requires less memory/storage and computing power, hence making it\nmore suitable for IoT IDS systems. Extensive experiments with the 11 most\npopular IoT botnet datasets show that CTVAE can boost around 1% in terms of\naccuracy and Fscore in attack detection compared to the state-of-the-art\nmachine learning and representation learning methods, whilst the running time\nfor attack detection is lower than 2E-6 seconds and the model size is lower\nthan 1 MB. We also further investigate various characteristics of CTVAE in the\nlatent space and in the reconstruction representation to demonstrate its\nefficacy compared with current well-known methods.\n","authors":["Phai Vu Dinh","Quang Uy Nguyen","Dinh Thai Hoang","Diep N. Nguyen","Son Pham Bao","Eryk Dutkiewicz"],"pdf_url":"https://arxiv.org/pdf/2312.02490v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01239v2","updated":"2023-12-05T04:14:18Z","published":"2023-12-02T22:25:24Z","title":"Motion Informed Needle Segmentation in Ultrasound Images","summary":" Segmenting a moving needle in ultrasound images is challenging due to the\npresence of artifacts, noise, and needle occlusion. This task becomes even more\ndemanding in scenarios where data availability is limited. Convolutional Neural\nNetworks (CNNs) have been successful in many computer vision applications, but\nstruggle to accurately segment needles without considering their motion. In\nthis paper, we present a novel approach for needle segmentation that combines\nclassical Kalman Filter (KF) techniques with data-driven learning,\nincorporating both needle features and needle motion. Our method offers three key\ncontributions. First, we propose a compatible framework that seamlessly\nintegrates into commonly used encoder-decoder style architectures. Second, we\ndemonstrate superior performance compared to recent state-of-the-art needle\nsegmentation models using our novel convolutional neural network (CNN) based\nKF-inspired block, achieving a 15\% reduction in pixel-wise needle tip error\nand an 8\% reduction in length error. Third, to our knowledge we are the first\nto implement a learnable filter to incorporate non-linear needle motion for\nimproving needle segmentation.\n","authors":["Raghavv Goel","Cecilia Morales","Manpreet Singh","Artur Dubrawski","John Galeotti","Howie Choset"],"pdf_url":"https://arxiv.org/pdf/2312.01239v2.pdf","comment":"7 pages, 4 figures, under review"},{"id":"http://arxiv.org/abs/2312.02478v1","updated":"2023-12-05T04:06:09Z","published":"2023-12-05T04:06:09Z","title":"RL-Based Cargo-UAV Trajectory Planning and Cell Association for Minimum\n Handoffs, Disconnectivity, and Energy Consumption","summary":" Unmanned aerial vehicle (UAV) is a promising technology for last-mile cargo\ndelivery. However, the limited on-board battery capacity, cellular\nunreliability, and frequent handoffs in the airspace are the main obstacles to\nunleash its full potential. 
Given that existing cellular networks were\nprimarily designed to service ground users, re-utilizing the same architecture\nfor highly mobile aerial users, e.g., cargo-UAVs, is deemed challenging.\nIndeed, to ensure a safe delivery using cargo-UAVs, it is crucial to utilize\nthe available energy efficiently, while guaranteeing reliable connectivity for\ncommand-and-control and avoiding frequent handoff. To achieve this goal, we\npropose a novel approach for joint cargo-UAV trajectory planning and cell\nassociation. Specifically, we formulate the cargo-UAV mission as a\nmulti-objective problem aiming to 1) minimize energy consumption, 2) reduce\nhandoff events, and 3) guarantee cellular reliability along the trajectory. We\nleverage reinforcement learning (RL) to jointly optimize the cargo-UAV's\ntrajectory and cell association. Simulation results demonstrate a performance\nimprovement of our proposed method, in terms of handoffs, disconnectivity, and\nenergy consumption, compared to benchmarks.\n","authors":["Nesrine Cherif","Wael Jaafar","Halim Yanikomeroglu","Abbas Yongacoglu"],"pdf_url":"https://arxiv.org/pdf/2312.02478v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02473v1","updated":"2023-12-05T03:58:05Z","published":"2023-12-05T03:58:05Z","title":"NeutronStream: A Dynamic GNN Training Framework with Sliding Window for\n Graph Streams","summary":" Existing Graph Neural Network (GNN) training frameworks have been designed to\nhelp developers easily create performant GNN implementations. However, most\nexisting GNN frameworks assume that the input graphs are static, but ignore\nthat most real-world graphs are constantly evolving. Though many dynamic GNN\nmodels have emerged to learn from evolving graphs, the training process of\nthese dynamic GNNs is dramatically different from traditional GNNs in that it\ncaptures both the spatial and temporal dependencies of graph updates. This\nposes new challenges for designing dynamic GNN training frameworks. First, the\ntraditional batched training method fails to capture real-time structural\nevolution information. Second, the time-dependent nature makes parallel\ntraining hard to design. Third, it lacks system supports for users to\nefficiently implement dynamic GNNs. In this paper, we present NeutronStream, a\nframework for training dynamic GNN models. NeutronStream abstracts the input\ndynamic graph into a chronologically updated stream of events and processes the\nstream with an optimized sliding window to incrementally capture the\nspatial-temporal dependencies of events. Furthermore, NeutronStream provides a\nparallel execution engine to tackle the sequential event processing challenge\nto achieve high performance. NeutronStream also integrates a built-in graph\nstorage structure that supports dynamic updates and provides a set of\neasy-to-use APIs that allow users to express their dynamic GNNs. 
Our\nexperimental results demonstrate that, compared to state-of-the-art dynamic GNN\nimplementations, NeutronStream achieves speedups ranging from 1.48X to 5.87X\nand an average accuracy improvement of 3.97%.\n","authors":["Chaoyi Chen","Dechao Gao","Yanfeng Zhang","Qiange Wang","Zhenbo Fu","Xuecang Zhang","Junhua Zhu","Yu Gu","Ge Yu"],"pdf_url":"https://arxiv.org/pdf/2312.02473v1.pdf","comment":"12 pages, 15 figures"},{"id":"http://arxiv.org/abs/2312.02471v1","updated":"2023-12-05T03:46:30Z","published":"2023-12-05T03:46:30Z","title":"Congestion-aware Distributed Task Offloading in Wireless Multi-hop\n Networks Using Graph Neural Networks","summary":" Computational offloading has become an enabling component for edge\nintelligence in mobile and smart devices. Existing offloading schemes mainly\nfocus on mobile devices and servers, while ignoring the potential network\ncongestion caused by tasks from multiple mobile devices, especially in wireless\nmulti-hop networks. To fill this gap, we propose a low-overhead,\ncongestion-aware distributed task offloading scheme by augmenting a distributed\ngreedy framework with graph-based machine learning. In simulated wireless\nmulti-hop networks with 20-110 nodes and a resource allocation scheme based on\nshortest path routing and contention-based link scheduling, our approach is\ndemonstrated to be effective in reducing congestion or unstable queues under\nthe context-agnostic baseline, while improving the execution latency over local\ncomputing.\n","authors":["Zhongyuan Zhao","Jake Perazzone","Gunjan Verma","Santiago Segarra"],"pdf_url":"https://arxiv.org/pdf/2312.02471v1.pdf","comment":"5 pages, 5 figures, submitted to IEEE ICASSP 2024"},{"id":"http://arxiv.org/abs/2312.02470v1","updated":"2023-12-05T03:41:17Z","published":"2023-12-05T03:41:17Z","title":"Generator Born from Classifier","summary":" In this paper, we make a bold attempt toward an ambitious task: given a\npre-trained classifier, we aim to reconstruct an image generator, without\nrelying on any data samples. From a black-box perspective, this challenge seems\nintractable, since it inevitably involves identifying the inverse function for\na classifier, which is, by nature, an information extraction process. As such,\nwe resort to leveraging the knowledge encapsulated within the parameters of the\nneural network. Grounded on the theory of Maximum-Margin Bias of gradient\ndescent, we propose a novel learning paradigm, in which the generator is\ntrained to ensure that the convergence conditions of the network parameters are\nsatisfied over the generated distribution of the samples. Empirical validation\nfrom various image generation tasks substantiates the efficacy of our strategy.\n","authors":["Runpeng Yu","Xinchao Wang"],"pdf_url":"https://arxiv.org/pdf/2312.02470v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02469v1","updated":"2023-12-05T03:39:54Z","published":"2023-12-05T03:39:54Z","title":"Learning Energy-based Model via Dual-MCMC Teaching","summary":" This paper studies the fundamental learning problem of the energy-based model\n(EBM). Learning the EBM can be achieved using the maximum likelihood estimation\n(MLE), which typically involves the Markov Chain Monte Carlo (MCMC) sampling,\nsuch as the Langevin dynamics. However, the noise-initialized Langevin dynamics\ncan be challenging in practice and hard to mix. This motivates the exploration\nof joint training with the generator model where the generator model serves as\na complementary model to bypass MCMC sampling. 
However, such a method can be\nless accurate than the MCMC and result in biased EBM learning. While the\ngenerator can also serve as an initializer model for better MCMC sampling, its\nlearning can be biased since it only matches the EBM and has no access to\nempirical training examples. Such biased generator learning may limit the\npotential of learning the EBM. To address this issue, we present a joint\nlearning framework that interweaves the maximum likelihood learning algorithm\nfor both the EBM and the complementary generator model. In particular, the\ngenerator model is learned by MLE to match both the EBM and the empirical data\ndistribution, making it a more informative initializer for MCMC sampling of\nEBM. Learning generator with observed examples typically requires inference of\nthe generator posterior. To ensure accurate and efficient inference, we adopt\nthe MCMC posterior sampling and introduce a complementary inference model to\ninitialize such latent MCMC sampling. We show that three separate models can be\nseamlessly integrated into our joint framework through two (dual-) MCMC\nteaching, enabling effective and efficient EBM learning.\n","authors":["Jiali Cui","Tian Han"],"pdf_url":"https://arxiv.org/pdf/2312.02469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02462v1","updated":"2023-12-05T03:25:45Z","published":"2023-12-05T03:25:45Z","title":"Dimensionality Reduction and Dynamical Mode Recognition of Circular\n Arrays of Flame Oscillators Using Deep Neural Network","summary":" Oscillatory combustion in aero engines and modern gas turbines often has\nsignificant adverse effects on their operation, and accurately recognizing\nvarious oscillation modes is the prerequisite for understanding and controlling\ncombustion instability. However, the high-dimensional spatial-temporal data of\na complex combustion system typically poses considerable challenges to the\ndynamical mode recognition. Based on a two-layer bidirectional long short-term\nmemory variational autoencoder (Bi-LSTM-VAE) dimensionality reduction model and\na two-dimensional Wasserstein distance-based classifier (WDC), this study\nproposes a promising method (Bi-LSTM-VAE-WDC) for recognizing dynamical modes\nin oscillatory combustion systems. Specifically, the Bi-LSTM-VAE dimension\nreduction model was introduced to reduce the high-dimensional spatial-temporal\ndata of the combustion system to a low-dimensional phase space; Gaussian kernel\ndensity estimates (GKDE) were computed based on the distribution of phase\npoints in a grid; two-dimensional WD values were calculated from the GKDE maps\nto recognize the oscillation modes. The time-series data used in this study\nwere obtained from numerical simulations of circular arrays of laminar flame\noscillators. The results show that the novel Bi-LSTM-VAE method can produce a\nnon-overlapping distribution of phase points, indicating an effective\nunsupervised mode recognition and classification. 
Furthermore, the present\nmethod exhibits a more prominent performance than VAE and PCA (principal\ncomponent analysis) for distinguishing dynamical modes in complex flame\nsystems, implying its potential in studying turbulent combustion.\n","authors":["Weiming Xu","Tao Yang","Peng Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.02462v1.pdf","comment":"17 pages, 10 figures, research paper"},{"id":"http://arxiv.org/abs/2312.02450v1","updated":"2023-12-05T03:03:54Z","published":"2023-12-05T03:03:54Z","title":"GIT-Net: Generalized Integral Transform for Operator Learning","summary":" This article introduces GIT-Net, a deep neural network architecture for\napproximating Partial Differential Equation (PDE) operators, inspired by\nintegral transform operators. GIT-NET harnesses the fact that differential\noperators commonly used for defining PDEs can often be represented\nparsimoniously when expressed in specialized functional bases (e.g., Fourier\nbasis). Unlike rigid integral transforms, GIT-Net parametrizes adaptive\ngeneralized integral transforms with deep neural networks. When compared to\nseveral recently proposed alternatives, GIT-Net's computational and memory\nrequirements scale gracefully with mesh discretizations, facilitating its\napplication to PDE problems on complex geometries. Numerical experiments\ndemonstrate that GIT-Net is a competitive neural network operator, exhibiting\nsmall test errors and low evaluations across a range of PDE problems. This\nstands in contrast to existing neural network operators, which typically excel\nin just one of these areas.\n","authors":["Chao Wang","Alexandre Hoang Thiery"],"pdf_url":"https://arxiv.org/pdf/2312.02450v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02438v1","updated":"2023-12-05T02:38:04Z","published":"2023-12-05T02:38:04Z","title":"Adaptive Instrument Design for Indirect Experiments","summary":" Indirect experiments provide a valuable framework for estimating treatment\neffects in situations where conducting randomized control trials (RCTs) is\nimpractical or unethical. Unlike RCTs, indirect experiments estimate treatment\neffects by leveraging (conditional) instrumental variables, enabling estimation\nthrough encouragement and recommendation rather than strict treatment\nassignment. However, the sample efficiency of such estimators depends not only\non the inherent variability in outcomes but also on the varying compliance\nlevels of users with the instrumental variables and the choice of estimator\nbeing used, especially when dealing with numerous instrumental variables. While\nadaptive experiment design has a rich literature for direct experiments, in\nthis paper we take the initial steps towards enhancing sample efficiency for\nindirect experiments by adaptively designing a data collection policy over\ninstrumental variables. Our main contribution is a practical computational\nprocedure that utilizes influence functions to search for an optimal data\ncollection policy, minimizing the mean-squared error of the desired\n(non-linear) estimator. 
Through experiments conducted in various domains\ninspired by real-world applications, we showcase how our method can\nsignificantly improve the sample efficiency of indirect experiments.\n","authors":["Yash Chandak","Shiv Shankar","Vasilis Syrgkanis","Emma Brunskill"],"pdf_url":"https://arxiv.org/pdf/2312.02438v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02429v1","updated":"2023-12-05T02:08:48Z","published":"2023-12-05T02:08:48Z","title":"PEFA: Parameter-Free Adapters for Large-scale Embedding-based Retrieval\n Models","summary":" Embedding-based Retrieval Models (ERMs) have emerged as a promising framework\nfor large-scale text retrieval problems due to powerful large language models.\nNevertheless, fine-tuning ERMs to reach state-of-the-art results can be\nexpensive due to the extreme scale of data as well as the complexity of\nmulti-stage pipelines (e.g., pre-training, fine-tuning, distillation). In this\nwork, we propose the PEFA framework, namely ParamEter-Free Adapters, for fast\ntuning of ERMs without any backward pass in the optimization. At index building\nstage, PEFA equips the ERM with a non-parametric k-nearest neighbor (kNN)\ncomponent. At inference stage, PEFA performs a convex combination of two\nscoring functions, one from the ERM and the other from the kNN. Based on the\nneighborhood definition, PEFA framework induces two realizations, namely\nPEFA-XL (i.e., extra large) using double ANN indices and PEFA-XS (i.e., extra\nsmall) using a single ANN index. Empirically, PEFA achieves significant\nimprovement on two retrieval applications. For document retrieval, regarding\nRecall@100 metric, PEFA improves not only pre-trained ERMs on Trivia-QA by an\naverage of 13.2%, but also fine-tuned ERMs on NQ-320K by an average of 5.5%,\nrespectively. For product search, PEFA improves the Recall@100 of the\nfine-tuned ERMs by an average of 5.3% and 14.5%, for PEFA-XS and PEFA-XL,\nrespectively. Our code is available at https://github.com/amzn/pecos/tree/mainline/examples/pefa-wsdm24\n","authors":["Wei-Cheng Chang","Jyun-Yu Jiang","Jiong Zhang","Mutasem Al-Darabsah","Choon Hui Teo","Cho-Jui Hsieh","Hsiang-Fu Yu","S. V. N. Vishwanathan"],"pdf_url":"https://arxiv.org/pdf/2312.02429v1.pdf","comment":"Accept by WSDM 2024"},{"id":"http://arxiv.org/abs/2311.11905v2","updated":"2023-12-05T01:50:27Z","published":"2023-11-20T16:38:45Z","title":"Real-Time Surface-to-Air Missile Engagement Zone Prediction Using\n Simulation and Machine Learning","summary":" Surface-to-Air Missiles (SAMs) are crucial in modern air defense systems. A\ncritical aspect of their effectiveness is the Engagement Zone (EZ), the spatial\nregion within which a SAM can effectively engage and neutralize a target.\nNotably, the EZ is intrinsically related to the missile's maximum range; it\ndefines the furthest distance at which a missile can intercept a target. The\naccurate computation of this EZ is essential but challenging due to the dynamic\nand complex factors involved, which often lead to high computational costs and\nextended processing times when using conventional simulation methods. In light\nof these challenges, our study investigates the potential of machine learning\ntechniques, proposing an approach that integrates machine learning with a\ncustom-designed simulation tool to train supervised algorithms. We leverage a\ncomprehensive dataset of pre-computed SAM EZ simulations, enabling our model to\naccurately predict the SAM EZ for new input parameters. 
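For the PEFA entry above (arXiv:2312.02429), the inference-time scoring the abstract describes, a convex combination of the embedding model's score and a non-parametric kNN score, can be sketched as below; the cosine-similarity index over training queries, the voting rule, and the mixing weight are assumptions for illustration, not the released implementation at the linked repository.

import numpy as np

def pefa_style_scores(query_emb, doc_embs, train_query_embs, train_labels, k=16, lam=0.5):
    # ERM score: similarity between the query and every document embedding (assumed normalized)
    erm = doc_embs @ query_emb
    # kNN score: find the k most similar training queries, then vote their
    # relevant documents with similarity weights
    sims = train_query_embs @ query_emb
    nn_idx = np.argsort(-sims)[:k]
    knn = np.zeros(doc_embs.shape[0])
    for i in nn_idx:
        for doc in train_labels[i]:   # train_labels[i]: doc indices marked relevant for training query i (assumed)
            knn[doc] += sims[i]
    # convex combination of the two scoring functions
    return lam * erm + (1.0 - lam) * knn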
It accelerates SAM EZ\nsimulations, enhances air defense strategic planning, and provides real-time\ninsights, improving SAM system performance. The study also includes a\ncomparative analysis of machine learning algorithms, illuminating their\ncapabilities and performance metrics and suggesting areas for future research,\nhighlighting the transformative potential of machine learning in SAM EZ\nsimulations.\n","authors":["Joao P. A. Dantas","Diego Geraldo","Felipe L. L. Medeiros","Marcos R. O. A. Maximo","Takashi Yoneyama"],"pdf_url":"https://arxiv.org/pdf/2311.11905v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.16934v2","updated":"2023-12-05T01:24:29Z","published":"2022-11-30T12:09:40Z","title":"VideoDubber: Machine Translation with Speech-Aware Length Control for\n Video Dubbing","summary":" Video dubbing aims to translate the original speech in a film or television\nprogram into the speech in a target language, which can be achieved with a\ncascaded system consisting of speech recognition, machine translation and\nspeech synthesis. To ensure the translated speech to be well aligned with the\ncorresponding video, the length/duration of the translated speech should be as\nclose as possible to that of the original speech, which requires strict length\ncontrol. Previous works usually control the number of words or characters\ngenerated by the machine translation model to be similar to the source\nsentence, without considering the isochronicity of speech as the speech\nduration of words/characters in different languages varies. In this paper, we\npropose a machine translation system tailored for the task of video dubbing,\nwhich directly considers the speech duration of each token in translation, to\nmatch the length of source and target speech. Specifically, we control the\nspeech length of generated sentence by guiding the prediction of each word with\nthe duration information, including the speech duration of itself as well as\nhow much duration is left for the remaining words. We design experiments on\nfour language directions (German -> English, Spanish -> English, Chinese <->\nEnglish), and the results show that the proposed method achieves better length\ncontrol ability on the generated speech than baseline methods. To make up the\nlack of real-world datasets, we also construct a real-world test set collected\nfrom films to provide comprehensive evaluations on the video dubbing task.\n","authors":["Yihan Wu","Junliang Guo","Xu Tan","Chen Zhang","Bohan Li","Ruihua Song","Lei He","Sheng Zhao","Arul Menezes","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2211.16934v2.pdf","comment":"AAAI 2023 camera version"},{"id":"http://arxiv.org/abs/2312.02418v1","updated":"2023-12-05T01:19:30Z","published":"2023-12-05T01:19:30Z","title":"Decoding Data Quality via Synthetic Corruptions: Embedding-guided\n Pruning of Code Data","summary":" Code datasets, often collected from diverse and uncontrolled sources such as\nGitHub, potentially suffer from quality issues, thereby affecting the\nperformance and training efficiency of Large Language Models (LLMs) optimized\nfor code generation. Previous studies demonstrated the benefit of using\nembedding spaces for data pruning, but they mainly focused on duplicate removal\nor increasing variety, and in other modalities, such as images. Our work\nfocuses on using embeddings to identify and remove \"low-quality\" code data.\nFirst, we explore features of \"low-quality\" code in embedding space, through\nthe use of synthetic corruptions. 
Armed with this knowledge, we devise novel\npruning metrics that operate in embedding space to identify and remove\nlow-quality entries in the Stack dataset. We demonstrate the benefits of this\nsynthetic corruption informed pruning (SCIP) approach on the well-established\nHumanEval and MBPP benchmarks, outperforming existing embedding-based methods.\nImportantly, we achieve up to a 3% performance improvement over no pruning,\nthereby showing the promise of insights from synthetic corruptions for data\npruning.\n","authors":["Yu Yang","Aaditya K. Singh","Mostafa Elhoushi","Anas Mahmoud","Kushal Tirumala","Fabian Gloeckle","Baptiste Rozière","Carole-Jean Wu","Ari S. Morcos","Newsha Ardalani"],"pdf_url":"https://arxiv.org/pdf/2312.02418v1.pdf","comment":"12 pages, 4 figures, Oral Presentation at 3rd Workshop on Efficient\n Natural Language and Speech Processing (ENLSP-III), NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.02416v1","updated":"2023-12-05T01:12:56Z","published":"2023-12-05T01:12:56Z","title":"Towards Fast and Stable Federated Learning: Confronting Heterogeneity\n via Knowledge Anchor","summary":" Federated learning encounters a critical challenge of data heterogeneity,\nadversely affecting the performance and convergence of the federated model.\nVarious approaches have been proposed to address this issue, yet their\neffectiveness is still limited. Recent studies have revealed that the federated\nmodel suffers severe forgetting in local training, leading to global forgetting\nand performance degradation. Although the analysis provides valuable insights,\na comprehensive understanding of the vulnerable classes and their impact\nfactors is yet to be established. In this paper, we aim to bridge this gap by\nsystematically analyzing the forgetting degree of each class during local\ntraining across different communication rounds. Our observations are: (1) Both\nmissing and non-dominant classes suffer similar severe forgetting during local\ntraining, while dominant classes show improvement in performance. (2) When\ndynamically reducing the sample size of a dominant class, catastrophic\nforgetting occurs abruptly when the proportion of its samples is below a\ncertain threshold, indicating that the local model struggles to leverage a few\nsamples of a specific class effectively to prevent forgetting. Motivated by\nthese findings, we propose a novel and straightforward algorithm called\nFederated Knowledge Anchor (FedKA). Assuming that all clients have a single\nshared sample for each class, the knowledge anchor is constructed before each\nlocal training stage by extracting shared samples for missing classes and\nrandomly selecting one sample per class for non-dominant classes. The knowledge\nanchor is then utilized to correct the gradient of each mini-batch towards the\ndirection of preserving the knowledge of the missing and non-dominant classes.\nExtensive experimental results demonstrate that our proposed FedKA achieves\nfast and stable convergence, significantly improving accuracy on popular\nbenchmarks.\n","authors":["Jinqian Chen","Jihua Zhu","Qinghai Zheng"],"pdf_url":"https://arxiv.org/pdf/2312.02416v1.pdf","comment":"Published in ACM MM23"},{"id":"http://arxiv.org/abs/2312.02407v1","updated":"2023-12-05T00:46:29Z","published":"2023-12-05T00:46:29Z","title":"Robust Clustering using Hyperdimensional Computing","summary":" This paper addresses the clustering of data in the hyperdimensional computing\n(HDC) domain. 
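A minimal sketch of the gradient-correction idea in the FedKA abstract above, assuming PyTorch and reading the correction as simply adding the loss on the knowledge-anchor batch with an assumed weight `beta`; the actual FedKA rule may differ.

```python
import torch

def local_step_with_anchor(model, loss_fn, optimizer, batch, anchor_batch, beta=1.0):
    """One local training step whose gradient is steered toward preserving the
    knowledge of missing/non-dominant classes via an anchor batch of shared samples.
    beta is an assumed weighting, not a constant from the paper."""
    x, y = batch
    ax, ay = anchor_batch
    optimizer.zero_grad()
    loss = loss_fn(model(x), y) + beta * loss_fn(model(ax), ay)
    loss.backward()
    optimizer.step()
    return loss.item()
```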
In prior work, an HDC-based clustering framework, referred to as\nHDCluster, has been proposed. However, the performance of the existing\nHDCluster is not robust. The performance of HDCluster is degraded as the\nhypervectors for the clusters are chosen at random during the initialization\nstep. To overcome this bottleneck, we assign the initial cluster hypervectors\nby exploring the similarity of the encoded data, referred to as \\textit{query}\nhypervectors. Intra-cluster hypervectors have a higher similarity than\ninter-cluster hypervectors. Harnessing the similarity results among query\nhypervectors, this paper proposes four HDC-based clustering algorithms:\nsimilarity-based k-means, equal bin-width histogram, equal bin-height\nhistogram, and similarity-based affinity propagation. Experimental results\nillustrate that: (i) Compared to the existing HDCluster, our proposed HDC-based\nclustering algorithms can achieve better accuracy, more robust performance,\nfewer iterations, and less execution time. Similarity-based affinity\npropagation outperforms the other three HDC-based clustering algorithms on\neight datasets by 2~38% in clustering accuracy. (ii) Even for one-pass\nclustering, i.e., without any iterative update of the cluster hypervectors, our\nproposed algorithms can provide more robust clustering accuracy than HDCluster.\n(iii) Over eight datasets, five out of eight can achieve higher or comparable\naccuracy when projected onto the hyperdimensional space. Traditional clustering\nis more desirable than HDC when the number of clusters, $k$, is large.\n","authors":["Lulu Ge","Keshab K. Parhi"],"pdf_url":"https://arxiv.org/pdf/2312.02407v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02406v1","updated":"2023-12-05T00:42:35Z","published":"2023-12-05T00:42:35Z","title":"Efficient Online Data Mixing For Language Model Pre-Training","summary":" The data used to pretrain large language models has a decisive impact on a\nmodel's downstream performance, which has led to a large body of work on data\nselection methods that aim to automatically determine the most suitable data to\nuse for pretraining. Existing data selection methods suffer from slow and\ncomputationally expensive processes, a problem amplified by the increasing size\nof models and of pretraining datasets. Data mixing, on the other hand, reduces\nthe complexity of data selection by grouping data points together and\ndetermining sampling probabilities across entire groups. However, data mixing\nproportions are typically fixed before training and therefore cannot adapt to\nchanging training dynamics. To address these limitations, we develop an\nefficient algorithm for Online Data Mixing (ODM) that combines elements from\nboth data selection and data mixing. 
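A rough sketch of similarity-aware initialization of cluster hypervectors, as motivated in the HDC clustering abstract above; the farthest-point selection rule and dot-product similarity are assumptions for illustration, not the exact procedure of any of the four proposed algorithms.

```python
import numpy as np

def init_cluster_hypervectors(query_hvs: np.ndarray, k: int, seed: int = 0) -> np.ndarray:
    """Pick k encoded query hypervectors that are mutually dissimilar to serve as
    initial cluster hypervectors, instead of choosing them at random as in HDCluster."""
    rng = np.random.default_rng(seed)
    centers = [int(rng.integers(len(query_hvs)))]
    sims = query_hvs @ query_hvs[centers[0]]          # similarity of every query to the chosen center
    for _ in range(1, k):
        centers.append(int(np.argmin(sims)))          # least similar to all centers chosen so far
        sims = np.maximum(sims, query_hvs @ query_hvs[centers[-1]])
    return query_hvs[centers].copy()
```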
Based on multi-armed bandit algorithms,\nour online approach optimizes the data mixing proportions during training.\nRemarkably, our method trains a model that reaches the final perplexity of the\nnext best method with 19\\% fewer training iterations, and improves performance\non the 5-shot MMLU benchmark by 1.9% relative accuracy, while adding negligible\nwall-clock time during pretraining.\n","authors":["Alon Albalak","Liangming Pan","Colin Raffel","William Yang Wang"],"pdf_url":"https://arxiv.org/pdf/2312.02406v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.10191v3","updated":"2023-12-05T00:24:56Z","published":"2023-06-16T21:53:16Z","title":"Neural Priming for Sample-Efficient Adaptation","summary":" We propose Neural Priming, a technique for adapting large pretrained models\nto distribution shifts and downstream tasks given few or no labeled examples.\nPresented with class names or unlabeled test samples, Neural Priming enables\nthe model to recall and conditions its parameters on relevant data seen\nthroughout pretraining, thereby priming it for the test distribution. Neural\nPriming can be performed at test time, even for pretraining datasets as large\nas LAION-2B. Performing lightweight updates on the recalled data significantly\nimproves accuracy across a variety of distribution shift and transfer learning\nbenchmarks. Concretely, in the zero-shot setting, we see a 2.45% improvement in\naccuracy on ImageNet and 3.81% accuracy improvement on average across standard\ntransfer learning benchmarks. Further, using Neural Priming at inference to\nadapt to distribution shift, we see a 1.41% accuracy improvement on ImageNetV2.\nThese results demonstrate the effectiveness of Neural Priming in addressing the\nchallenge of limited labeled data and changing distributions. Code is available\nat github.com/RAIVNLab/neural-priming.\n","authors":["Matthew Wallingford","Vivek Ramanujan","Alex Fang","Aditya Kusupati","Roozbeh Mottaghi","Aniruddha Kembhavi","Ludwig Schmidt","Ali Farhadi"],"pdf_url":"https://arxiv.org/pdf/2306.10191v3.pdf","comment":"18 pages, 7 figures, 9 tables"},{"id":"http://arxiv.org/abs/2301.03150v4","updated":"2023-12-05T00:24:44Z","published":"2023-01-09T02:42:39Z","title":"MOTOR: A Time-To-Event Foundation Model For Structured Medical Records","summary":" We present a self-supervised, time-to-event (TTE) foundation model called\nMOTOR (Many Outcome Time Oriented Representations) which is pretrained on\ntimestamped sequences of events in electronic health records (EHR) and health\ninsurance claims. TTE models are used for estimating the probability\ndistribution of the time until a specific event occurs, which is an important\ntask in medical settings. TTE models provide many advantages over\nclassification using fixed time horizons, including naturally handling censored\nobservations, but are challenging to train with limited labeled data. MOTOR\naddresses this challenge by pretraining on up to 55M patient records (9B\nclinical events). We evaluate MOTOR's transfer learning performance on 19\ntasks, across 3 patient databases (a private EHR system, MIMIC-IV, and Merative\nclaims data). Task-specific models adapted from MOTOR improve time-dependent C\nstatistics by 4.6% over state-of-the-art, improve label efficiency by up to 95%\n,and are more robust to temporal distributional shifts. 
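The Online Data Mixing abstract above describes adapting mixing proportions with a multi-armed bandit during training; below is a generic EXP3-style sketch under assumed hyperparameters (the paper's exact reward signal and update are not reproduced here).

```python
import numpy as np

class OnlineDataMixer:
    """Bandit over data groups: sampling probabilities adapt to a per-group reward,
    e.g. the training loss observed on batches drawn from that group."""

    def __init__(self, n_groups: int, lr: float = 0.1, eps: float = 0.05, seed: int = 0):
        self.weights = np.zeros(n_groups)
        self.lr, self.eps = lr, eps
        self.rng = np.random.default_rng(seed)

    def probs(self) -> np.ndarray:
        p = np.exp(self.weights - self.weights.max())
        p /= p.sum()
        return (1 - self.eps) * p + self.eps / len(p)   # mix in uniform exploration

    def sample_group(self) -> int:
        return int(self.rng.choice(len(self.weights), p=self.probs()))

    def update(self, group: int, reward: float) -> None:
        p = self.probs()[group]
        self.weights[group] += self.lr * reward / p     # importance-weighted update
```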
We further evaluate\ncross-site portability by adapting our MOTOR foundation model for six\nprediction tasks on the MIMIC-IV dataset, where it outperforms all baselines.\nMOTOR is the first foundation model for medical TTE predictions and we release\na 143M parameter pretrained model for research use at [redacted URL].\n","authors":["Ethan Steinberg","Jason Fries","Yizhe Xu","Nigam Shah"],"pdf_url":"https://arxiv.org/pdf/2301.03150v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02401v1","updated":"2023-12-05T00:11:09Z","published":"2023-12-05T00:11:09Z","title":"Harmonizing Global Voices: Culturally-Aware Models for Enhanced Content\n Moderation","summary":" Content moderation at scale faces the challenge of considering local cultural\ndistinctions when assessing content. While global policies aim to maintain\ndecision-making consistency and prevent arbitrary rule enforcement, they often\noverlook regional variations in interpreting natural language as expressed in\ncontent. In this study, we are looking into how moderation systems can tackle\nthis issue by adapting to local comprehension nuances. We train large language\nmodels on extensive datasets of media news and articles to create culturally\nattuned models. The latter aim to capture the nuances of communication across\ngeographies with the goal of recognizing cultural and societal variations in\nwhat is considered offensive content. We further explore the capability of\nthese models to generate explanations for instances of content violation,\naiming to shed light on how policy guidelines are perceived when cultural and\nsocietal contexts change. We find that training on extensive media datasets\nsuccessfully induced cultural awareness and resulted in improvements in\nhandling content violations on a regional basis. Additionally, these\nadvancements include the ability to provide explanations that align with the\nspecific local norms and nuances as evidenced by the annotators' preference in\nour conducted study. This multifaceted success reinforces the critical role of\nan adaptable content moderation approach in keeping pace with the ever-evolving\nnature of the content it oversees.\n","authors":["Alex J. Chan","José Luis Redondo García","Fabrizio Silvestri","Colm O'Donnel","Konstantina Palla"],"pdf_url":"https://arxiv.org/pdf/2312.02401v1.pdf","comment":"12 pages, 8 Figures. Supplementary material"},{"id":"http://arxiv.org/abs/2312.02400v1","updated":"2023-12-05T00:09:57Z","published":"2023-12-05T00:09:57Z","title":"Auto DP-SGD: Dual Improvements of Privacy and Accuracy via Automatic\n Clipping Threshold and Noise Multiplier Estimation","summary":" DP-SGD has emerged as a popular method to protect personally identifiable\ninformation in deep learning applications. Unfortunately, DP-SGD's per-sample\ngradient clipping and uniform noise addition during training can significantly\ndegrade model utility. To enhance the model's utility, researchers proposed\nvarious adaptive DP-SGD methods. However, we examine and discover that these\ntechniques result in greater privacy leakage or lower accuracy than the\ntraditional DP-SGD method, or a lack of evaluation on a complex data set such\nas CIFAR100. To address these limitations, we propose an Auto DP-SGD. Our\nmethod automates clipping threshold estimation based on the DL model's gradient\nnorm and scales the gradients of each training sample without losing gradient\ninformation. This helps to improve the algorithm's utility while using a less\nprivacy budget. 
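A simplified sketch of the automatic clipping portion of the Auto DP-SGD abstract above: the clipping threshold is estimated from the batch's per-sample gradient norms (a quantile here, an assumed choice) before the usual DP-SGD noise addition; the noise-multiplier decay described in the continuation is omitted.

```python
import numpy as np

def auto_clip_and_noise(per_sample_grads: np.ndarray, noise_multiplier: float,
                        quantile: float = 0.5) -> np.ndarray:
    """Estimate the clipping threshold C from observed gradient norms, rescale each
    per-sample gradient, then add Gaussian noise as in DP-SGD. Illustrative only."""
    norms = np.linalg.norm(per_sample_grads, axis=1)
    C = np.quantile(norms, quantile)                       # automatic threshold estimate
    scaled = per_sample_grads * (C / np.maximum(norms, C))[:, None]
    mean_grad = scaled.mean(axis=0)
    noise = np.random.normal(0.0, noise_multiplier * C / len(per_sample_grads),
                             size=mean_grad.shape)
    return mean_grad + noise
```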
To further improve accuracy, we introduce automatic noise\nmultiplier decay mechanisms to decrease the noise multiplier after every epoch.\nFinally, we develop closed-form mathematical expressions using tCDP accountant\nfor automatic noise multiplier and automatic clipping threshold estimation.\nThrough extensive experimentation, we demonstrate that Auto DP-SGD outperforms\nexisting SOTA DP-SGD methods in privacy and accuracy on various benchmark\ndatasets. We also show that privacy can be improved by lowering the scale\nfactor and using learning rate schedulers without significantly reducing\naccuracy. Specifically, Auto DP-SGD, when used with a step noise multiplier,\nimproves accuracy by 3.20, 1.57, 6.73, and 1.42 for the MNIST, CIFAR10,\nCIFAR100, and AG News Corpus datasets, respectively. Furthermore, it obtains a\nsubstantial reduction in the privacy budget of 94.9, 79.16, 67.36, and 53.37\nfor the corresponding data sets.\n","authors":["Sai Venkatesh Chilukoti","Md Imran Hossen","Liqun Shan","Vijay Srinivas Tida","Xiai Hei"],"pdf_url":"https://arxiv.org/pdf/2312.02400v1.pdf","comment":"25 pages single column, 2 figures"},{"id":"http://arxiv.org/abs/2312.03187v1","updated":"2023-12-05T23:33:49Z","published":"2023-12-05T23:33:49Z","title":"FERGI: Automatic Annotation of User Preferences for Text-to-Image\n Generation from Spontaneous Facial Expression Reaction","summary":" Researchers have proposed to use data of human preference feedback to\nfine-tune text-to-image generative models. However, the scalability of human\nfeedback collection has been limited by its reliance on manual annotation.\nTherefore, we develop and test a method to automatically annotate user\npreferences from their spontaneous facial expression reaction to the generated\nimages. We collect a dataset of Facial Expression Reaction to Generated Images\n(FERGI) and show that the activations of multiple facial action units (AUs) are\nhighly correlated with user evaluations of the generated images. Specifically,\nAU4 (brow lowerer) is most consistently reflective of negative evaluations of\nthe generated image. This can be useful in two ways. Firstly, we can\nautomatically annotate user preferences between image pairs with substantial\ndifference in AU4 responses to them with an accuracy significantly\noutperforming state-of-the-art scoring models. Secondly, directly integrating\nthe AU4 responses with the scoring models improves their consistency with human\npreferences. Additionally, the AU4 response best reflects the user's evaluation\nof the image fidelity, making it complementary to the state-of-the-art scoring\nmodels, which are generally better at reflecting image-text alignment. Finally,\nthis method of automatic annotation with facial expression analysis can be\npotentially generalized to other generation tasks. The code is available at\nhttps://github.com/ShuangquanFeng/FERGI, and the dataset is also available at\nthe same link for research purposes.\n","authors":["Shuangquan Feng","Junhua Ma","Virginia R. 
de Sa"],"pdf_url":"https://arxiv.org/pdf/2312.03187v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03186v1","updated":"2023-12-05T23:32:48Z","published":"2023-12-05T23:32:48Z","title":"Data-Driven Traffic Reconstruction and Kernel Methods for Identifying\n Stop-and-Go Congestion","summary":" Identifying stop-and-go events (SAGs) in traffic flow presents an important\navenue for advancing data-driven research for climate change mitigation and\nsustainability, owing to their substantial impact on carbon emissions, travel\ntime, fuel consumption, and roadway safety. In fact, SAGs are estimated to\naccount for 33-50% of highway driving externalities. However, insufficient\nattention has been paid to precisely quantifying where, when, and how much\nthese SAGs take place -necessary for downstream decision making, such as\nintervention design and policy analysis. A key challenge is that the data\navailable to researchers and governments are typically sparse and aggregated to\na granularity that obscures SAGs. To overcome such data limitations, this study\nthus explores the use of traffic reconstruction techniques for SAG\nidentification. In particular, we introduce a kernel-based method for\nidentifying spatio-temporal features in traffic and leverage bootstrapping to\nquantify the uncertainty of the reconstruction process. Experimental results on\nCalifornia highway data demonstrate the promise of the method for capturing\nSAGs. This work contributes to a foundation for data-driven decision making to\nadvance sustainability of traffic systems.\n","authors":["Edgar Ramirez Sanchez","Shreyaa Raghavan","Cathy Wu"],"pdf_url":"https://arxiv.org/pdf/2312.03186v1.pdf","comment":"Presented at NeurIPS 2023 workshops: Tackling Climate Change with\n Machine Learning & Computational Sustainability"},{"id":"http://arxiv.org/abs/2203.01850v4","updated":"2023-12-05T23:28:57Z","published":"2022-03-03T16:58:54Z","title":"T-Cal: An optimal test for the calibration of predictive models","summary":" The prediction accuracy of machine learning methods is steadily increasing,\nbut the calibration of their uncertainty predictions poses a significant\nchallenge. Numerous works focus on obtaining well-calibrated predictive models,\nbut less is known about reliably assessing model calibration. This limits our\nability to know when algorithms for improving calibration have a real effect,\nand when their improvements are merely artifacts due to random noise in finite\ndatasets. In this work, we consider detecting mis-calibration of predictive\nmodels using a finite validation dataset as a hypothesis testing problem. The\nnull hypothesis is that the predictive model is calibrated, while the\nalternative hypothesis is that the deviation from calibration is sufficiently\nlarge.\n We find that detecting mis-calibration is only possible when the conditional\nprobabilities of the classes are sufficiently smooth functions of the\npredictions. When the conditional class probabilities are H\\\"older continuous,\nwe propose T-Cal, a minimax optimal test for calibration based on a debiased\nplug-in estimator of the $\\ell_2$-Expected Calibration Error (ECE). We further\npropose Adaptive T-Cal, a version that is adaptive to unknown smoothness. We\nverify our theoretical findings with a broad range of experiments, including\nwith several popular deep neural net architectures and several standard\npost-hoc calibration methods. 
T-Cal is a practical general-purpose tool, which\n-- combined with classical tests for discrete-valued predictors -- can be used\nto test the calibration of virtually any probabilistic classification method.\n","authors":["Donghwan Lee","Xinmeng Huang","Hamed Hassani","Edgar Dobriban"],"pdf_url":"https://arxiv.org/pdf/2203.01850v4.pdf","comment":"The implementation of T-Cal is available at\n https://github.com/dh7401/T-Cal"},{"id":"http://arxiv.org/abs/2312.03179v1","updated":"2023-12-05T23:05:36Z","published":"2023-12-05T23:05:36Z","title":"CaloQVAE : Simulating high-energy particle-calorimeter interactions\n using hybrid quantum-classical generative models","summary":" The Large Hadron Collider's high luminosity era presents major computational\nchallenges in the analysis of collision events. Large amounts of Monte Carlo\n(MC) simulation will be required to constrain the statistical uncertainties of\nthe simulated datasets below these of the experimental data. Modelling of\nhigh-energy particles propagating through the calorimeter section of the\ndetector is the most computationally intensive MC simulation task. We introduce\na technique combining recent advancements in generative models and quantum\nannealing for fast and efficient simulation of high-energy particle-calorimeter\ninteractions.\n","authors":["Sehmimul Hoque","Hao Jia","Abhishek Abhishek","Mojde Fadaie","J. Quetzalcoatl Toledo-Marín","Tiago Vale","Roger G. Melko","Maximilian Swiatlowski","Wojciech T. Fedorko"],"pdf_url":"https://arxiv.org/pdf/2312.03179v1.pdf","comment":"6 pages, 3 figures"},{"id":"http://arxiv.org/abs/2206.04979v3","updated":"2023-12-05T23:05:32Z","published":"2022-06-10T10:28:55Z","title":"Convolutional layers are equivariant to discrete shifts but not\n continuous translations","summary":" The purpose of this short and simple note is to clarify a common\nmisconception about convolutional neural networks (CNNs). CNNs are made up of\nconvolutional layers which are shift equivariant due to weight sharing.\nHowever, convolutional layers are not translation equivariant, even when\nboundary effects are ignored and when pooling and subsampling are absent. This\nis because shift equivariance is a discrete symmetry while translation\nequivariance is a continuous symmetry. This fact is well known among\nresearchers in equivariant machine learning, but is usually overlooked among\nnon-experts. To minimize confusion, we suggest using the term `shift\nequivariance' to refer to discrete shifts in pixels and `translation\nequivariance' to refer to continuous translations.\n","authors":["Nick McGreivy","Ammar Hakim"],"pdf_url":"https://arxiv.org/pdf/2206.04979v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03177v1","updated":"2023-12-05T22:53:05Z","published":"2023-12-05T22:53:05Z","title":"Using Curiosity for an Even Representation of Tasks in Continual Offline\n Reinforcement Learning","summary":" In this work, we investigate the means of using curiosity on replay buffers\nto improve offline multi-task continual reinforcement learning when tasks,\nwhich are defined by the non-stationarity in the environment, are non labeled\nand not evenly exposed to the learner in time. In particular, we investigate\nthe use of curiosity both as a tool for task boundary detection and as a\npriority metric when it comes to retaining old transition tuples, which we\nrespectively use to propose two different buffers. 
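For reference, the quantity the T-Cal abstract above builds on is the l2-Expected Calibration Error; a naive binned plug-in estimate is sketched below. The debiased estimator and the minimax test itself are omitted, so this is only the basic plug-in quantity.

```python
import numpy as np

def binned_l2_ece(confidences: np.ndarray, correct: np.ndarray, n_bins: int = 15) -> float:
    """Plain binned plug-in estimate of the squared (l2) Expected Calibration Error."""
    bins = np.linspace(0.0, 1.0, n_bins + 1)
    idx = np.clip(np.digitize(confidences, bins) - 1, 0, n_bins - 1)
    ece2 = 0.0
    for b in range(n_bins):
        mask = idx == b
        if mask.any():
            gap = confidences[mask].mean() - correct[mask].mean()
            ece2 += mask.mean() * gap ** 2
    return ece2
```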
Firstly, we propose a Hybrid\nReservoir Buffer with Task Separation (HRBTS), where curiosity is used to\ndetect task boundaries that are not known due to the task agnostic nature of\nthe problem. Secondly, by using curiosity as a priority metric when it comes to\nretaining old transition tuples, a Hybrid Curious Buffer (HCB) is proposed. We\nultimately show that these buffers, in conjunction with regular reinforcement\nlearning algorithms, can be used to alleviate the catastrophic forgetting issue\nsuffered by the state of the art on replay buffers when the agent's exposure to\ntasks is not equal along time. We evaluate catastrophic forgetting and the\nefficiency of our proposed buffers against the latest works such as the Hybrid\nReservoir Buffer (HRB) and the Multi-Time Scale Replay Buffer (MTR) in three\ndifferent continual reinforcement learning settings. Experiments were done on\nclassical control tasks and Metaworld environment. Experiments show that our\nproposed replay buffers display better immunity to catastrophic forgetting\ncompared to existing works in most of the settings.\n","authors":["Pankayaraj Pathmanathan","Natalia Díaz-Rodríguez","Javier Del Ser"],"pdf_url":"https://arxiv.org/pdf/2312.03177v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03176v1","updated":"2023-12-05T22:44:05Z","published":"2023-12-05T22:44:05Z","title":"Active Learning for Abrupt Shifts Change-point Detection via\n Derivative-Aware Gaussian Processes","summary":" Change-point detection (CPD) is crucial for identifying abrupt shifts in\ndata, which influence decision-making and efficient resource allocation across\nvarious domains. To address the challenges posed by the costly and\ntime-intensive data acquisition in CPD, we introduce the Derivative-Aware\nChange Detection (DACD) method. It leverages the derivative process of a\nGaussian process (GP) for Active Learning (AL), aiming to pinpoint change-point\nlocations effectively. DACD balances the exploitation and exploration of\nderivative processes through multiple data acquisition functions (AFs). By\nutilizing GP derivative mean and variance as criteria, DACD sequentially\nselects the next sampling data point, thus enhancing algorithmic efficiency and\nensuring reliable and accurate results. We investigate the effectiveness of\nDACD method in diverse scenarios and show it outperforms other active learning\nchange-point detection approaches.\n","authors":["Hao Zhao","Rong Pan"],"pdf_url":"https://arxiv.org/pdf/2312.03176v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2305.16450v2","updated":"2023-12-05T18:35:18Z","published":"2023-05-25T19:54:33Z","title":"Investigation of UAV Detection in Images with Complex Backgrounds and\n Rainy Artifacts","summary":" To detect unmanned aerial vehicles (UAVs) in real-time, computer vision and\ndeep learning approaches are evolving research areas. Interest in this problem\nhas grown due to concerns regarding the possible hazards and misuse of\nemploying UAVs in many applications. These include potential privacy\nviolations. To address the concerns, vision-based object detection methods have\nbeen developed for UAV detection. However, UAV detection in images with complex\nbackgrounds and weather artifacts like rain has yet to be reasonably studied.\nHence, for this purpose, we prepared two training datasets. 
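A small sketch of curiosity-prioritized retention in a replay buffer, in the spirit of the HCB described above; the eviction rule shown (drop the least curious transition when full) is an assumption for illustration, not the paper's exact mechanism.

```python
import heapq
import itertools
import random

class CuriosityBuffer:
    """Fixed-capacity buffer that keeps the transitions with the highest curiosity."""

    def __init__(self, capacity: int):
        self.capacity = capacity
        self._heap = []                      # min-heap keyed by curiosity score
        self._counter = itertools.count()    # tie-breaker for equal scores

    def add(self, transition, curiosity: float) -> None:
        item = (curiosity, next(self._counter), transition)
        if len(self._heap) < self.capacity:
            heapq.heappush(self._heap, item)
        elif curiosity > self._heap[0][0]:
            heapq.heapreplace(self._heap, item)   # evict the least curious transition

    def sample(self, batch_size: int):
        k = min(batch_size, len(self._heap))
        return [t for _, _, t in random.sample(self._heap, k)]
```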
The first dataset\nhas the sky as its background and is called the Sky Background Dataset (SBD).\nThe second training dataset has more complex scenes (with diverse backgrounds)\nand is named the Complex Background Dataset (CBD). Additionally, two test sets\nwere prepared: one containing clear images and the other with images with three\nrain artifacts, named the Rainy Test Set (RTS). This work also focuses on\nbenchmarking state-of-the-art object detection models, and to the best of our\nknowledge, it is the first to investigate the performance of recent and popular\nvision-based object detection methods for UAV detection under challenging\nconditions such as complex backgrounds, varying UAV sizes, and low-to-heavy\nrainy conditions. The findings presented in the paper shall help provide\ninsights concerning the performance of the selected models for UAV detection\nunder challenging conditions and pave the way to develop more robust UAV\ndetection methods. The codes and datasets are available at:\nhttps://github.com/AdnanMunir294/UAVD-CBRA.\n","authors":["Adnan Munir","Abdul Jabbar Siddiqui","Saeed Anwar"],"pdf_url":"https://arxiv.org/pdf/2305.16450v2.pdf","comment":"Accepted at the Real-World Surveillance Workshop, IEEE/CVF Winter\n Conference on Applications of Computer Vision 2024"},{"id":"http://arxiv.org/abs/2306.11300v4","updated":"2023-12-05T12:23:36Z","published":"2023-06-20T05:30:59Z","title":"RS5M: A Large Scale Vision-Language Dataset for Remote Sensing\n Vision-Language Foundation Model","summary":" Pre-trained Vision-Language Models (VLMs) utilizing extensive image-text\npaired data have demonstrated unprecedented image-text association\ncapabilities, achieving remarkable results across various downstream tasks. A\ncritical challenge is how to make use of existing large-scale pre-trained VLMs,\nwhich are trained on common objects, to perform the domain-specific transfer\nfor accomplishing domain-related downstream tasks. In this paper, we propose a new framework that\nincludes the Domain pre-trained Vision-Language Model (DVLM), bridging the gap\nbetween the General Vision-Language Model (GVLM) and domain-specific downstream\ntasks. Moreover, we present an image-text paired dataset in the field of remote\nsensing (RS), RS5M, which has 5 million RS images with English descriptions.\nThe dataset is obtained from filtering publicly available image-text paired\ndatasets and captioning label-only RS datasets with pre-trained VLM. These\nconstitute the first large-scale RS image-text paired dataset. Additionally, we\nfine-tuned the CLIP model and tried several Parameter-Efficient Fine-Tuning\nmethods on RS5M to implement the DVLM. Experimental results show that our\nproposed dataset is highly effective for various tasks, and our model GeoRSCLIP\nimproves upon the baseline or previous state-of-the-art model by $3\\%\\sim20\\%$\nin Zero-shot Classification (ZSC), $3\\%\\sim6\\%$ in Remote Sensing Cross-Modal\nText-Image Retrieval (RSCTIR) and $4\\%\\sim5\\%$ in Semantic Localization (SeLo)\ntasks. 
Dataset and models have been released in:\n\\url{https://github.com/om-ai-lab/RS5M}.\n","authors":["Zilun Zhang","Tiancheng Zhao","Yulong Guo","Jianwei Yin"],"pdf_url":"https://arxiv.org/pdf/2306.11300v4.pdf","comment":"RS5M dataset v5"},{"id":"http://arxiv.org/abs/2311.13307v2","updated":"2023-12-05T10:25:23Z","published":"2023-11-22T10:55:36Z","title":"Rethinking Radiology Report Generation via Causal Reasoning and\n Counterfactual Augmentation","summary":" Radiology Report Generation (RRG) draws attention as an interaction between\nvision and language fields. Previous works inherited the ideology of\nvision-to-language generation tasks,aiming to generate paragraphs with high\nconsistency as reports. However, one unique characteristic of RRG, the\nindependence between diseases, was neglected, leading to the injection of\ndisease co-occurrence as a confounder that effects the results through backdoor\npath. Unfortunately, this confounder confuses the process of report generation\nworse because of the biased RRG data distribution. In this paper, to rethink\nthis issue thoroughly, we reason about its causes and effects from a novel\nperspective of statistics and causality, where the Joint Vision Coupling and\nthe Conditional Sentence Coherence Coupling are two aspects prone to implicitly\ndecrease the accuracy of reports. Then, a counterfactual augmentation strategy\nthat contains the Counterfactual Sample Synthesis and the Counterfactual Report\nReconstruction sub-methods is proposed to break these two aspects of spurious\neffects. Experimental results and further analyses on two widely used datasets\njustify our reasoning and proposed methods.\n","authors":["Xiao Song","Jiafan Liu","Yun Li","Wenbin Lei","Ruxin Wang"],"pdf_url":"https://arxiv.org/pdf/2311.13307v2.pdf","comment":"10 pages,5 figures"},{"id":"http://arxiv.org/abs/2311.03236v2","updated":"2023-12-05T09:08:30Z","published":"2023-11-06T16:26:52Z","title":"Out-of-distribution Detection Learning with Unreliable\n Out-of-distribution Sources","summary":" Out-of-distribution (OOD) detection discerns OOD data where the predictor\ncannot make valid predictions as in-distribution (ID) data, thereby increasing\nthe reliability of open-world classification. However, it is typically hard to\ncollect real out-of-distribution (OOD) data for training a predictor capable of\ndiscerning ID and OOD patterns. This obstacle gives rise to data\ngeneration-based learning methods, synthesizing OOD data via data generators\nfor predictor training without requiring any real OOD data. Related methods\ntypically pre-train a generator on ID data and adopt various selection\nprocedures to find those data likely to be the OOD cases. However, generated\ndata may still coincide with ID semantics, i.e., mistaken OOD generation\nremains, confusing the predictor between ID and OOD data. To this end, we\nsuggest that generated data (with mistaken OOD generation) can be used to\ndevise an auxiliary OOD detection task to facilitate real OOD detection.\nSpecifically, we can ensure that learning from such an auxiliary task is\nbeneficial if the ID and the OOD parts have disjoint supports, with the help of\na well-designed training procedure for the predictor. Accordingly, we propose a\npowerful data generation-based learning method named Auxiliary Task-based OOD\nLearning (ATOL) that can relieve the mistaken OOD generation. 
We conduct\nextensive experiments under various OOD detection setups, demonstrating the\neffectiveness of our method against its advanced counterparts.\n","authors":["Haotian Zheng","Qizhou Wang","Zhen Fang","Xiaobo Xia","Feng Liu","Tongliang Liu","Bo Han"],"pdf_url":"https://arxiv.org/pdf/2311.03236v2.pdf","comment":"Accepted by NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.02512v1","updated":"2023-12-05T05:36:44Z","published":"2023-12-05T05:36:44Z","title":"AV2AV: Direct Audio-Visual Speech to Audio-Visual Speech Translation\n with Unified Audio-Visual Speech Representation","summary":" This paper proposes a novel direct Audio-Visual Speech to Audio-Visual Speech\nTranslation (AV2AV) framework, where the input and output of the system are\nmultimodal (i.e., audio and visual speech). With the proposed AV2AV, two key\nadvantages can be brought: 1) We can perform real-like conversations with\nindividuals worldwide in a virtual meeting by utilizing our own primary\nlanguages. In contrast to Speech-to-Speech Translation (A2A), which solely\ntranslates between audio modalities, the proposed AV2AV directly translates\nbetween audio-visual speech. This capability enhances the dialogue experience\nby presenting synchronized lip movements along with the translated speech. 2)\nWe can improve the robustness of the spoken language translation system. By\nemploying the complementary information of audio-visual speech, the system can\neffectively translate spoken language even in the presence of acoustic noise,\nshowcasing robust performance. To mitigate the problem of the absence of a\nparallel AV2AV translation dataset, we propose to train our spoken language\ntranslation system with the audio-only dataset of A2A. This is done by learning\nunified audio-visual speech representations through self-supervised learning in\nadvance to train the translation system. Moreover, we propose an AV-Renderer\nthat can generate raw audio and video in parallel. It is designed with\nzero-shot speaker modeling, thus the speaker in source audio-visual speech can\nbe maintained at the target translated audio-visual speech. The effectiveness\nof AV2AV is evaluated with extensive experiments in a many-to-many language\ntranslation setting. The demo page is available on\nhttps://choijeongsoo.github.io/av2av.\n","authors":["Jeongsoo Choi","Se Jin Park","Minsu Kim","Yong Man Ro"],"pdf_url":"https://arxiv.org/pdf/2312.02512v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.16934v2","updated":"2023-12-05T01:24:29Z","published":"2022-11-30T12:09:40Z","title":"VideoDubber: Machine Translation with Speech-Aware Length Control for\n Video Dubbing","summary":" Video dubbing aims to translate the original speech in a film or television\nprogram into the speech in a target language, which can be achieved with a\ncascaded system consisting of speech recognition, machine translation and\nspeech synthesis. To ensure the translated speech to be well aligned with the\ncorresponding video, the length/duration of the translated speech should be as\nclose as possible to that of the original speech, which requires strict length\ncontrol. Previous works usually control the number of words or characters\ngenerated by the machine translation model to be similar to the source\nsentence, without considering the isochronicity of speech as the speech\nduration of words/characters in different languages varies. 
In this paper, we\npropose a machine translation system tailored for the task of video dubbing,\nwhich directly considers the speech duration of each token in translation, to\nmatch the length of source and target speech. Specifically, we control the\nspeech length of generated sentence by guiding the prediction of each word with\nthe duration information, including the speech duration of itself as well as\nhow much duration is left for the remaining words. We design experiments on\nfour language directions (German -> English, Spanish -> English, Chinese <->\nEnglish), and the results show that the proposed method achieves better length\ncontrol ability on the generated speech than baseline methods. To make up the\nlack of real-world datasets, we also construct a real-world test set collected\nfrom films to provide comprehensive evaluations on the video dubbing task.\n","authors":["Yihan Wu","Junliang Guo","Xu Tan","Chen Zhang","Bohan Li","Ruihua Song","Lei He","Sheng Zhao","Arul Menezes","Jiang Bian"],"pdf_url":"https://arxiv.org/pdf/2211.16934v2.pdf","comment":"AAAI 2023 camera version"}]},"2023-12-06T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2312.03700v1","updated":"2023-12-06T18:59:19Z","published":"2023-12-06T18:59:19Z","title":"OneLLM: One Framework to Align All Modalities with Language","summary":" Multimodal large language models (MLLMs) have gained significant attention\ndue to their strong multimodal understanding capability. However, existing\nworks rely heavily on modality-specific encoders, which usually differ in\narchitecture and are limited to common modalities. In this paper, we present\nOneLLM, an MLLM that aligns eight modalities to language using a unified\nframework. We achieve this through a unified multimodal encoder and a\nprogressive multimodal alignment pipeline. In detail, we first train an image\nprojection module to connect a vision encoder with LLM. Then, we build a\nuniversal projection module (UPM) by mixing multiple image projection modules\nand dynamic routing. Finally, we progressively align more modalities to LLM\nwith the UPM. To fully leverage the potential of OneLLM in following\ninstructions, we also curated a comprehensive multimodal instruction dataset,\nincluding 2M items from image, audio, video, point cloud, depth/normal map, IMU\nand fMRI brain activity. OneLLM is evaluated on 25 diverse benchmarks,\nencompassing tasks such as multimodal captioning, question answering and\nreasoning, where it delivers excellent performance. Code, data, model and\nonline demo are available at https://github.com/csuhan/OneLLM\n","authors":["Jiaming Han","Kaixiong Gong","Yiyuan Zhang","Jiaqi Wang","Kaipeng Zhang","Dahua Lin","Yu Qiao","Peng Gao","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2312.03700v1.pdf","comment":"Code: https://github.com/csuhan/OneLLM"},{"id":"http://arxiv.org/abs/2312.03699v1","updated":"2023-12-06T18:59:11Z","published":"2023-12-06T18:59:11Z","title":"PROMISE: A Framework for Model-Driven Stateful Prompt Orchestration","summary":" The advent of increasingly powerful language models has raised expectations\nfor language-based interactions. However, controlling these models is a\nchallenge, emphasizing the need to be able to investigate the feasibility and\nvalue of their application. We present PROMISE, a framework that facilitates\nthe development of complex language-based interactions with information\nsystems. 
Its use of state machine modeling concepts enables model-driven,\ndynamic prompt orchestration across hierarchically nested states and\ntransitions. This improves the control of the behavior of language models and\nthus enables their effective and efficient use. We show the benefits of PROMISE\nin the context of application scenarios within health information systems and\ndemonstrate its ability to handle complex interactions.\n","authors":["Wenyuan Wu","Jasmin Heierli","Max Meisterhans","Adrian Moser","Andri Färber","Mateusz Dolata","Elena Gavagnin","Alexandre de Spindler","Gerhard Schwabe"],"pdf_url":"https://arxiv.org/pdf/2312.03699v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03689v1","updated":"2023-12-06T18:53:01Z","published":"2023-12-06T18:53:01Z","title":"Evaluating and Mitigating Discrimination in Language Model Decisions","summary":" As language models (LMs) advance, interest is growing in applying them to\nhigh-stakes societal decisions, such as determining financing or housing\neligibility. However, their potential for discrimination in such contexts\nraises ethical concerns, motivating the need for better methods to evaluate\nthese risks. We present a method for proactively evaluating the potential\ndiscriminatory impact of LMs in a wide range of use cases, including\nhypothetical use cases where they have not yet been deployed. Specifically, we\nuse an LM to generate a wide array of potential prompts that decision-makers\nmay input into an LM, spanning 70 diverse decision scenarios across society,\nand systematically vary the demographic information in each prompt. Applying\nthis methodology reveals patterns of both positive and negative discrimination\nin the Claude 2.0 model in select settings when no interventions are applied.\nWhile we do not endorse or permit the use of language models to make automated\ndecisions for the high-risk use cases we study, we demonstrate techniques to\nsignificantly decrease both positive and negative discrimination through\ncareful prompt engineering, providing pathways toward safer deployment in use\ncases where they may be appropriate. Our work enables developers and\npolicymakers to anticipate, measure, and address discrimination as language\nmodel capabilities and applications continue to expand. We release our dataset\nand prompts at https://huggingface.co/datasets/Anthropic/discrim-eval\n","authors":["Alex Tamkin","Amanda Askell","Liane Lovitt","Esin Durmus","Nicholas Joseph","Shauna Kravec","Karina Nguyen","Jared Kaplan","Deep Ganguli"],"pdf_url":"https://arxiv.org/pdf/2312.03689v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03668v1","updated":"2023-12-06T18:34:42Z","published":"2023-12-06T18:34:42Z","title":"An Integration of Pre-Trained Speech and Language Models for End-to-End\n Speech Recognition","summary":" Advances in machine learning have made it possible to perform various text\nand speech processing tasks, including automatic speech recognition (ASR), in\nan end-to-end (E2E) manner. Since typical E2E approaches require large amounts\nof training data and resources, leveraging pre-trained foundation models\ninstead of training from scratch is gaining attention. Although there have been\nattempts to use pre-trained speech and language models in ASR, most of them are\nlimited to using either. This paper explores the potential of integrating a\npre-trained speech representation model with a large language model (LLM) for\nE2E ASR. 
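A toy sketch of state-machine-driven prompt orchestration in the spirit of the PROMISE abstract above; the class names and the health-themed states are illustrative, not the PROMISE API.

```python
from dataclasses import dataclass, field
from typing import Dict

@dataclass
class State:
    prompt: str                                               # system prompt used while in this state
    transitions: Dict[str, str] = field(default_factory=dict)  # event name -> next state name

class PromptStateMachine:
    """The active state decides which prompt is sent to the language model,
    and events (e.g. detected user intents) drive transitions."""

    def __init__(self, states: Dict[str, State], start: str):
        self.states, self.current = states, start

    def prompt(self) -> str:
        return self.states[self.current].prompt

    def step(self, event: str) -> str:
        self.current = self.states[self.current].transitions.get(event, self.current)
        return self.prompt()

# usage sketch
sm = PromptStateMachine(
    states={
        "collect_symptoms": State("Ask the patient about symptoms, one at a time.",
                                  {"symptoms_done": "summarize"}),
        "summarize": State("Summarize the collected symptoms and ask for confirmation."),
    },
    start="collect_symptoms",
)
print(sm.prompt())
print(sm.step("symptoms_done"))
```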
The proposed model enables E2E ASR by generating text tokens in an\nautoregressive manner via speech representations as speech prompts, taking\nadvantage of the vast knowledge provided by the LLM. Furthermore, the proposed\nmodel can incorporate remarkable developments for LLM utilization, such as\ninference optimization and parameter-efficient domain adaptation. Experimental\nresults show that the proposed model achieves performance comparable to modern\nE2E ASR models.\n","authors":["Yukiya Hono","Koh Mitsuda","Tianyu Zhao","Kentaro Mitsui","Toshiaki Wakatsuki","Kei Sawada"],"pdf_url":"https://arxiv.org/pdf/2312.03668v1.pdf","comment":"6 pages, 2 figures, 3 tables, The model is available at\n https://huggingface.co/rinna/nue-asr"},{"id":"http://arxiv.org/abs/2312.03664v1","updated":"2023-12-06T18:33:50Z","published":"2023-12-06T18:33:50Z","title":"Generative agent-based modeling with actions grounded in physical,\n social, or digital space using Concordia","summary":" Agent-based modeling has been around for decades, and applied widely across\nthe social and natural sciences. The scope of this research method is now\npoised to grow dramatically as it absorbs the new affordances provided by Large\nLanguage Models (LLM)s. Generative Agent-Based Models (GABM) are not just\nclassic Agent-Based Models (ABM)s where the agents talk to one another. Rather,\nGABMs are constructed using an LLM to apply common sense to situations, act\n\"reasonably\", recall common semantic knowledge, produce API calls to control\ndigital technologies like apps, and communicate both within the simulation and\nto researchers viewing it from the outside. Here we present Concordia, a\nlibrary to facilitate constructing and working with GABMs. Concordia makes it\neasy to construct language-mediated simulations of physically- or\ndigitally-grounded environments. Concordia agents produce their behavior using\na flexible component system which mediates between two fundamental operations:\nLLM calls and associative memory retrieval. A special agent called the Game\nMaster (GM), which was inspired by tabletop role-playing games, is responsible\nfor simulating the environment where the agents interact. Agents take actions\nby describing what they want to do in natural language. The GM then translates\ntheir actions into appropriate implementations. In a simulated physical world,\nthe GM checks the physical plausibility of agent actions and describes their\neffects. In digital environments simulating technologies such as apps and\nservices, the GM may handle API calls to integrate with external tools such as\ngeneral AI assistants (e.g., Bard, ChatGPT), and digital apps (e.g., Calendar,\nEmail, Search, etc.). Concordia was designed to support a wide array of\napplications both in scientific research and for evaluating performance of real\ndigital services by simulating users and/or generating synthetic data.\n","authors":["Alexander Sasha Vezhnevets","John P. Agapiou","Avia Aharon","Ron Ziv","Jayd Matyas","Edgar A. Duéñez-Guzmán","William A. Cunningham","Simon Osindero","Danny Karmon","Joel Z. 
Leibo"],"pdf_url":"https://arxiv.org/pdf/2312.03664v1.pdf","comment":"31 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.03656v1","updated":"2023-12-06T18:25:53Z","published":"2023-12-06T18:25:53Z","title":"Interpretability Illusions in the Generalization of Simplified Models","summary":" A common method to study deep learning systems is to use simplified model\nrepresentations -- for example, using singular value decomposition to visualize\nthe model's hidden states in a lower dimensional space. This approach assumes\nthat the results of these simplified are faithful to the original model. Here,\nwe illustrate an important caveat to this assumption: even if the simplified\nrepresentations can accurately approximate the full model on the training set,\nthey may fail to accurately capture the model's behavior out of distribution --\nthe understanding developed from simplified representations may be an illusion.\nWe illustrate this by training Transformer models on controlled datasets with\nsystematic generalization splits. First, we train models on the Dyck\nbalanced-parenthesis languages. We simplify these models using tools like\ndimensionality reduction and clustering, and then explicitly test how these\nsimplified proxies match the behavior of the original model on various\nout-of-distribution test sets. We find that the simplified proxies are\ngenerally less faithful out of distribution. In cases where the original model\ngeneralizes to novel structures or deeper depths, the simplified versions may\nfail, or generalize better. This finding holds even if the simplified\nrepresentations do not directly depend on the training distribution. Next, we\nstudy a more naturalistic task: predicting the next character in a dataset of\ncomputer code. We find similar generalization gaps between the original model\nand simplified proxies, and conduct further analysis to investigate which\naspects of the code completion task are associated with the largest gaps.\nTogether, our results raise questions about the extent to which mechanistic\ninterpretations derived using tools like SVD can reliably predict what a model\nwill do in novel situations.\n","authors":["Dan Friedman","Andrew Lampinen","Lucas Dixon","Danqi Chen","Asma Ghandeharioun"],"pdf_url":"https://arxiv.org/pdf/2312.03656v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03633v1","updated":"2023-12-06T17:29:45Z","published":"2023-12-06T17:29:45Z","title":"Not All Large Language Models (LLMs) Succumb to the \"Reversal Curse\": A\n Comparative Study of Deductive Logical Reasoning in BERT and GPT Models","summary":" The \"Reversal Curse\" refers to the scenario where auto-regressive decoder\nlarge language models (LLMs), such as ChatGPT, trained on \"A is B\" fail to\nlearn \"B is A\", demonstrating a basic failure of logical deduction. This raises\na red flag in the use of GPT models for certain general tasks such as\nconstructing knowledge graphs, considering their adherence to this symmetric\nprinciple. In our study, we examined a bidirectional LLM, BERT, and found that\nit is immune to the reversal curse. Driven by ongoing efforts to construct\nbiomedical knowledge graphs with LLMs, we also embarked on evaluating more\ncomplex but essential deductive reasoning capabilities. 
This process included\nfirst training encoder and decoder language models to master the intersection\n($\\cap$) and union ($\\cup$) operations on two sets and then moving on to assess\ntheir capability to infer different combinations of union ($\\cup$) and\nintersection ($\\cap$) operations on three newly created sets. The findings\nshowed that while both encoder and decoder language models, trained for tasks\ninvolving two sets (union/intersection), were proficient in such scenarios,\nthey encountered difficulties when dealing with operations that included three\nsets (various combinations of union and intersection). Our research highlights\nthe distinct characteristics of encoder and decoder models in simple and\ncomplex logical reasoning. In practice, the choice between BERT and GPT should\nbe guided by the specific requirements and nature of the task at hand,\nleveraging their respective strengths in bidirectional context comprehension\nand sequence prediction.\n","authors":["Jingye Yang","Da Wu","Kai Wang"],"pdf_url":"https://arxiv.org/pdf/2312.03633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04064v2","updated":"2023-12-06T17:17:00Z","published":"2023-11-07T15:25:52Z","title":"KPI Extraction from Maintenance Work Orders -- A Comparison of Expert\n Labeling, Text Classification and AI-Assisted Tagging for Computing Failure\n Rates of Wind Turbines","summary":" Maintenance work orders are commonly used to document information about wind\nturbine operation and maintenance. This includes details about proactive and\nreactive wind turbine downtimes, such as preventative and corrective\nmaintenance. However, the information contained in maintenance work orders is\noften unstructured and difficult to analyze, presenting challenges for\ndecision-makers wishing to use it for optimizing operation and maintenance. To\naddress this issue, this work compares three different approaches to calculate\nreliability by performance indicators from maintenance work orders. The first\napproach involves manual labeling of the maintenance work orders by domain\nexperts, using the schema defined in an industrial guideline to assign the\nlabel accordingly. The second approach involves the development of a model that\nautomatically labels the maintenance work orders using text classification\nmethods. Through this method, we are able to achieve macro average and weighted\naverage F1-Scores of 0.75 and 0.85 respectively. The third technique uses an\nAI-assisted tagging tool to tag and structure the raw maintenance information,\ntogether with a novel rule-based approach for extracting relevant maintenance\nwork orders for failure rate calculation. In our experiments the AI-assisted\ntool leads to a 88% drop in tagging time in comparison to the other two\napproaches, while expert labeling and text classification are more accurate in\nKPI extraction. 
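A minimal sketch of generating union/intersection training examples like those described in the reversal-curse study above; the universe size, set sizes, and prompt wording are assumptions for illustration.

```python
import random

def set_op_example(universe=range(20), seed=None):
    """Return one example pairing two random sets with their union and intersection."""
    rng = random.Random(seed)
    A = sorted(rng.sample(list(universe), 5))
    B = sorted(rng.sample(list(universe), 5))
    return {
        "A": A, "B": B,
        "union_prompt": f"A={A}, B={B}. A union B = ?",
        "union_answer": sorted(set(A) | set(B)),
        "intersection_prompt": f"A={A}, B={B}. A intersect B = ?",
        "intersection_answer": sorted(set(A) & set(B)),
    }
```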
Overall, our findings make extracting maintenance information\nfrom maintenance work orders more efficient, enable the assessment of\nreliability key performance indicators and therefore support the optimization\nof wind turbine operation and maintenance.\n","authors":["Marc-Alexander Lutz","Bastian Schäfermeier","Rachael Sexton","Michael Sharp","Alden Dima","Stefan Faulstich","Jagan Mohini Aluri"],"pdf_url":"https://arxiv.org/pdf/2311.04064v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18260v2","updated":"2023-12-06T17:16:07Z","published":"2023-11-30T05:38:34Z","title":"Consensus, dissensus and synergy between clinicians and specialist\n foundation models in radiology report generation","summary":" Radiology reports are an instrumental part of modern medicine, informing key\nclinical decisions such as diagnosis and treatment. The worldwide shortage of\nradiologists, however, restricts access to expert care and imposes heavy\nworkloads, contributing to avoidable errors and delays in report delivery.\nWhile recent progress in automated report generation with vision-language\nmodels offer clear potential in ameliorating the situation, the path to\nreal-world adoption has been stymied by the challenge of evaluating the\nclinical quality of AI-generated reports. In this study, we build a\nstate-of-the-art report generation system for chest radiographs,\n\\textit{Flamingo-CXR}, by fine-tuning a well-known vision-language foundation\nmodel on radiology data. To evaluate the quality of the AI-generated reports, a\ngroup of 16 certified radiologists provide detailed evaluations of AI-generated\nand human written reports for chest X-rays from an intensive care setting in\nthe United States and an inpatient setting in India. At least one radiologist\n(out of two per case) preferred the AI report to the ground truth report in\nover 60$\\%$ of cases for both datasets. Amongst the subset of AI-generated\nreports that contain errors, the most frequently cited reasons were related to\nthe location and finding, whereas for human written reports, most mistakes were\nrelated to severity and finding. This disparity suggested potential\ncomplementarity between our AI system and human experts, prompting us to\ndevelop an assistive scenario in which \\textit{Flamingo-CXR} generates a\nfirst-draft report, which is subsequently revised by a clinician. This is the\nfirst demonstration of clinician-AI collaboration for report writing, and the\nresultant reports are assessed to be equivalent or preferred by at least one\nradiologist to reports written by experts alone in 80$\\%$ of in-patient cases\nand 60$\\%$ of intensive care cases.\n","authors":["Ryutaro Tanno","David G. T. Barrett","Andrew Sellergren","Sumedh Ghaisas","Sumanth Dathathri","Abigail See","Johannes Welbl","Karan Singhal","Shekoofeh Azizi","Tao Tu","Mike Schaekermann","Rhys May","Roy Lee","SiWai Man","Zahra Ahmed","Sara Mahdavi","Danielle Belgrave","Vivek Natarajan","Shravya Shetty","Pushmeet Kohli","Po-Sen Huang","Alan Karthikesalingam","Ira Ktena"],"pdf_url":"https://arxiv.org/pdf/2311.18260v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.05736v2","updated":"2023-12-06T17:02:25Z","published":"2023-10-09T14:10:21Z","title":"LLMLingua: Compressing Prompts for Accelerated Inference of Large\n Language Models","summary":" Large language models (LLMs) have been applied in various applications due to\ntheir astonishing capabilities. 
With advancements in technologies such as\nchain-of-thought (CoT) prompting and in-context learning (ICL), the prompts fed\nto LLMs are becoming increasingly lengthy, even exceeding tens of thousands of\ntokens. To accelerate model inference and reduce cost, this paper presents\nLLMLingua, a coarse-to-fine prompt compression method that involves a budget\ncontroller to maintain semantic integrity under high compression ratios, a\ntoken-level iterative compression algorithm to better model the interdependence\nbetween compressed contents, and an instruction tuning based method for\ndistribution alignment between language models. We conduct experiments and\nanalysis over four datasets from different scenarios, i.e., GSM8K, BBH,\nShareGPT, and Arxiv-March23; showing that the proposed approach yields\nstate-of-the-art performance and allows for up to 20x compression with little\nperformance loss. Our code is available at https://aka.ms/LLMLingua.\n","authors":["Huiqiang Jiang","Qianhui Wu","Chin-Yew Lin","Yuqing Yang","Lili Qiu"],"pdf_url":"https://arxiv.org/pdf/2310.05736v2.pdf","comment":"Accepted at EMNLP 2023"},{"id":"http://arxiv.org/abs/2311.16173v2","updated":"2023-12-06T16:31:50Z","published":"2023-11-22T03:36:18Z","title":"Conditions for Length Generalization in Learning Reasoning Skills","summary":" Reasoning is a fundamental capability of AI agents. Recently, large language\nmodels (LLMs) have shown remarkable abilities to perform reasoning tasks.\nHowever, numerous evaluations of the reasoning capabilities of LLMs have also\nshowed some limitations. An outstanding limitation is length generalization,\nmeaning that when trained on reasoning problems of smaller lengths or sizes,\nthe resulting models struggle with problems of larger sizes or lengths. This\npotentially indicates some theoretical limitations of generalization in\nlearning reasoning skills. These evaluations and their observations motivated\nus to perform a theoretical study of the length generalization problem. This\nwork focuses on reasoning tasks that can be formulated as Markov dynamic\nprocesses (MDPs) and/or directed acyclic graphs (DAGs). It identifies and\nproves conditions that decide whether the length generalization problem can be\nsolved or not for a reasoning task in a particular representation. Experiments\nare also conducted to verify the theoretical results.\n","authors":["Changnan Xiao","Bing Liu"],"pdf_url":"https://arxiv.org/pdf/2311.16173v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03577v1","updated":"2023-12-06T16:15:00Z","published":"2023-12-06T16:15:00Z","title":"Improving Bias Mitigation through Bias Experts in Natural Language\n Understanding","summary":" Biases in the dataset often enable the model to achieve high performance on\nin-distribution data, while poorly performing on out-of-distribution data. To\nmitigate the detrimental effect of the bias on the networks, previous works\nhave proposed debiasing methods that down-weight the biased examples identified\nby an auxiliary model, which is trained with explicit bias labels. However,\nfinding a type of bias in datasets is a costly process. Therefore, recent\nstudies have attempted to make the auxiliary model biased without the guidance\n(or annotation) of bias labels, by constraining the model's training\nenvironment or the capability of the model itself. 
Despite the promising\ndebiasing results of recent works, the multi-class learning objective, which\nhas been naively used to train the auxiliary model, may harm the bias\nmitigation effect due to its regularization effect and competitive nature\nacross classes. As an alternative, we propose a new debiasing framework that\nintroduces binary classifiers between the auxiliary model and the main model,\ncoined bias experts. Specifically, each bias expert is trained on a binary\nclassification task derived from the multi-class classification task via the\nOne-vs-Rest approach. Experimental results demonstrate that our proposed\nstrategy improves the bias identification ability of the auxiliary model.\nConsequently, our debiased model consistently outperforms the state-of-the-art\non various challenge datasets.\n","authors":["Eojin Jeon","Mingyu Lee","Juhyeong Park","Yeachan Kim","Wing-Lam Mok","SangKeun Lee"],"pdf_url":"https://arxiv.org/pdf/2312.03577v1.pdf","comment":"Accepted in EMNLP 2023 as a long paper"},{"id":"http://arxiv.org/abs/2310.00752v2","updated":"2023-12-06T16:06:08Z","published":"2023-10-01T18:01:51Z","title":"TIGERScore: Towards Building Explainable Metric for All Text Generation\n Tasks","summary":" We present TIGERScore, a \\textbf{T}rained metric that follows\n\\textbf{I}nstruction \\textbf{G}uidance to perform \\textbf{E}xplainable, and\n\\textbf{R}eference-free evaluation over a wide spectrum of text generation\ntasks. Different from other automatic evaluation methods that only provide\narcane scores, TIGERScore is guided by natural language instruction to provide\nerror analysis to pinpoint the mistakes in the generated text. Our metric is\nbased on LLaMA-2, trained on our meticulously curated instruction-tuning\ndataset MetricInstruct which covers 6 text generation tasks and 23 text\ngeneration datasets. The dataset consists of 42K quadruple in the form of\n(instruction, input, system output $\\rightarrow$ error analysis). We collected\nthe `system outputs' through from a large variety of models to cover different\ntypes of errors. To quantitatively assess our metric, we evaluate its\ncorrelation with human ratings on 5 held-in datasets, 2 held-out datasets and\nshow that TIGERScore can achieve the open-source SoTA correlation with human\nratings across these datasets and almost approaches GPT-4 evaluator. As a\nreference-free metric, its correlation can even surpass the best existing\nreference-based metrics. To further qualitatively assess the rationale\ngenerated by our metric, we conduct human evaluation on the generated\nexplanations and found that the explanations are 70.8\\% accurate. Through these\nexperimental results, we believe TIGERScore demonstrates the possibility of\nbuilding universal explainable metrics to evaluate any text generation task.\n","authors":["Dongfu Jiang","Yishan Li","Ge Zhang","Wenhao Huang","Bill Yuchen Lin","Wenhu Chen"],"pdf_url":"https://arxiv.org/pdf/2310.00752v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03567v1","updated":"2023-12-06T15:59:06Z","published":"2023-12-06T15:59:06Z","title":"XAIQA: Explainer-Based Data Augmentation for Extractive Question\n Answering","summary":" Extractive question answering (QA) systems can enable physicians and\nresearchers to query medical records, a foundational capability for designing\nclinical studies and understanding patient medical history. However, building\nthese systems typically requires expert-annotated QA pairs. 
Large language\nmodels (LLMs), which can perform extractive QA, depend on high quality data in\ntheir prompts, specialized for the application domain. We introduce a novel\napproach, XAIQA, for generating synthetic QA pairs at scale from data naturally\navailable in electronic health records. Our method uses the idea of a\nclassification model explainer to generate questions and answers about medical\nconcepts corresponding to medical codes. In an expert evaluation with two\nphysicians, our method identifies $2.2\\times$ more semantic matches and\n$3.8\\times$ more clinical abbreviations than two popular approaches that use\nsentence transformers to create QA pairs. In an ML evaluation, adding our QA\npairs improves performance of GPT-4 as an extractive QA model, including on\ndifficult questions. In both the expert and ML evaluations, we examine\ntrade-offs between our method and sentence transformers for QA pair generation\ndepending on question difficulty.\n","authors":["Joel Stremmel","Ardavan Saeedi","Hamid Hassanzadeh","Sanjit Batra","Jeffrey Hertzberg","Jaime Murillo","Eran Halperin"],"pdf_url":"https://arxiv.org/pdf/2312.03567v1.pdf","comment":"Extended Abstract presented at Machine Learning for Health (ML4H)\n symposium 2023, December 10th, 2023, New Orleans, United States, 8 pages"},{"id":"http://arxiv.org/abs/2209.12407v2","updated":"2023-12-06T15:36:41Z","published":"2022-09-26T04:16:02Z","title":"Entailment Semantics Can Be Extracted from an Ideal Language Model","summary":" Language models are often trained on text alone, without additional\ngrounding. There is debate as to how much of natural language semantics can be\ninferred from such a procedure. We prove that entailment judgments between\nsentences can be extracted from an ideal language model that has perfectly\nlearned its target distribution, assuming the training sentences are generated\nby Gricean agents, i.e., agents who follow fundamental principles of\ncommunication from the linguistic theory of pragmatics. We also show entailment\njudgments can be decoded from the predictions of a language model trained on\nsuch Gricean data. Our results reveal a pathway for understanding the semantic\ninformation encoded in unlabeled linguistic data and a potential framework for\nextracting semantics from language models.\n","authors":["William Merrill","Alex Warstadt","Tal Linzen"],"pdf_url":"https://arxiv.org/pdf/2209.12407v2.pdf","comment":"Accepted at CONLL 2022. Updated Dec 4, 2023 with erratum"},{"id":"http://arxiv.org/abs/2207.02160v2","updated":"2023-12-06T15:29:02Z","published":"2022-07-05T16:28:47Z","title":"A Comprehensive Review of Visual-Textual Sentiment Analysis from Social\n Media Networks","summary":" Social media networks have become a significant aspect of people's lives,\nserving as a platform for their ideas, opinions and emotions. Consequently,\nautomated sentiment analysis (SA) is critical for recognising people's feelings\nin ways that other information sources cannot. The analysis of these feelings\nrevealed various applications, including brand evaluations, YouTube film\nreviews and healthcare applications. As social media continues to develop,\npeople post a massive amount of information in different forms, including text,\nphotos, audio and video. Thus, traditional SA algorithms have become limited,\nas they do not consider the expressiveness of other modalities. 
By including\nsuch characteristics from various material sources, these multimodal data\nstreams provide new opportunities for optimising the expected results beyond\ntext-based SA. Our study focuses on the forefront field of multimodal SA, which\nexamines visual and textual data posted on social media networks. Many people\nare more likely to utilise this information to express themselves on these\nplatforms. To serve as a resource for academics in this rapidly growing field,\nwe introduce a comprehensive overview of textual and visual SA, including data\npre-processing, feature extraction techniques, sentiment benchmark datasets,\nand the efficacy of multiple classification methodologies suited to each field.\nWe also provide a brief introduction of the most frequently utilised data\nfusion strategies and a summary of existing research on visual-textual SA.\nFinally, we highlight the most significant challenges and investigate several\nimportant sentiment applications.\n","authors":["Israa Khalaf Salman Al-Tameemi","Mohammad-Reza Feizi-Derakhshi","Saeed Pashazadeh","Mohammad Asadpour"],"pdf_url":"https://arxiv.org/pdf/2207.02160v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03549v1","updated":"2023-12-06T15:27:26Z","published":"2023-12-06T15:27:26Z","title":"Holmes: Towards Distributed Training Across Clusters with Heterogeneous\n NIC Environment","summary":" Large language models (LLMs) such as GPT-3, OPT, and LLaMA have demonstrated\nremarkable accuracy in a wide range of tasks. However, training these models\ncan incur significant expenses, often requiring tens of thousands of GPUs for\nmonths of continuous operation. Typically, this training is carried out in\nspecialized GPU clusters equipped with homogeneous high-speed Remote Direct\nMemory Access (RDMA) network interface cards (NICs). The acquisition and\nmaintenance of such dedicated clusters is challenging. Current LLM training\nframeworks, like Megatron-LM and Megatron-DeepSpeed, focus primarily on\noptimizing training within homogeneous cluster settings. In this paper, we\nintroduce Holmes, a training framework for LLMs that employs thoughtfully\ncrafted data and model parallelism strategies over the heterogeneous NIC\nenvironment. Our primary technical contribution lies in a novel scheduling\nmethod that intelligently allocates distinct computational tasklets in LLM\ntraining to specific groups of GPU devices based on the characteristics of\ntheir connected NICs. Furthermore, our proposed framework, utilizing pipeline\nparallel techniques, demonstrates scalability to multiple GPU clusters, even in\nscenarios without high-speed interconnects between nodes in distinct clusters.\nWe conducted comprehensive experiments that involved various scenarios in the\nheterogeneous NIC environment. In most cases, our framework achieves\nperformance levels close to those achievable with homogeneous RDMA-capable\nnetworks (InfiniBand or RoCE), significantly exceeding training efficiency\nwithin the pure Ethernet environment. 
Additionally, we verified that our\nframework outperforms other mainstream LLM frameworks under heterogeneous NIC\nenvironment in terms of training efficiency and can be seamlessly integrated\nwith them.\n","authors":["Fei Yang","Shuang Peng","Ning Sun","Fangyu Wang","Ke Tan","Fu Wu","Jiezhong Qiu","Aimin Pan"],"pdf_url":"https://arxiv.org/pdf/2312.03549v1.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2312.03523v1","updated":"2023-12-06T14:34:30Z","published":"2023-12-06T14:34:30Z","title":"Sig-Networks Toolkit: Signature Networks for Longitudinal Language\n Modelling","summary":" We present an open-source, pip installable toolkit, Sig-Networks, the first\nof its kind for longitudinal language modelling. A central focus is the\nincorporation of Signature-based Neural Network models, which have recently\nshown success in temporal tasks. We apply and extend published research\nproviding a full suite of signature-based models. Their components can be used\nas PyTorch building blocks in future architectures. Sig-Networks enables\ntask-agnostic dataset plug-in, seamless pre-processing for sequential data,\nparameter flexibility, automated tuning across a range of models. We examine\nsignature networks under three different NLP tasks of varying temporal\ngranularity: counselling conversations, rumour stance switch and mood changes\nin social media threads, showing SOTA performance in all three, and provide\nguidance for future tasks. We release the Toolkit as a PyTorch package with an\nintroductory video, Git repositories for preprocessing and modelling including\nsample notebooks on the modeled NLP tasks.\n","authors":["Talia Tseriotou","Ryan Sze-Yin Chan","Adam Tsakalidis","Iman Munire Bilal","Elena Kochkina","Terry Lyons","Maria Liakata"],"pdf_url":"https://arxiv.org/pdf/2312.03523v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.17030v2","updated":"2023-12-06T14:28:46Z","published":"2023-11-28T18:32:19Z","title":"Is This the Subspace You Are Looking for? An Interpretability Illusion\n for Subspace Activation Patching","summary":" Mechanistic interpretability aims to understand model behaviors in terms of\nspecific, interpretable features, often hypothesized to manifest as\nlow-dimensional subspaces of activations. Specifically, recent studies have\nexplored subspace interventions (such as activation patching) as a way to\nsimultaneously manipulate model behavior and attribute the features behind it\nto given subspaces.\n In this work, we demonstrate that these two aims diverge, potentially leading\nto an illusory sense of interpretability. Counterintuitively, even if a\nsubspace intervention makes the model's output behave as if the value of a\nfeature was changed, this effect may be achieved by activating a dormant\nparallel pathway leveraging another subspace that is causally disconnected from\nmodel outputs. We demonstrate this phenomenon in a distilled mathematical\nexample, in two real-world domains (the indirect object identification task and\nfactual recall), and present evidence for its prevalence in practice. In the\ncontext of factual recall, we further show a link to rank-1 fact editing,\nproviding a mechanistic explanation for previous work observing an\ninconsistency between fact editing performance and fact localization.\n However, this does not imply that activation patching of subspaces is\nintrinsically unfit for interpretability. 
To contextualize our findings, we\nalso show what a success case looks like in a task (indirect object\nidentification) where prior manual circuit analysis informs an understanding of\nthe location of a feature. We explore the additional evidence needed to argue\nthat a patched subspace is faithful.\n","authors":["Aleksandar Makelov","Georg Lange","Neel Nanda"],"pdf_url":"https://arxiv.org/pdf/2311.17030v2.pdf","comment":"NeurIPS 2023 Workshop on Attributing Model Behavior at Scale"},{"id":"http://arxiv.org/abs/2305.13683v2","updated":"2023-12-06T14:09:51Z","published":"2023-05-23T04:44:22Z","title":"Error Detection for Text-to-SQL Semantic Parsing","summary":" Despite remarkable progress in text-to-SQL semantic parsing in recent years,\nthe performance of existing parsers is still far from perfect. Specifically,\nmodern text-to-SQL parsers based on deep learning are often over-confident,\nthus casting doubt on their trustworthiness when deployed for real use. In this\npaper, we propose a parser-independent error detection model for text-to-SQL\nsemantic parsing. Using a language model of code as its bedrock, we enhance our\nerror detection model with graph neural networks that learn structural features\nof both natural language questions and SQL queries. We train our model on\nrealistic parsing errors collected from a cross-domain setting, which leads to\nstronger generalization ability. Experiments with three strong text-to-SQL\nparsers featuring different decoding mechanisms show that our approach\noutperforms parser-dependent uncertainty metrics. Our model could also\neffectively improve the performance and usability of text-to-SQL semantic\nparsers regardless of their architectures. (Our implementation is available at\nhttps://github.com/OSU-NLP-Group/Text2SQL-Error-Detection)\n","authors":["Shijie Chen","Ziru Chen","Huan Sun","Yu Su"],"pdf_url":"https://arxiv.org/pdf/2305.13683v2.pdf","comment":"EMNLP 2023 (Findings); Updated with new experiment results"},{"id":"http://arxiv.org/abs/2312.03483v1","updated":"2023-12-06T13:26:16Z","published":"2023-12-06T13:26:16Z","title":"Exploring Answer Information Methods for Question Generation with\n Transformers","summary":" There has been a lot of work in question generation where different methods\nto provide target answers as input, have been employed. This experimentation\nhas been mostly carried out for RNN based models. We use three different\nmethods and their combinations for incorporating answer information and explore\ntheir effect on several automatic evaluation metrics. The methods that are used\nare answer prompting, using a custom product method using answer embeddings and\nencoder outputs, choosing sentences from the input paragraph that have answer\nrelated information, and using a separate cross-attention attention block in\nthe decoder which attends to the answer. We observe that answer prompting\nwithout any additional modes obtains the best scores across rouge, meteor\nscores. 
Additionally, we use a custom metric to calculate how many of the\ngenerated questions have the same answer, as the answer which is used to\ngenerate them.\n","authors":["Talha Chafekar","Aafiya Hussain","Grishma Sharma","Deepak Sharma"],"pdf_url":"https://arxiv.org/pdf/2312.03483v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03480v1","updated":"2023-12-06T13:19:56Z","published":"2023-12-06T13:19:56Z","title":"AMR Parsing is Far from Solved: GrAPES, the Granular AMR Parsing\n Evaluation Suite","summary":" We present the Granular AMR Parsing Evaluation Suite (GrAPES), a challenge\nset for Abstract Meaning Representation (AMR) parsing with accompanying\nevaluation metrics. AMR parsers now obtain high scores on the standard AMR\nevaluation metric Smatch, close to or even above reported inter-annotator\nagreement. But that does not mean that AMR parsing is solved; in fact, human\nevaluation in previous work indicates that current parsers still quite\nfrequently make errors on node labels or graph structure that substantially\ndistort sentence meaning. Here, we provide an evaluation suite that tests AMR\nparsers on a range of phenomena of practical, technical, and linguistic\ninterest. Our 36 categories range from seen and unseen labels, to structural\ngeneralization, to coreference. GrAPES reveals in depth the abilities and\nshortcomings of current AMR parsers.\n","authors":["Jonas Groschwitz","Shay B. Cohen","Lucia Donatelli","Meaghan Fowlie"],"pdf_url":"https://arxiv.org/pdf/2312.03480v1.pdf","comment":"Accepted at EMNLP 2023. For the associated GitHub repository, see\n https://github.com/jgroschwitz/GrAPES"},{"id":"http://arxiv.org/abs/2305.05403v2","updated":"2023-12-06T12:48:06Z","published":"2023-05-09T12:50:16Z","title":"Completeness, Recall, and Negation in Open-World Knowledge Bases: A\n Survey","summary":" General-purpose knowledge bases (KBs) are a cornerstone of knowledge-centric\nAI. Many of them are constructed pragmatically from Web sources, and are thus\nfar from complete. This poses challenges for the consumption as well as the\ncuration of their content. While several surveys target the problem of\ncompleting incomplete KBs, the first problem is arguably to know whether and\nwhere the KB is incomplete in the first place, and to which degree.\n In this survey we discuss how knowledge about completeness, recall, and\nnegation in KBs can be expressed, extracted, and inferred. We cover (i) the\nlogical foundations of knowledge representation and querying under partial\nclosed-world semantics; (ii) the estimation of this information via statistical\npatterns; (iii) the extraction of information about recall from KBs and text;\n(iv) the identification of interesting negative statements; and (v) relaxed\nnotions of relative recall.\n This survey is targeted at two types of audiences: (1) practitioners who are\ninterested in tracking KB quality, focusing extraction efforts, and building\nquality-aware downstream applications; and (2) data management, knowledge base\nand semantic web researchers who wish to understand the state of the art of\nknowledge bases beyond the open-world assumption. 
Consequently, our survey\npresents both fundamental methodologies and their working, and gives\npractice-oriented recommendations on how to choose between different approaches\nfor a problem at hand.\n","authors":["Simon Razniewski","Hiba Arnaout","Shrestha Ghosh","Fabian Suchanek"],"pdf_url":"https://arxiv.org/pdf/2305.05403v2.pdf","comment":"42 pages, 8 figures, 5 tables"},{"id":"http://arxiv.org/abs/2307.02697v4","updated":"2023-12-06T12:39:33Z","published":"2023-07-06T00:06:14Z","title":"Strahler Number of Natural Language Sentences in Comparison with Random\n Trees","summary":" The Strahler number was originally proposed to characterize the complexity of\nriver bifurcation and has found various applications. This article proposes\ncomputation of the Strahler number's upper and lower limits for natural\nlanguage sentence tree structures. Through empirical measurements across\ngrammatically annotated data, the Strahler number of natural language sentences\nis shown to be almost 3 or 4, similarly to the case of river bifurcation as\nreported by Strahler (1957). From the theory behind the number, we show that it\nis one kind of lower limit on the amount of memory required to process\nsentences. We consider the Strahler number to provide reasoning that explains\nreports showing that the number of required memory areas to process sentences\nis 3 to 4 for parsing (Schuler et al., 2010), and reports indicating a\npsychological \"magical number\" of 3 to 5 (Cowan, 2001). An analytical and\nempirical analysis shows that the Strahler number is not constant but grows\nlogarithmically; therefore, the Strahler number of sentences derives from the\nrange of sentence lengths. Furthermore, the Strahler number is not different\nfor random trees, which could suggest that its origin is not specific to\nnatural language.\n","authors":["Kumiko Tanaka-Ishii","Akira Tanaka"],"pdf_url":"https://arxiv.org/pdf/2307.02697v4.pdf","comment":"34 pages, 12 figures, 11 tables"},{"id":"http://arxiv.org/abs/2312.03463v1","updated":"2023-12-06T12:37:28Z","published":"2023-12-06T12:37:28Z","title":"DBCopilot: Scaling Natural Language Querying to Massive Databases","summary":" Text-to-SQL simplifies database interactions by enabling non-experts to\nconvert their natural language (NL) questions into Structured Query Language\n(SQL) queries. While recent advances in large language models (LLMs) have\nimproved the zero-shot text-to-SQL paradigm, existing methods face scalability\nchallenges when dealing with massive, dynamically changing databases. This\npaper introduces DBCopilot, a framework that addresses these challenges by\nemploying a compact and flexible copilot model for routing across massive\ndatabases. Specifically, DBCopilot decouples the text-to-SQL process into\nschema routing and SQL generation, leveraging a lightweight\nsequence-to-sequence neural network-based router to formulate database\nconnections and navigate natural language questions through databases and\ntables. The routed schemas and questions are then fed into LLMs for efficient\nSQL generation. 
Furthermore, DBCopilot also introduced a reverse\nschema-to-question generation paradigm, which can learn and adapt the router\nover massive databases automatically without requiring manual intervention.\nExperimental results demonstrate that DBCopilot is a scalable and effective\nsolution for real-world text-to-SQL tasks, providing a significant advancement\nin handling large-scale schemas.\n","authors":["Tianshu Wang","Hongyu Lin","Xianpei Han","Le Sun","Xiaoyang Chen","Hao Wang","Zhenyu Zeng"],"pdf_url":"https://arxiv.org/pdf/2312.03463v1.pdf","comment":"Code and data are available at https://github.com/tshu-w/DBCopilot"},{"id":"http://arxiv.org/abs/2312.03458v1","updated":"2023-12-06T12:34:46Z","published":"2023-12-06T12:34:46Z","title":"Think from Words(TFW): Initiating Human-Like Cognition in Large Language\n Models Through Think from Words for Japanese Text-level Classification","summary":" The proliferation of Large Language Models (LLMs) has spurred extensive\nresearch into LLM-related Prompt investigations, such as Instruction Learning\n(IL), In-context Learning (ICL), and Chain-of-Thought (CoT). These approaches\naim to improve LLMs' responses by enabling them to provide concise statements\nor examples for deeper contemplation when addressing questions. However,\nindependent thinking by LLMs can introduce variability in their thought\nprocesses, leading to potential inaccuracies. In response, our study seeks to\nbridge the gap between LLM and human-like thinking processes, recognizing that\ntext comprehension begins with understanding individual words. To tackle this\nchallenge, we have expanded the CoT method to cater to a specific domain. Our\napproach, known as \"Think from Words\" (TFW), initiates the comprehension\nprocess at the word level and then extends it to encompass the entire text. We\nalso propose \"TFW with Extra word-level information\" (TFW Extra), augmenting\ncomprehension with additional word-level data. To assess our methods, we employ\ntext classification on six Japanese datasets comprising text-level and\nword-level elements. Our findings not only validate the effectiveness of TFW\nbut also shed light on the impact of various word-level information types on\nLLMs' text comprehension, offering insights into their potential to cause\nmisinterpretations and errors in the overall comprehension of the final text.\n","authors":["Chengguang Gan","Qinghao Zhang","Tatsunori Mori"],"pdf_url":"https://arxiv.org/pdf/2312.03458v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08577v3","updated":"2023-12-06T12:34:16Z","published":"2023-10-12T17:59:30Z","title":"Visual Data-Type Understanding does not emerge from Scaling\n Vision-Language Models","summary":" Recent advances in the development of vision-language models (VLMs) are\nyielding remarkable success in recognizing visual semantic content, including\nimpressive instances of compositional image understanding. Here, we introduce\nthe novel task of Visual Data-Type Identification, a basic perceptual skill\nwith implications for data curation (e.g., noisy data-removal from large\ndatasets, domain-specific retrieval) and autonomous vision (e.g.,\ndistinguishing changing weather conditions from camera lens staining). We\ndevelop two datasets consisting of animal images altered across a diverse set\nof 27 visual data-types, spanning four broad categories. An extensive zero-shot\nevaluation of 39 VLMs, ranging from 100M to 80B parameters, shows a nuanced\nperformance landscape. 
While VLMs are reasonably good at identifying certain\nstylistic \\textit{data-types}, such as cartoons and sketches, they struggle\nwith simpler data-types arising from basic manipulations like image rotations\nor additive noise. Our findings reveal that (i) model scaling alone yields\nmarginal gains for contrastively-trained models like CLIP, and (ii) there is a\npronounced drop in performance for the largest auto-regressively trained VLMs\nlike OpenFlamingo. This finding points to a blind spot in current frontier\nVLMs: they excel in recognizing semantic content but fail to acquire an\nunderstanding of visual data-types through scaling. By analyzing the\npre-training distributions of these models and incorporating data-type\ninformation into the captions during fine-tuning, we achieve a significant\nenhancement in performance. By exploring this previously uncharted task, we aim\nto set the stage for further advancing VLMs to equip them with visual data-type\nunderstanding. Code and datasets are released at\nhttps://github.com/bethgelab/DataTypeIdentification.\n","authors":["Vishaal Udandarao","Max F. Burg","Samuel Albanie","Matthias Bethge"],"pdf_url":"https://arxiv.org/pdf/2310.08577v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.20046v2","updated":"2023-12-06T11:33:43Z","published":"2023-05-31T17:20:45Z","title":"Assessing Language Disorders using Artificial Intelligence: a Paradigm\n Shift","summary":" Speech, language, and communication deficits are present in most\nneurodegenerative syndromes. They enable the early detection, diagnosis,\ntreatment planning, and monitoring of neurocognitive disease progression as\npart of traditional neurological assessment. Nevertheless, standard speech and\nlanguage evaluation is time-consuming and resource-intensive for clinicians. We\nargue that using machine learning methodologies, natural language processing,\nand modern artificial intelligence (AI) for Language Assessment is an\nimprovement over conventional manual assessment. Using these methodologies,\nComputational Language Assessment (CLA) accomplishes three goals: (i) provides\na neuro-cognitive evaluation of speech, language, and communication in elderly\nand high-risk individuals for dementia; (ii) facilitates the diagnosis,\nprognosis, and therapy efficacy in at-risk and language-impaired populations;\nand (iii) allows easier extensibility to assess patients from a wide range of\nlanguages. By employing AI models, CLA may inform neurocognitive theory on the\nrelationship between language symptoms and their neural bases. Finally, it\nsignals a paradigm shift by significantly advancing our ability to optimize the\nprevention and treatment of elderly individuals with communication disorders,\nallowing them to age gracefully with social engagement.\n","authors":["Charalambos Themistocleous","Kyrana Tsapkini","Dimitrios Kokkinakis"],"pdf_url":"https://arxiv.org/pdf/2305.20046v2.pdf","comment":"36 pages, 2 figures, to be submited"},{"id":"http://arxiv.org/abs/2312.03414v1","updated":"2023-12-06T10:50:43Z","published":"2023-12-06T10:50:43Z","title":"Compressed Context Memory For Online Language Model Interaction","summary":" This paper presents a novel context compression method for Transformer\nlanguage models in online scenarios such as ChatGPT, where the context\ncontinually expands. As the context lengthens, the attention process requires\nmore memory and computational resources, which in turn reduces the throughput\nof the language model. 
To this end, we propose a compressed context memory\nsystem that continually compresses the growing context into a compact memory\nspace. The compression process simply involves integrating a lightweight\nconditional LoRA into the language model's forward pass during inference. Based\non the compressed context memory, the language model can perform inference with\nreduced memory and attention operations. Through evaluations on conversation,\npersonalization, and multi-task learning, we demonstrate that our approach\nachieves the performance level of a full context model with $5\\times$ smaller\ncontext memory space. Codes are available at\nhttps://github.com/snu-mllab/context-memory.\n","authors":["Jang-Hyun Kim","Junyoung Yeom","Sangdoo Yun","Hyun Oh Song"],"pdf_url":"https://arxiv.org/pdf/2312.03414v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14246v2","updated":"2023-12-06T10:13:44Z","published":"2023-05-23T17:00:45Z","title":"Modeling Empathic Similarity in Personal Narratives","summary":" The most meaningful connections between people are often fostered through\nexpression of shared vulnerability and emotional experiences in personal\nnarratives. We introduce a new task of identifying similarity in personal\nstories based on empathic resonance, i.e., the extent to which two people\nempathize with each others' experiences, as opposed to raw semantic or lexical\nsimilarity, as has predominantly been studied in NLP. Using insights from\nsocial psychology, we craft a framework that operationalizes empathic\nsimilarity in terms of three key features of stories: main events, emotional\ntrajectories, and overall morals or takeaways. We create EmpathicStories, a\ndataset of 1,500 personal stories annotated with our empathic similarity\nfeatures, and 2,000 pairs of stories annotated with empathic similarity scores.\nUsing our dataset, we fine-tune a model to compute empathic similarity of story\npairs, and show that this outperforms semantic similarity models on automated\ncorrelation and retrieval metrics. Through a user study with 150 participants,\nwe also assess the effect our model has on retrieving stories that users\nempathize with, compared to naive semantic similarity-based retrieval, and find\nthat participants empathized significantly more with stories retrieved by our\nmodel. Our work has strong implications for the use of empathy-aware models to\nfoster human connection and empathy between people.\n","authors":["Jocelyn Shen","Maarten Sap","Pedro Colon-Hernandez","Hae Won Park","Cynthia Breazeal"],"pdf_url":"https://arxiv.org/pdf/2305.14246v2.pdf","comment":"Published at EMNLP 2023"},{"id":"http://arxiv.org/abs/2305.17390v2","updated":"2023-12-06T10:07:01Z","published":"2023-05-27T07:04:15Z","title":"SwiftSage: A Generative Agent with Fast and Slow Thinking for Complex\n Interactive Tasks","summary":" We introduce SwiftSage, a novel agent framework inspired by the dual-process\ntheory of human cognition, designed to excel in action planning for complex\ninteractive reasoning tasks. SwiftSage integrates the strengths of behavior\ncloning and prompting large language models (LLMs) to enhance task completion\nperformance. The framework comprises two primary modules: the Swift module,\nrepresenting fast and intuitive thinking, and the Sage module, emulating\ndeliberate thought processes. The Swift module is a small encoder-decoder LM\nfine-tuned on the oracle agent's action trajectories, while the Sage module\nemploys LLMs such as GPT-4 for subgoal planning and grounding. 
We develop a\nheuristic method to harmoniously integrate the two modules, resulting in a more\nefficient and robust problem-solving process. In 30 tasks from the ScienceWorld\nbenchmark, SwiftSage significantly outperforms other methods such as SayCan,\nReAct, and Reflexion, demonstrating its effectiveness in solving complex\ninteractive tasks.\n","authors":["Bill Yuchen Lin","Yicheng Fu","Karina Yang","Faeze Brahman","Shiyu Huang","Chandra Bhagavatula","Prithviraj Ammanabrolu","Yejin Choi","Xiang Ren"],"pdf_url":"https://arxiv.org/pdf/2305.17390v2.pdf","comment":"Accepted to NeurIPS 2023 (spotlight). Project website:\n https://swiftsage.github.io"},{"id":"http://arxiv.org/abs/2312.03379v1","updated":"2023-12-06T09:37:27Z","published":"2023-12-06T09:37:27Z","title":"A Text-to-Text Model for Multilingual Offensive Language Identification","summary":" The ubiquity of offensive content on social media is a growing cause for\nconcern among companies and government organizations. Recently,\ntransformer-based models such as BERT, XLNET, and XLM-R have achieved\nstate-of-the-art performance in detecting various forms of offensive content\n(e.g. hate speech, cyberbullying, and cyberaggression). However, the majority\nof these models are limited in their capabilities due to their encoder-only\narchitecture, which restricts the number and types of labels in downstream\ntasks. Addressing these limitations, this study presents the first pre-trained\nmodel with encoder-decoder architecture for offensive language identification\nwith text-to-text transformers (T5) trained on two large offensive language\nidentification datasets; SOLID and CCTK. We investigate the effectiveness of\ncombining two datasets and selecting an optimal threshold in semi-supervised\ninstances in SOLID in the T5 retraining step. Our pre-trained T5 model\noutperforms other transformer-based models fine-tuned for offensive language\ndetection, such as fBERT and HateBERT, in multiple English benchmarks.\nFollowing a similar approach, we also train the first multilingual pre-trained\nmodel for offensive language identification using mT5 and evaluate its\nperformance on a set of six different languages (German, Hindi, Korean,\nMarathi, Sinhala, and Spanish). The results demonstrate that this multilingual\nmodel achieves a new state-of-the-art on all the above datasets, showing its\nusefulness in multilingual scenarios. Our proposed T5-based models will be made\nfreely available to the community.\n","authors":["Tharindu Ranasinghe","Marcos Zampieri"],"pdf_url":"https://arxiv.org/pdf/2312.03379v1.pdf","comment":"Accepted to Findings of IJCNLP-AACL 2023"},{"id":"http://arxiv.org/abs/2312.03367v1","updated":"2023-12-06T09:08:32Z","published":"2023-12-06T09:08:32Z","title":"Lazy-k: Decoding for Constrained Token Classification","summary":" We explore the possibility of improving probabilistic models in structured\nprediction. Specifically, we combine the models with constrained decoding\napproaches in the context of token classification for information extraction.\nThe decoding methods search for constraint-satisfying label-assignments while\nmaximizing the total probability. To do this, we evaluate several existing\napproaches, as well as propose a novel decoding method called Lazy-$k$. Our\nfindings demonstrate that constrained decoding approaches can significantly\nimprove the models' performances, especially when using smaller models. The\nLazy-$k$ approach allows for more flexibility between decoding time and\naccuracy. 
The code for using Lazy-$k$ decoding can be found here:\nhttps://github.com/ArthurDevNL/lazyk.\n","authors":["Arthur Hemmer","Mickaël Coustaty","Nicola Bartolo","Jérôme Brachat","Jean-Marc Ogier"],"pdf_url":"https://arxiv.org/pdf/2312.03367v1.pdf","comment":"Accepted EMNLP Main 2023"},{"id":"http://arxiv.org/abs/2306.09597v3","updated":"2023-12-06T09:05:32Z","published":"2023-06-16T02:49:20Z","title":"Clickbait Detection via Large Language Models","summary":" Clickbait, which aims to induce users with some surprising and even thrilling\nheadlines for increasing click-through rates, permeates almost all online\ncontent publishers, such as news portals and social media. Recently, Large\nLanguage Models (LLMs) have emerged as a powerful instrument and achieved\ntremendous success in a series of NLP downstream tasks. However, it is not yet\nknown whether LLMs can be served as a high-quality clickbait detection system.\nIn this paper, we analyze the performance of LLMs in the few-shot and zero-shot\nscenarios on several English and Chinese benchmark datasets. Experimental\nresults show that LLMs cannot achieve the best results compared to the\nstate-of-the-art deep and fine-tuning PLMs methods. Different from human\nintuition, the experiments demonstrated that LLMs cannot make satisfied\nclickbait detection just by the headlines.\n","authors":["Han Wang","Yi Zhu","Ye Wang","Yun Li","Yunhao Yuan","Jipeng Qiang"],"pdf_url":"https://arxiv.org/pdf/2306.09597v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03361v1","updated":"2023-12-06T09:01:21Z","published":"2023-12-06T09:01:21Z","title":"KhabarChin: Automatic Detection of Important News in the Persian\n Language","summary":" Being aware of important news is crucial for staying informed and making\nwell-informed decisions efficiently. Natural Language Processing (NLP)\napproaches can significantly automate this process. This paper introduces the\ndetection of important news, in a previously unexplored area, and presents a\nnew benchmarking dataset (Khabarchin) for detecting important news in the\nPersian language. We define important news articles as those deemed significant\nfor a considerable portion of society, capable of influencing their mindset or\ndecision-making. The news articles are obtained from seven different prominent\nPersian news agencies, resulting in the annotation of 7,869 samples and the\ncreation of the dataset. Two challenges of high disagreement and imbalance\nbetween classes were faced, and solutions were provided for them. We also\npropose several learning-based models, ranging from conventional machine\nlearning to state-of-the-art transformer models, to tackle this task.\nFurthermore, we introduce the second task of important sentence detection in\nnews articles, as they often come with a significant contextual length that\nmakes it challenging for readers to identify important information. 
We identify\nthese sentences in a weakly supervised manner.\n","authors":["Hamed Hematian Hemati","Arash Lagzian","Moein Salimi Sartakhti","Hamid Beigy","Ehsaneddin Asgari"],"pdf_url":"https://arxiv.org/pdf/2312.03361v1.pdf","comment":"8 pages, 2 figures"},{"id":"http://arxiv.org/abs/2305.15065v2","updated":"2023-12-06T09:00:19Z","published":"2023-05-24T11:52:55Z","title":"Inference-Time Policy Adapters (IPA): Tailoring Extreme-Scale LMs\n without Fine-tuning","summary":" While extreme-scale language models have demonstrated exceptional performance\non a variety of language tasks, the degree of control over these language\nmodels through pure prompting can often be limited. Directly fine-tuning such\nlanguage models can be effective for tailoring them, but it can be either\nextremely costly (e.g., GPT-3) or not even feasible for the broader community\n(e.g., GPT-4).\n We propose Inference-time Policy Adapters (IPA), which efficiently tailors a\nlanguage model such as GPT-3 without fine-tuning it. IPA guides a large base\nmodel during decoding time through a lightweight policy adapter trained to\noptimize an arbitrary user objective with reinforcement learning.\n On five challenging text generation tasks, such as toxicity reduction and\nlexically constrained generation, IPA consistently brings significant\nimprovements over off-the-shelf language models. It outperforms competitive\nbaseline methods, sometimes even including expensive fine-tuning. In\nparticular, tailoring GPT-2 with IPA can outperform GPT-3, while tailoring\nGPT-3 with IPA brings a major performance boost over GPT-3 (and sometimes even\nover GPT-4). Our promising results highlight the potential of IPA as a\nlightweight alternative to tailoring extreme-scale language models.\n","authors":["Ximing Lu","Faeze Brahman","Peter West","Jaehun Jang","Khyathi Chandu","Abhilasha Ravichander","Lianhui Qin","Prithviraj Ammanabrolu","Liwei Jiang","Sahana Ramnath","Nouha Dziri","Jillian Fisher","Bill Yuchen Lin","Skyler Hallinan","Xiang Ren","Sean Welleck","Yejin Choi"],"pdf_url":"https://arxiv.org/pdf/2305.15065v2.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2206.08225v3","updated":"2023-12-06T08:57:52Z","published":"2022-06-16T14:51:28Z","title":"All the World's a (Hyper)Graph: A Data Drama","summary":" We introduce Hyperbard, a dataset of diverse relational data representations\nderived from Shakespeare's plays. Our representations range from simple graphs\ncapturing character co-occurrence in single scenes to hypergraphs encoding\ncomplex communication settings and character contributions as hyperedges with\nedge-specific node weights. By making multiple intuitive representations\nreadily available for experimentation, we facilitate rigorous representation\nrobustness checks in graph learning, graph mining, and network analysis,\nhighlighting the advantages and drawbacks of specific representations.\nLeveraging the data released in Hyperbard, we demonstrate that many solutions\nto popular graph mining problems are highly dependent on the representation\nchoice, thus calling current graph curation practices into question. As an\nhomage to our data source, and asserting that science can also be art, we\npresent all our points in the form of a play.\n","authors":["Corinna Coupette","Jilles Vreeken","Bastian Rieck"],"pdf_url":"https://arxiv.org/pdf/2206.08225v3.pdf","comment":"This is the full version of our paper; an abridged version appears in\n Digital Scholarship in the Humanities. 
Landing page for code and data:\n https://hyperbard.net/"},{"id":"http://arxiv.org/abs/2312.03360v1","updated":"2023-12-06T08:55:55Z","published":"2023-12-06T08:55:55Z","title":"Teaching Specific Scientific Knowledge into Large Language Models\n through Additional Training","summary":" Through additional training, we explore embedding specialized scientific\nknowledge into the Llama 2 Large Language Model (LLM). Key findings reveal that\neffective knowledge integration requires reading texts from multiple\nperspectives, especially in instructional formats. We utilize text augmentation\nto tackle the scarcity of specialized texts, including style conversions and\ntranslations. Hyperparameter optimization proves crucial, with different size\nmodels (7b, 13b, and 70b) reasonably undergoing additional training. Validating\nour methods, we construct a dataset of 65,000 scientific papers. Although we\nhave succeeded in partially embedding knowledge, the study highlights the\ncomplexities and limitations of incorporating specialized information into\nLLMs, suggesting areas for further improvement.\n","authors":["Kan Hatakeyama-Sato","Yasuhiko Igarashi","Shun Katakami","Yuta Nabae","Teruaki Hayakawa"],"pdf_url":"https://arxiv.org/pdf/2312.03360v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.01907v2","updated":"2023-12-06T08:49:40Z","published":"2023-11-03T13:34:08Z","title":"BoschAI @ PLABA 2023: Leveraging Edit Operations in End-to-End Neural\n Sentence Simplification","summary":" Automatic simplification can help laypeople to comprehend complex scientific\ntext. Language models are frequently applied to this task by translating from\ncomplex to simple language. In this paper, we describe our system based on\nLlama 2, which ranked first in the PLABA shared task addressing the\nsimplification of biomedical text. We find that the large portion of shared\ntokens between input and output leads to weak training signals and\nconservatively editing models. To mitigate these issues, we propose\nsentence-level and token-level loss weights. They give higher weight to\nmodified tokens, indicated by edit distance and edit operations, respectively.\nWe conduct an empirical evaluation on the PLABA dataset and find that both\napproaches lead to simplifications closer to those created by human annotators\n(+1.8% / +3.5% SARI), simpler language (-1 / -1.1 FKGL) and more edits (1.6x /\n1.8x edit distance) compared to the same model fine-tuned with standard cross\nentropy. We furthermore show that the hyperparameter $\\lambda$ in token-level\nloss weights can be used to control the edit distance and the simplicity level\n(FKGL).\n","authors":["Valentin Knappich","Simon Razniewski","Annemarie Friedrich"],"pdf_url":"https://arxiv.org/pdf/2311.01907v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03342v1","updated":"2023-12-06T08:33:51Z","published":"2023-12-06T08:33:51Z","title":"Topic and genre in dialogue","summary":" In this paper we argue that topic plays a fundamental role in conversations,\nand that the concept is needed in addition to that of genre to define\ninteractions. In particular, the concepts of genre and topic need to be\nseparated and orthogonally defined. 
This would enable modular, reliable and\ncontrollable flexible-domain dialogue systems.\n","authors":["Amandine Decker","Ellen Breitholtz","Christine Howes","Staffan Larsson"],"pdf_url":"https://arxiv.org/pdf/2312.03342v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03330v1","updated":"2023-12-06T07:38:46Z","published":"2023-12-06T07:38:46Z","title":"Measuring Misogyny in Natural Language Generation: Preliminary Results\n from a Case Study on two Reddit Communities","summary":" Generic `toxicity' classifiers continue to be used for evaluating the\npotential for harm in natural language generation, despite mounting evidence of\ntheir shortcomings. We consider the challenge of measuring misogyny in natural\nlanguage generation, and argue that generic `toxicity' classifiers are\ninadequate for this task. We use data from two well-characterised `Incel'\ncommunities on Reddit that differ primarily in their degrees of misogyny to\nconstruct a pair of training corpora which we use to fine-tune two language\nmodels. We show that an open source `toxicity' classifier is unable to\ndistinguish meaningfully between generations from these models. We contrast\nthis with a misogyny-specific lexicon recently proposed by feminist\nsubject-matter experts, demonstrating that, despite the limitations of simple\nlexicon-based approaches, this shows promise as a benchmark to evaluate\nlanguage models for misogyny, and that it is sensitive enough to reveal the\nknown differences in these Reddit communities. Our preliminary findings\nhighlight the limitations of a generic approach to evaluating harms, and\nfurther emphasise the need for careful benchmark design and selection in\nnatural language evaluation.\n","authors":["Aaron J. Snoswell","Lucinda Nelson","Hao Xue","Flora D. Salim","Nicolas Suzor","Jean Burgess"],"pdf_url":"https://arxiv.org/pdf/2312.03330v1.pdf","comment":"This extended abstract was presented at the Generation, Evaluation\n and Metrics workshop at Empirical Methods in Natural Language Processing in\n 2023 (GEM@EMNLP 2023) in Singapore"},{"id":"http://arxiv.org/abs/2312.03312v1","updated":"2023-12-06T06:37:24Z","published":"2023-12-06T06:37:24Z","title":"Optimizing Two-Pass Cross-Lingual Transfer Learning: Phoneme Recognition\n and Phoneme to Grapheme Translation","summary":" This research optimizes two-pass cross-lingual transfer learning in\nlow-resource languages by enhancing phoneme recognition and phoneme-to-grapheme\ntranslation models. Our approach optimizes these two stages to improve speech\nrecognition across languages. We optimize phoneme vocabulary coverage by\nmerging phonemes based on shared articulatory characteristics, thus improving\nrecognition accuracy. Additionally, we introduce a global phoneme noise\ngenerator for realistic ASR noise during phoneme-to-grapheme training to reduce\nerror propagation. Experiments on the CommonVoice 12.0 dataset show significant\nreductions in Word Error Rate (WER) for low-resource languages, highlighting\nthe effectiveness of our approach. 
This research contributes to the\nadvancements of two-pass ASR systems in low-resource languages, offering the\npotential for improved cross-lingual transfer learning.\n","authors":["Wonjun Lee","Gary Geunbae Lee","Yunsu Kim"],"pdf_url":"https://arxiv.org/pdf/2312.03312v1.pdf","comment":"8 pages, ASRU 2023 Accepted"},{"id":"http://arxiv.org/abs/2302.07371v3","updated":"2023-12-06T06:26:52Z","published":"2023-02-14T22:07:57Z","title":"BiasTestGPT: Using ChatGPT for Social Bias Testing of Language Models","summary":" Pretrained Language Models (PLMs) harbor inherent social biases that can\nresult in harmful real-world implications. Such social biases are measured\nthrough the probability values that PLMs output for different social groups and\nattributes appearing in a set of test sentences. However, bias testing is\ncurrently cumbersome since the test sentences are generated either from a\nlimited set of manual templates or need expensive crowd-sourcing. We instead\npropose using ChatGPT for the controllable generation of test sentences, given\nany arbitrary user-specified combination of social groups and attributes\nappearing in the test sentences. When compared to template-based methods, our\napproach using ChatGPT for test sentence generation is superior in detecting\nsocial bias, especially in challenging settings such as intersectional biases.\nWe present an open-source comprehensive bias testing framework (BiasTestGPT),\nhosted on HuggingFace, that can be plugged into any open-source PLM for bias\ntesting. User testing with domain experts from various fields has shown their\ninterest in being able to test modern AI for social biases. Our tool has\nsignificantly improved their awareness of such biases in PLMs, proving to be\nlearnable and user-friendly. We thus enable seamless open-ended social bias\ntesting of PLMs by domain experts through an automatic large-scale generation\nof diverse test sentences for any combination of social categories and\nattributes.\n","authors":["Rafal Kocielnik","Shrimai Prabhumoye","Vivian Zhang","Roy Jiang","R. Michael Alvarez","Anima Anandkumar"],"pdf_url":"https://arxiv.org/pdf/2302.07371v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03303v1","updated":"2023-12-06T06:07:50Z","published":"2023-12-06T06:07:50Z","title":"Dyport: Dynamic Importance-based Hypothesis Generation Benchmarking\n Technique","summary":" This paper presents a novel benchmarking framework Dyport for evaluating\nbiomedical hypothesis generation systems. Utilizing curated datasets, our\napproach tests these systems under realistic conditions, enhancing the\nrelevance of our evaluations. We integrate knowledge from the curated databases\ninto a dynamic graph, accompanied by a method to quantify discovery importance.\nThis not only assesses hypothesis accuracy but also their potential impact in\nbiomedical research which significantly extends traditional link prediction\nbenchmarks. Applicability of our benchmarking process is demonstrated on\nseveral link prediction systems applied on biomedical semantic knowledge\ngraphs. Being flexible, our benchmarking system is designed for broad\napplication in hypothesis generation quality verification, aiming to expand the\nscope of scientific discovery within the biomedical research community.\nAvailability and implementation: Dyport framework is fully open-source. 
All\ncode and datasets are available at: https://github.com/IlyaTyagin/Dyport\n","authors":["Ilya Tyagin","Ilya Safro"],"pdf_url":"https://arxiv.org/pdf/2312.03303v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02125v2","updated":"2023-12-06T05:19:11Z","published":"2023-12-04T18:52:26Z","title":"TPPoet: Transformer-Based Persian Poem Generation using Minimal Data and\n Advanced Decoding Techniques","summary":" Recent advances in language models (LMs), have demonstrated significant\nefficacy in tasks related to the arts and humanities. While LMs have exhibited\nexceptional performance across a wide range of natural language processing\ntasks, there are notable challenges associated with their utilization on small\ndatasets and their ability to replicate more creative human capacities. In this\nstudy, we aim to address these challenges by training a Persian classical\npoetry generation model using a transformer architecture on a specialized\ndataset with no pretraining. Additionally, we propose a novel decoding method\nto enhance coherence and meaningfulness in the generated poetry, effectively\nmanaging the tradeoff between diversity and quality. Furthermore, the results\nof our training approach and the proposed decoding method are evaluated through\ncomprehensive set of automatic and human evaluations and showed its superior\ncapability to generate coherent and meaningful poetry in compare to other\ndecoding methods and an existing Persian large language model (LLM).\n","authors":["Amir Panahandeh","Hanie Asemi","Esmaeil Nourani"],"pdf_url":"https://arxiv.org/pdf/2312.02125v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03290v1","updated":"2023-12-06T04:48:26Z","published":"2023-12-06T04:48:26Z","title":"Can language agents be alternatives to PPO? A Preliminary Empirical\n Study On OpenAI Gym","summary":" The formidable capacity for zero- or few-shot decision-making in language\nagents encourages us to pose a compelling question: Can language agents be\nalternatives to PPO agents in traditional sequential decision-making tasks? To\ninvestigate this, we first take environments collected in OpenAI Gym as our\ntestbeds and ground them to textual environments that construct the TextGym\nsimulator. This allows for straightforward and efficient comparisons between\nPPO agents and language agents, given the widespread adoption of OpenAI Gym. To\nensure a fair and effective benchmarking, we introduce $5$ levels of scenario\nfor accurate domain-knowledge controlling and a unified RL-inspired framework\nfor language agents. Additionally, we propose an innovative\nexplore-exploit-guided language (EXE) agent to solve tasks within TextGym.\nThrough numerical experiments and ablation studies, we extract valuable\ninsights into the decision-making capabilities of language agents and make a\npreliminary evaluation of their potential to be alternatives to PPO in\nclassical sequential decision-making problems. This paper sheds light on the\nperformance of language agents and paves the way for future research in this\nexciting domain. 
Our code is publicly available\nat~\\url{https://github.com/mail-ecnu/Text-Gym-Agents}.\n","authors":["Junjie Sheng","Zixiao Huang","Chuyun Shen","Wenhao Li","Yun Hua","Bo Jin","Hongyuan Zha","Xiangfeng Wang"],"pdf_url":"https://arxiv.org/pdf/2312.03290v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14387v3","updated":"2023-12-06T04:34:44Z","published":"2023-05-22T17:55:50Z","title":"AlpacaFarm: A Simulation Framework for Methods that Learn from Human\n Feedback","summary":" Large language models (LLMs) such as ChatGPT have seen widespread adoption\ndue to their ability to follow user instructions well. Developing these LLMs\ninvolves a complex yet poorly understood workflow requiring training with human\nfeedback. Replicating and understanding this instruction-following process\nfaces three major challenges: the high cost of data collection, the lack of\ntrustworthy evaluation, and the absence of reference method implementations. We\naddress these challenges with AlpacaFarm, a simulator that enables research and\ndevelopment for learning from feedback at a low cost. First, we design LLM\nprompts to simulate human feedback that are 45x cheaper than crowdworkers and\ndisplay high agreement with humans. Second, we propose an automatic evaluation\nand validate it against human instructions obtained on real-world interactions.\nThird, we contribute reference implementations for several methods (PPO, DPO,\nbest-of-n, expert iteration, and more) that learn from pairwise feedback.\nFinally, as an end-to-end validation of AlpacaFarm, we train and evaluate\neleven models on 10k pairs of real human feedback and show that rankings of\nmodels trained in AlpacaFarm match rankings of models trained on human data. As\na demonstration of the research possible in AlpacaFarm, we find that methods\nthat use a reward model can substantially improve over supervised fine-tuning\nand that our reference PPO implementation leads to a +10% improvement in\nwin-rate against Davinci003. We release all components of AlpacaFarm at\nhttps://github.com/tatsu-lab/alpaca_farm.\n","authors":["Yann Dubois","Xuechen Li","Rohan Taori","Tianyi Zhang","Ishaan Gulrajani","Jimmy Ba","Carlos Guestrin","Percy Liang","Tatsunori B. Hashimoto"],"pdf_url":"https://arxiv.org/pdf/2305.14387v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.10954v2","updated":"2023-12-06T03:34:00Z","published":"2023-09-19T22:41:44Z","title":"In-Context Learning for Text Classification with Many Labels","summary":" In-context learning (ICL) using large language models for tasks with many\nlabels is challenging due to the limited context window, which makes it\ndifficult to fit a sufficient number of examples in the prompt. In this paper,\nwe use a pre-trained dense retrieval model to bypass this limitation, giving\nthe model only a partial view of the full label space for each inference call.\nTesting with recent open-source LLMs (OPT, LLaMA), we set new state of the art\nperformance in few-shot settings for three common intent classification\ndatasets, with no finetuning. We also surpass fine-tuned performance on\nfine-grained sentiment classification in certain cases. We analyze the\nperformance across number of in-context examples and different model scales,\nshowing that larger models are necessary to effectively and consistently make\nuse of larger context lengths for ICL. 
By running several ablations, we analyze\nthe model's use of: a) the similarity of the in-context examples to the current\ninput, b) the semantic content of the class names, and c) the correct\ncorrespondence between examples and labels. We demonstrate that all three are\nneeded to varying degrees depending on the domain, contrary to certain recent\nworks.\n","authors":["Aristides Milios","Siva Reddy","Dzmitry Bahdanau"],"pdf_url":"https://arxiv.org/pdf/2309.10954v2.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2312.02439v2","updated":"2023-12-06T03:20:29Z","published":"2023-12-05T02:41:57Z","title":"Let's Think Outside the Box: Exploring Leap-of-Thought in Large Language\n Models with Creative Humor Generation","summary":" Chain-of-Thought (CoT) guides large language models (LLMs) to reason\nstep-by-step, and can motivate their logical reasoning ability. While effective\nfor logical tasks, CoT is not conducive to creative problem-solving which often\nrequires out-of-box thoughts and is crucial for innovation advancements. In\nthis paper, we explore the Leap-of-Thought (LoT) abilities within LLMs -- a\nnon-sequential, creative paradigm involving strong associations and knowledge\nleaps. To this end, we study LLMs on the popular Oogiri game which needs\nparticipants to have good creativity and strong associative thinking for\nresponding unexpectedly and humorously to the given image, text, or both, and\nthus is suitable for LoT study. Then to investigate LLMs' LoT ability in the\nOogiri game, we first build a multimodal and multilingual Oogiri-GO dataset\nwhich contains over 130,000 samples from the Oogiri game, and observe the\ninsufficient LoT ability or failures of most existing LLMs on the Oogiri game.\nAccordingly, we introduce a creative Leap-of-Thought (CLoT) paradigm to improve\nLLM's LoT ability. CLoT first formulates the Oogiri-GO dataset into\nLoT-oriented instruction tuning data to train pretrained LLM for achieving\ncertain LoT humor generation and discrimination abilities. Then CLoT designs an\nexplorative self-refinement that encourages the LLM to generate more creative\nLoT data via exploring parallels between seemingly unrelated concepts and\nselects high-quality data to train itself for self-refinement. CLoT not only\nexcels in humor generation in the Oogiri game but also boosts creative\nabilities in various tasks like cloud guessing game and divergent association\ntask. These findings advance our understanding and offer a pathway to improve\nLLMs' creative capacities for innovative applications across domains. The\ndataset, code, and models will be released online.\nhttps://zhongshsh.github.io/CLoT/.\n","authors":["Shanshan Zhong","Zhongzhan Huang","Shanghua Gao","Wushao Wen","Liang Lin","Marinka Zitnik","Pan Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.02439v2.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2305.12524v3","updated":"2023-12-06T03:02:45Z","published":"2023-05-21T17:51:35Z","title":"TheoremQA: A Theorem-driven Question Answering dataset","summary":" The recent LLMs like GPT-4 and PaLM-2 have made tremendous progress in\nsolving fundamental math problems like GSM8K by achieving over 90% accuracy.\nHowever, their capabilities to solve more challenging math problems which\nrequire domain-specific knowledge (i.e. 
theorem) have yet to be investigated.\nIn this paper, we introduce TheoremQA, the first theorem-driven\nquestion-answering dataset designed to evaluate AI models' capabilities to\napply theorems to solve challenging science problems. TheoremQA is curated by\ndomain experts containing 800 high-quality questions covering 350 theorems\n(e.g. Taylor's theorem, Lagrange's theorem, Huffman coding, Quantum Theorem,\nElasticity Theorem, etc) from Math, Physics, EE&CS, and Finance. We evaluate a\nwide spectrum of 16 large language and code models with different prompting\nstrategies like Chain-of-Thoughts and Program-of-Thoughts. We found that\nGPT-4's capabilities to solve these problems are unparalleled, achieving an\naccuracy of 51% with Program-of-Thoughts Prompting. All the existing\nopen-sourced models are below 15%, barely surpassing the random-guess baseline.\nGiven the diversity and broad coverage of TheoremQA, we believe it can be used\nas a better benchmark to evaluate LLMs' capabilities to solve challenging\nscience problems. The data and code are released in\nhttps://github.com/wenhuchen/TheoremQA.\n","authors":["Wenhu Chen","Ming Yin","Max Ku","Pan Lu","Yixin Wan","Xueguang Ma","Jianyu Xu","Xinyi Wang","Tony Xia"],"pdf_url":"https://arxiv.org/pdf/2305.12524v3.pdf","comment":"Accepted to Main Conference of EMNLP 2023"},{"id":"http://arxiv.org/abs/2312.01454v2","updated":"2023-12-06T02:53:11Z","published":"2023-12-03T16:58:10Z","title":"D-Bot: Database Diagnosis System using Large Language Models","summary":" Database administrators (DBAs) play an important role in managing,\nmaintaining and optimizing database systems. However, it is hard and tedious\nfor DBAs to manage a large number of databases and give timely response\n(waiting for hours is intolerable in many online cases). In addition, existing\nempirical methods only support limited diagnosis scenarios, which are also\nlabor-intensive to update the diagnosis rules for database version updates.\nRecently large language models (LLMs) have shown great potential in various\nfields. Thus, we propose D-Bot, an LLM-based database diagnosis system that can\nautomatically acquire knowledge from diagnosis documents, and generate\nreasonable and well-founded diagnosis report (i.e., identifying the root causes\nand solutions) within acceptable time (e.g., under 10 minutes compared to hours\nby a DBA). The techniques in D-Bot include (i) offline knowledge extraction\nfrom documents, (ii) automatic prompt generation (e.g., knowledge matching,\ntool retrieval), (iii) root cause analysis using tree search algorithm, and\n(iv) collaborative mechanism for complex anomalies with multiple root causes.\nWe verify D-Bot on real benchmarks (including 539 anomalies of six typical\napplications), and the results show that D-Bot can effectively analyze the root\ncauses of unseen anomalies and significantly outperforms traditional methods\nand vanilla models like GPT-4.\n","authors":["Xuanhe Zhou","Guoliang Li","Zhaoyan Sun","Zhiyuan Liu","Weize Chen","Jianming Wu","Jiesi Liu","Ruohang Feng","Guoyang Zeng"],"pdf_url":"https://arxiv.org/pdf/2312.01454v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.10706v2","updated":"2023-12-06T02:49:49Z","published":"2022-06-21T19:55:07Z","title":"TraSE: Towards Tackling Authorial Style from a Cognitive Science\n Perspective","summary":" Stylistic analysis of text is a key task in research areas ranging from\nauthorship attribution to forensic analysis and personality profiling. 
The\nexisting approaches for stylistic analysis are plagued by issues like topic\ninfluence, lack of discriminability for large number of authors and the\nrequirement for large amounts of diverse data. In this paper, the source of\nthese issues are identified along with the necessity for a cognitive\nperspective on authorial style in addressing them. A novel feature\nrepresentation, called Trajectory-based Style Estimation (TraSE), is introduced\nto support this purpose. Authorship attribution experiments with over 27,000\nauthors and 1.4 million samples in a cross-domain scenario resulted in 90%\nattribution accuracy suggesting that the feature representation is immune to\nsuch negative influences and an excellent candidate for stylistic analysis.\nFinally, a qualitative analysis is performed on TraSE using physical human\ncharacteristics, like age, to validate its claim on capturing cognitive traits.\n","authors":["Ronald Wilson","Avanti Bhandarkar","Damon Woodard"],"pdf_url":"https://arxiv.org/pdf/2206.10706v2.pdf","comment":"Experimental results in the paper are incorrectly reported due to an\n unforeseen glitch in the software prototype. The paper and its findings are\n withdrawn"},{"id":"http://arxiv.org/abs/2305.11746v2","updated":"2023-12-06T02:02:00Z","published":"2023-05-19T15:33:50Z","title":"HalOmi: A Manually Annotated Benchmark for Multilingual Hallucination\n and Omission Detection in Machine Translation","summary":" Hallucinations in machine translation are translations that contain\ninformation completely unrelated to the input. Omissions are translations that\ndo not include some of the input information. While both cases tend to be\ncatastrophic errors undermining user trust, annotated data with these types of\npathologies is extremely scarce and is limited to a few high-resource\nlanguages. In this work, we release an annotated dataset for the hallucination\nand omission phenomena covering 18 translation directions with varying resource\nlevels and scripts. Our annotation covers different levels of partial and full\nhallucinations as well as omissions both at the sentence and at the word level.\nAdditionally, we revisit previous methods for hallucination and omission\ndetection, show that conclusions made based on a single language pair largely\ndo not hold for a large-scale evaluation, and establish new solid baselines.\n","authors":["David Dale","Elena Voita","Janice Lam","Prangthip Hansanti","Christophe Ropers","Elahe Kalbassi","Cynthia Gao","Loïc Barrault","Marta R. Costa-jussà"],"pdf_url":"https://arxiv.org/pdf/2305.11746v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13675v2","updated":"2023-12-06T01:54:34Z","published":"2023-05-23T04:31:39Z","title":"Polyglot or Not? Measuring Multilingual Encyclopedic Knowledge in\n Foundation Models","summary":" In this work, we assess the ability of foundation models to recall\nencyclopedic knowledge across a wide range of linguistic contexts. To support\nthis, we: 1) produce a 20-language dataset that contains 303k factual\nassociations paired with counterfactuals, 2) evaluate 5 models in a\nmultilingual test, and 3) benchmark a diverse set of 24 models in an\nEnglish-only test. Meta's LLaMA achieves the highest scores in both\nmultilingual and English-only evaluations. Yet, an analysis of LLaMA's errors\nreveals significant limitations in its ability to recall facts in languages\nother than English, plus difficulties related to the location and gender of\nfact subjects. 
Overall, our findings suggest that today's foundation models are\nfar from polyglots.\n","authors":["Tim Schott","Daniel Furman","Shreshta Bhat"],"pdf_url":"https://arxiv.org/pdf/2305.13675v2.pdf","comment":"EMNLP 2023 (Main)"},{"id":"http://arxiv.org/abs/2312.03217v1","updated":"2023-12-06T01:15:40Z","published":"2023-12-06T01:15:40Z","title":"Rethinking E-Commerce Search","summary":" E-commerce search and recommendation usually operate on structured data such\nas product catalogs and taxonomies. However, creating better search and\nrecommendation systems often requires a large variety of unstructured data\nincluding customer reviews and articles on the web. Traditionally, the solution\nhas always been converting unstructured data into structured data through\ninformation extraction, and conducting search over the structured data.\nHowever, this is a costly approach that often has low quality. In this paper,\nwe envision a solution that does entirely the opposite. Instead of converting\nunstructured data (web pages, customer reviews, etc) to structured data, we\ninstead convert structured data (product inventory, catalogs, taxonomies, etc)\ninto textual data, which can be easily integrated into the text corpus that\ntrains LLMs. Then, search and recommendation can be performed through a Q/A\nmechanism through an LLM instead of using traditional information retrieval\nmethods over structured data.\n","authors":["Haixun Wang","Taesik Na"],"pdf_url":"https://arxiv.org/pdf/2312.03217v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.07320v2","updated":"2023-12-06T00:48:53Z","published":"2023-03-13T17:41:57Z","title":"Model-tuning Via Prompts Makes NLP Models Adversarially Robust","summary":" In recent years, NLP practitioners have converged on the following practice:\n(i) import an off-the-shelf pretrained (masked) language model; (ii) append a\nmultilayer perceptron atop the CLS token's hidden representation (with randomly\ninitialized weights); and (iii) fine-tune the entire model on a downstream task\n(MLP-FT). This procedure has produced massive gains on standard NLP benchmarks,\nbut these models remain brittle, even to mild adversarial perturbations. In\nthis work, we demonstrate surprising gains in adversarial robustness enjoyed by\nModel-tuning Via Prompts (MVP), an alternative method of adapting to downstream\ntasks. Rather than appending an MLP head to make output prediction, MVP appends\na prompt template to the input, and makes prediction via text\ninfilling/completion. Across 5 NLP datasets, 4 adversarial attacks, and 3\ndifferent models, MVP improves performance against adversarial substitutions by\nan average of 8% over standard methods and even outperforms adversarial\ntraining-based state-of-art defenses by 3.5%. By combining MVP with adversarial\ntraining, we achieve further improvements in adversarial robustness while\nmaintaining performance on unperturbed examples. Finally, we conduct ablations\nto investigate the mechanism underlying these gains. Notably, we find that the\nmain causes of vulnerability of MLP-FT can be attributed to the misalignment\nbetween pre-training and fine-tuning tasks, and the randomly initialized MLP\nparameters.\n","authors":["Mrigank Raman","Pratyush Maini","J. Zico Kolter","Zachary C. 
Lipton","Danish Pruthi"],"pdf_url":"https://arxiv.org/pdf/2303.07320v2.pdf","comment":"Accepted to the EMNLP 2023 Conference"},{"id":"http://arxiv.org/abs/2305.13406v3","updated":"2023-12-06T00:19:24Z","published":"2023-05-22T18:43:31Z","title":"DADA: Dialect Adaptation via Dynamic Aggregation of Linguistic Rules","summary":" Existing large language models (LLMs) that mainly focus on Standard American\nEnglish (SAE) often lead to significantly worse performance when being applied\nto other English dialects. While existing mitigations tackle discrepancies for\nindividual target dialects, they assume access to high-accuracy dialect\nidentification systems. The boundaries between dialects are inherently\nflexible, making it difficult to categorize language into discrete predefined\ncategories. In this paper, we propose DADA (Dialect Adaptation via Dynamic\nAggregation), a modular approach to imbue SAE-trained models with\nmulti-dialectal robustness by composing adapters which handle specific\nlinguistic features. The compositional architecture of DADA allows for both\ntargeted adaptation to specific dialect variants and simultaneous adaptation to\nvarious dialects. We show that DADA is effective for both single task and\ninstruction finetuned language models, offering an extensible and interpretable\nframework for adapting existing LLMs to different English dialects.\n","authors":["Yanchen Liu","William Held","Diyi Yang"],"pdf_url":"https://arxiv.org/pdf/2305.13406v3.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2312.03195v1","updated":"2023-12-06T00:08:44Z","published":"2023-12-06T00:08:44Z","title":"Detecting Rumor Veracity with Only Textual Information by Double-Channel\n Structure","summary":" Kyle (1985) proposes two types of rumors: informed rumors which are based on\nsome private information and uninformed rumors which are not based on any\ninformation (i.e. bluffing). Also, prior studies find that when people have\ncredible source of information, they are likely to use a more confident textual\ntone in their spreading of rumors. Motivated by these theoretical findings, we\npropose a double-channel structure to determine the ex-ante veracity of rumors\non social media. Our ultimate goal is to classify each rumor into true, false,\nor unverifiable category. We first assign each text into either certain\n(informed rumor) or uncertain (uninformed rumor) category. Then, we apply lie\ndetection algorithm to informed rumors and thread-reply agreement detection\nalgorithm to uninformed rumors. Using the dataset of SemEval 2019 Task 7, which\nrequires ex-ante threefold classification (true, false, or unverifiable) of\nsocial media rumors, our model yields a macro-F1 score of 0.4027, outperforming\nall the baseline models and the second-place winner (Gorrell et al., 2019).\nFurthermore, we empirically validate that the double-channel structure\noutperforms single-channel structures which use either lie detection or\nagreement detection algorithm to all posts.\n","authors":["Alex Kim","Sangwon Yoon"],"pdf_url":"https://arxiv.org/pdf/2312.03195v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03194v1","updated":"2023-12-06T00:05:25Z","published":"2023-12-06T00:05:25Z","title":"Corporate Bankruptcy Prediction with Domain-Adapted BERT","summary":" This study performs BERT-based analysis, which is a representative\ncontextualized language model, on corporate disclosure data to predict\nimpending bankruptcies. 
Prior literature on bankruptcy prediction mainly\nfocuses on developing more sophisticated prediction methodologies with\nfinancial variables. However, in our study, we focus on improving the quality\nof input dataset. Specifically, we employ BERT model to perform sentiment\nanalysis on MD&A disclosures. We show that BERT outperforms dictionary-based\npredictions and Word2Vec-based predictions in terms of adjusted R-square in\nlogistic regression, k-nearest neighbor (kNN-5), and linear kernel support\nvector machine (SVM). Further, instead of pre-training the BERT model from\nscratch, we apply self-learning with confidence-based filtering to corporate\ndisclosure data (10-K). We achieve the accuracy rate of 91.56% and demonstrate\nthat the domain adaptation procedure brings a significant improvement in\nprediction accuracy.\n","authors":["Alex Kim","Sangwon Yoon"],"pdf_url":"https://arxiv.org/pdf/2312.03194v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.00245v2","updated":"2023-12-06T23:46:36Z","published":"2023-05-31T23:39:18Z","title":"From Pixels to UI Actions: Learning to Follow Instructions via Graphical\n User Interfaces","summary":" Much of the previous work towards digital agents for graphical user\ninterfaces (GUIs) has relied on text-based representations (derived from HTML\nor other structured data sources), which are not always readily available.\nThese input representations have been often coupled with custom, task-specific\naction spaces. This paper focuses on creating agents that interact with the\ndigital world using the same conceptual interface that humans commonly use --\nvia pixel-based screenshots and a generic action space corresponding to\nkeyboard and mouse actions. Building upon recent progress in pixel-based\npretraining, we show, for the first time, that it is possible for such agents\nto outperform human crowdworkers on the MiniWob++ benchmark of GUI-based\ninstruction following tasks.\n","authors":["Peter Shaw","Mandar Joshi","James Cohan","Jonathan Berant","Panupong Pasupat","Hexiang Hu","Urvashi Khandelwal","Kenton Lee","Kristina Toutanova"],"pdf_url":"https://arxiv.org/pdf/2306.00245v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09778v2","updated":"2023-12-06T23:18:44Z","published":"2023-08-18T18:58:54Z","title":"Towards Grounded Visual Spatial Reasoning in Multi-Modal Vision Language\n Models","summary":" With pre-training of vision-and-language models (VLMs) on large-scale\ndatasets of image-text pairs, several recent works showed that these\npre-trained models lack fine-grained understanding, such as the ability to\ncount and recognize verbs, attributes, or relationships. The focus of this work\nis to study the ability of these models to understand spatial relations.\nPreviously, this has been tackled using image-text matching (e.g., Visual\nSpatial Reasoning benchmark) or visual question answering (e.g., GQA or VQAv2),\nboth showing poor performance and a large gap compared to human performance. In\nthis work, we use explainability tools to understand the causes of poor\nperformance better and present an alternative fine-grained, compositional\napproach for ranking spatial clauses. We combine the evidence from grounding\nnoun phrases corresponding to objects and their locations to compute the final\nrank of the spatial clause. 
We demonstrate the approach on representative VLMs\n(such as LXMERT, GPV, and MDETR) and compare and highlight their abilities to\nreason about spatial relationships.\n","authors":["Navid Rajabi","Jana Kosecka"],"pdf_url":"https://arxiv.org/pdf/2308.09778v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03912v1","updated":"2023-12-06T21:12:22Z","published":"2023-12-06T21:12:22Z","title":"Collaboration or Corporate Capture? Quantifying NLP's Reliance on\n Industry Artifacts and Contributions","summary":" The advent of transformers, higher computational budgets, and big data has\nengendered remarkable progress in Natural Language Processing (NLP). Impressive\nperformance of industry pre-trained models has garnered public attention in\nrecent years and made news headlines. That these are industry models is\nnoteworthy. Rarely, if ever, are academic institutes producing exciting new NLP\nmodels. Using these models is critical for competing on NLP benchmarks and\ncorrespondingly to stay relevant in NLP research. We surveyed 100 papers\npublished at EMNLP 2022 to determine whether this phenomenon constitutes a\nreliance on industry for NLP publications.\n We find that there is indeed a substantial reliance. Citations of industry\nartifacts and contributions across categories is at least three times greater\nthan industry publication rates per year. Quantifying this reliance does not\nsettle how we ought to interpret the results. We discuss two possible\nperspectives in our discussion: 1) Is collaboration with industry still\ncollaboration in the absence of an alternative? Or 2) has free NLP inquiry been\ncaptured by the motivations and research direction of private corporations?\n","authors":["Will Aitken","Mohamed Abdalla","Karen Rudie","Catherine Stinson"],"pdf_url":"https://arxiv.org/pdf/2312.03912v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.01860v3","updated":"2023-12-06T21:09:01Z","published":"2023-09-04T23:31:29Z","title":"Attention-Driven Multi-Modal Fusion: Enhancing Sign Language Recognition\n and Translation","summary":" In this paper, we devise a mechanism for the addition of multi-modal\ninformation with an existing pipeline for continuous sign language recognition\nand translation. In our procedure, we have incorporated optical flow\ninformation with RGB images to enrich the features with movement-related\ninformation. This work studies the feasibility of such modality inclusion using\na cross-modal encoder. The plugin we have used is very lightweight and doesn't\nneed to include a separate feature extractor for the new modality in an\nend-to-end manner. We have applied the changes in both sign language\nrecognition and translation, improving the result in each case. We have\nevaluated the performance on the RWTH-PHOENIX-2014 dataset for sign language\nrecognition and the RWTH-PHOENIX-2014T dataset for translation. On the\nrecognition task, our approach reduced the WER by 0.9, and on the translation\ntask, our approach increased most of the BLEU scores by ~0.6 on the test set.\n","authors":["Zaber Ibn Abdul Hakim","Rasman Mubtasim Swargo","Muhammad Abdullah Adnan"],"pdf_url":"https://arxiv.org/pdf/2309.01860v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03905v1","updated":"2023-12-06T20:58:07Z","published":"2023-12-06T20:58:07Z","title":"A Pseudo-Semantic Loss for Autoregressive Models with Logical\n Constraints","summary":" Neuro-symbolic AI bridges the gap between purely symbolic and neural\napproaches to learning. 
This often requires maximizing the likelihood of a\nsymbolic constraint w.r.t the neural network's output distribution. Such output\ndistributions are typically assumed to be fully-factorized. This limits the\napplicability of neuro-symbolic learning to the more expressive autoregressive\ndistributions, e.g., transformers. Under such distributions, computing the\nlikelihood of even simple constraints is #P-hard. Instead of attempting to\nenforce the constraint on the entire output distribution, we propose to do so\non a random, local approximation thereof. More precisely, we optimize the\nlikelihood of the constraint under a pseudolikelihood-based approximation\ncentered around a model sample. Our approximation is factorized, allowing the\nreuse of solutions to sub-problems, a main tenet for efficiently computing\nneuro-symbolic losses. Moreover, it is a local, high-fidelity approximation of\nthe likelihood, exhibiting low entropy and KL-divergence around the model\nsample. We evaluate our approach on Sudoku and shortest-path prediction cast as\nautoregressive generation, and observe that we greatly improve upon the base\nmodel's ability to predict logically-consistent outputs. We also evaluate on\nthe task of detoxifying large language models. Using a simple constraint\ndisallowing a list of toxic words, we are able to steer the model's outputs\naway from toxic generations, achieving SoTA detoxification compared to previous\napproaches.\n","authors":["Kareem Ahmed","Kai-Wei Chang","Guy Van den Broeck"],"pdf_url":"https://arxiv.org/pdf/2312.03905v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03897v1","updated":"2023-12-06T20:41:47Z","published":"2023-12-06T20:41:47Z","title":"Revisiting the Optimality of Word Lengths","summary":" Zipf (1935) posited that wordforms are optimized to minimize utterances'\ncommunicative costs. Under the assumption that cost is given by an utterance's\nlength, he supported this claim by showing that words' lengths are inversely\ncorrelated with their frequencies. Communicative cost, however, can be\noperationalized in different ways. Piantadosi et al. (2011) claim that cost\nshould be measured as the distance between an utterance's information rate and\nchannel capacity, which we dub the channel capacity hypothesis (CCH) here.\nFollowing this logic, they then proposed that a word's length should be\nproportional to the expected value of its surprisal (negative log-probability\nin context). In this work, we show that Piantadosi et al.'s derivation does not\nminimize CCH's cost, but rather a lower bound, which we term CCH-lower. We\npropose a novel derivation, suggesting an improved way to minimize CCH's cost.\nUnder this method, we find that a language's word lengths should instead be\nproportional to the surprisal's expectation plus its variance-to-mean ratio.\nExperimentally, we compare these three communicative cost functions: Zipf's,\nCCH-lower , and CCH. Across 13 languages and several experimental settings, we\nfind that length is better predicted by frequency than either of the other\nhypotheses. In fact, when surprisal's expectation, or expectation plus\nvariance-to-mean ratio, is estimated using better language models, it leads to\nworse word length predictions. 
We take these results as evidence that Zipf's\nlongstanding hypothesis holds.\n","authors":["Tiago Pimentel","Clara Meister","Ethan Gotlieb Wilcox","Kyle Mahowald","Ryan Cotterell"],"pdf_url":"https://arxiv.org/pdf/2312.03897v1.pdf","comment":"Published at EMNLP 2023"},{"id":"http://arxiv.org/abs/2312.03872v1","updated":"2023-12-06T19:37:08Z","published":"2023-12-06T19:37:08Z","title":"The BigCode Project Governance Card","summary":" This document serves as an overview of the different mechanisms and areas of\ngovernance in the BigCode project. It aims to support transparency by providing\nrelevant information about choices that were made during the project to the\nbroader public, and to serve as an example of intentional governance of an open\nresearch project that future endeavors can leverage to shape their own\napproach. The first section, Project Structure, covers the project\norganization, its stated goals and values, its internal decision processes, and\nits funding and resources. The second section, Data and Model Governance,\ncovers decisions relating to the questions of data subject consent, privacy,\nand model release.\n","authors":[" BigCode collaboration","Sean Hughes","Harm de Vries","Jennifer Robinson","Carlos Muñoz Ferrandis","Loubna Ben Allal","Leandro von Werra","Jennifer Ding","Sebastien Paquet","Yacine Jernite"],"pdf_url":"https://arxiv.org/pdf/2312.03872v1.pdf","comment":"12 pages, related papers arXiv:2305.06161 and arXiv:2301.03988 and\n arXiv:2211.15533v1, learn more at https://www.bigcode-project.org/"},{"id":"http://arxiv.org/abs/2312.03863v1","updated":"2023-12-06T19:18:42Z","published":"2023-12-06T19:18:42Z","title":"Efficient Large Language Models: A Survey","summary":" Large Language Models (LLMs) have demonstrated remarkable capabilities in\nimportant tasks such as natural language understanding, language generation,\nand complex reasoning and have the potential to make a substantial impact on\nour society. Such capabilities, however, come with the considerable resources\nthey demand, highlighting the strong need to develop effective techniques for\naddressing their efficiency challenges. In this survey, we provide a systematic\nand comprehensive review of efficient LLMs research. We organize the literature\nin a taxonomy consisting of three main categories, covering distinct yet\ninterconnected efficient LLMs topics from model-centric, data-centric, and\nframework-centric perspective, respectively. We have also created a GitHub\nrepository where we compile the papers featured in this survey at\nhttps://github.com/AIoT-MLSys-Lab/EfficientLLMs,\nhttps://github.com/AIoT-MLSys-Lab/Efficient-LLMs-Survey, and will actively\nmaintain this repository and incorporate new research as it emerges. 
We hope\nour survey can serve as a valuable resource to help researchers and\npractitioners gain a systematic understanding of the research developments in\nefficient LLMs and inspire them to contribute to this important and exciting\nfield.\n","authors":["Zhongwei Wan","Xin Wang","Che Liu","Samiul Alam","Yu Zheng","Zhongnan Qu","Shen Yan","Yi Zhu","Quanlu Zhang","Mosharaf Chowdhury","Mi Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.03863v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11616v2","updated":"2023-12-06T19:00:13Z","published":"2023-10-17T22:42:12Z","title":"Unveiling the General Intelligence Factor in Language Models: A\n Psychometric Approach","summary":" This study uncovers the factor of general intelligence, or g, in language\nmodels, extending the psychometric theory traditionally applied to humans and\ncertain animal species. Utilizing factor analysis on two extensive datasets -\nOpen LLM Leaderboard with 1,232 models and General Language Understanding\nEvaluation (GLUE) Leaderboard with 88 models - we find compelling evidence for\na unidimensional, highly stable g factor that accounts for 85% of the variance\nin model performance. The study also finds a moderate correlation of .49\nbetween model size and g. The discovery of g in language models offers a\nunified metric for model evaluation and opens new avenues for more robust,\ng-based model ability assessment. These findings lay the foundation for\nunderstanding and future research on artificial general intelligence from a\npsychometric perspective and have practical implications for model evaluation\nand development.\n","authors":["David Ilić"],"pdf_url":"https://arxiv.org/pdf/2310.11616v2.pdf","comment":"10 pages (including appendix), 7 figures"},{"id":"http://arxiv.org/abs/2312.03818v1","updated":"2023-12-06T18:59:30Z","published":"2023-12-06T18:59:30Z","title":"Alpha-CLIP: A CLIP Model Focusing on Wherever You Want","summary":" Contrastive Language-Image Pre-training (CLIP) plays an essential role in\nextracting valuable content information from images across diverse tasks. It\naligns textual and visual modalities to comprehend the entire image, including\nall the details, even those irrelevant to specific tasks. However, for a finer\nunderstanding and controlled editing of images, it becomes crucial to focus on\nspecific regions of interest, which can be indicated as points, masks, or boxes\nby humans or perception models. To fulfill the requirements, we introduce\nAlpha-CLIP, an enhanced version of CLIP with an auxiliary alpha channel to\nsuggest attentive regions and fine-tuned with constructed millions of RGBA\nregion-text pairs. Alpha-CLIP not only preserves the visual recognition ability\nof CLIP but also enables precise control over the emphasis of image contents.\nIt demonstrates effectiveness in various tasks, including but not limited to\nopen-world recognition, multimodal large language models, and conditional 2D /\n3D generation. 
It has a strong potential to serve as a versatile tool for\nimage-related tasks.\n","authors":["Zeyi Sun","Ye Fang","Tong Wu","Pan Zhang","Yuhang Zang","Shu Kong","Yuanjun Xiong","Dahua Lin","Jiaqi Wang"],"pdf_url":"https://arxiv.org/pdf/2312.03818v1.pdf","comment":"project page: https://aleafy.github.io/alpha-clip; code:\n https://github.com/SunzeY/AlphaCLIP"},{"id":"http://arxiv.org/abs/2312.03815v1","updated":"2023-12-06T18:50:26Z","published":"2023-12-06T18:50:26Z","title":"LLM as OS (llmao), Agents as Apps: Envisioning AIOS, Agents and the\n AIOS-Agent Ecosystem","summary":" This paper envisions a revolutionary AIOS-Agent ecosystem, where Large\nLanguage Model (LLM) serves as the (Artificial) Intelligent Operating System\n(IOS, or AIOS)--an operating system ``with soul''. Upon this foundation, a\ndiverse range of LLM-based AI Agent Applications (Agents, or AAPs) are\ndeveloped, enriching the AIOS-Agent ecosystem and signaling a paradigm shift\nfrom the traditional OS-APP ecosystem. We envision that LLM's impact will not\nbe limited to the AI application level, instead, it will in turn revolutionize\nthe design and implementation of computer system, architecture, software, and\nprogramming language, featured by several main concepts: LLM as OS\n(system-level), Agents as Applications (application-level), Natural Language as\nProgramming Interface (user-level), and Tools as Devices/Libraries\n(hardware/middleware-level).\n","authors":["Yingqiang Ge","Yujie Ren","Wenyue Hua","Shuyuan Xu","Juntao Tan","Yongfeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.03815v1.pdf","comment":"35 pages, 4 figures"},{"id":"http://arxiv.org/abs/2312.03813v1","updated":"2023-12-06T18:27:07Z","published":"2023-12-06T18:27:07Z","title":"Improving Activation Steering in Language Models with Mean-Centring","summary":" Recent work in activation steering has demonstrated the potential to better\ncontrol the outputs of Large Language Models (LLMs), but it involves finding\nsteering vectors. This is difficult because engineers do not typically know how\nfeatures are represented in these models. We seek to address this issue by\napplying the idea of mean-centring to steering vectors. We find that taking the\naverage of activations associated with a target dataset, and then subtracting\nthe mean of all training activations, results in effective steering vectors. We\ntest this method on a variety of models on natural language tasks by steering\naway from generating toxic text, and steering the completion of a story towards\na target genre. We also apply mean-centring to extract function vectors, more\neffectively triggering the execution of a range of natural language tasks by a\nsignificant margin (compared to previous baselines). This suggests that\nmean-centring can be used to easily improve the effectiveness of activation\nsteering in a wide range of contexts.\n","authors":["Ole Jorgensen","Dylan Cope","Nandi Schoots","Murray Shanahan"],"pdf_url":"https://arxiv.org/pdf/2312.03813v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03789v1","updated":"2023-12-06T12:03:27Z","published":"2023-12-06T12:03:27Z","title":"Comparative Analysis of Multilingual Text Classification &\n Identification through Deep Learning and Embedding Visualization","summary":" This research conducts a comparative study on multilingual text\nclassification methods, utilizing deep learning and embedding visualization.\nThe study employs LangDetect, LangId, FastText, and Sentence Transformer on a\ndataset encompassing 17 languages. 
It explores dimensionality's impact on\nclustering, revealing FastText's clearer clustering in 2D visualization due to\nits extensive multilingual corpus training. Notably, the FastText multi-layer\nperceptron model achieved remarkable accuracy, precision, recall, and F1 score,\noutperforming the Sentence Transformer model. The study underscores the\neffectiveness of these techniques in multilingual text classification,\nemphasizing the importance of large multilingual corpora for training\nembeddings. It lays the groundwork for future research and assists\npractitioners in developing language detection and classification systems.\nAdditionally, it includes the comparison of multi-layer perceptron, LSTM, and\nConvolution models for classification.\n","authors":["Arinjay Wyawhare"],"pdf_url":"https://arxiv.org/pdf/2312.03789v1.pdf","comment":"9 Pages, 10 Figures, 1 Table"},{"id":"http://arxiv.org/abs/2312.03788v1","updated":"2023-12-06T11:10:55Z","published":"2023-12-06T11:10:55Z","title":"SmoothQuant+: Accurate and Efficient 4-bit Post-Training\n WeightQuantization for LLM","summary":" Large language models (LLMs) have shown remarkable capabilities in various\ntasks. However their huge model size and the consequent demand for\ncomputational and memory resources also pose challenges to model deployment.\nCurrently, 4-bit post-training quantization (PTQ) has achieved some success in\nLLMs, reducing the memory footprint by approximately 75% compared to FP16\nmodels, albeit with some accuracy loss. In this paper, we propose SmoothQuant+,\nan accurate and efficient 4-bit weight-only PTQ that requires no additional\ntraining, which enables lossless in accuracy for LLMs for the first time. Based\non the fact that the loss of weight quantization is amplified by the activation\noutliers, SmoothQuant+ smoothes the activation outliers by channel before\nquantization, while adjusting the corresponding weights for mathematical\nequivalence, and then performs group-wise 4-bit weight quantization for linear\nlayers. We have integrated SmoothQuant+ into the vLLM framework, an advanced\nhigh-throughput inference engine specially developed for LLMs, and equipped it\nwith an efficient W4A16 CUDA kernels, so that vLLM can seamlessly support\nSmoothQuant+ 4-bit weight quantization. Our results show that, with\nSmoothQuant+, the Code Llama-34B model can be quantized and deployed on a A100\n40GB GPU, achieving lossless accuracy and a throughput increase of 1.9 to 4.0\ntimes compared to the FP16 model deployed on two A100 40GB GPUs. Moreover, the\nlatency per token is only 68% of the FP16 model deployed on two A100 40GB GPUs.\nThis is the state-of-the-art 4-bit weight quantization for LLMs as we know.\n","authors":["Jiayi Pan","Chengcan Wang","Kaifu Zheng","Yangguang Li","Zhenyu Wang","Bin Feng"],"pdf_url":"https://arxiv.org/pdf/2312.03788v1.pdf","comment":null}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2312.03704v1","updated":"2023-12-06T18:59:58Z","published":"2023-12-06T18:59:58Z","title":"Relightable Gaussian Codec Avatars","summary":" The fidelity of relighting is bounded by both geometry and appearance\nrepresentations. For geometry, both mesh and volumetric approaches have\ndifficulty modeling intricate structures like 3D hair geometry. For appearance,\nexisting relighting models are limited in fidelity and often too slow to render\nin real-time with high-resolution continuous environments. 
In this work, we\npresent Relightable Gaussian Codec Avatars, a method to build high-fidelity\nrelightable head avatars that can be animated to generate novel expressions.\nOur geometry model based on 3D Gaussians can capture 3D-consistent\nsub-millimeter details such as hair strands and pores on dynamic face\nsequences. To support diverse materials of human heads such as the eyes, skin,\nand hair in a unified manner, we present a novel relightable appearance model\nbased on learnable radiance transfer. Together with global illumination-aware\nspherical harmonics for the diffuse components, we achieve real-time relighting\nwith spatially all-frequency reflections using spherical Gaussians. This\nappearance model can be efficiently relit under both point light and continuous\nillumination. We further improve the fidelity of eye reflections and enable\nexplicit gaze control by introducing relightable explicit eye models. Our\nmethod outperforms existing approaches without compromising real-time\nperformance. We also demonstrate real-time relighting of avatars on a tethered\nconsumer VR headset, showcasing the efficiency and fidelity of our avatars.\n","authors":["Shunsuke Saito","Gabriel Schwartz","Tomas Simon","Junxuan Li","Giljoo Nam"],"pdf_url":"https://arxiv.org/pdf/2312.03704v1.pdf","comment":"Website: https://shunsukesaito.github.io/rgca/"},{"id":"http://arxiv.org/abs/2312.03703v1","updated":"2023-12-06T18:59:44Z","published":"2023-12-06T18:59:44Z","title":"Skeleton-in-Context: Unified Skeleton Sequence Modeling with In-Context\n Learning","summary":" In-context learning provides a new perspective for multi-task modeling for\nvision and NLP. Under this setting, the model can perceive tasks from prompts\nand accomplish them without any extra task-specific head predictions or model\nfine-tuning. However, Skeleton sequence modeling via in-context learning\nremains unexplored. Directly applying existing in-context models from other\nareas onto skeleton sequences fails due to the inter-frame and cross-task pose\nsimilarity that makes it outstandingly hard to perceive the task correctly from\na subtle context. To address this challenge, we propose Skeleton-in-Context\n(SiC), an effective framework for in-context skeleton sequence modeling. Our\nSiC is able to handle multiple skeleton-based tasks simultaneously after a\nsingle training process and accomplish each task from context according to the\ngiven prompt. It can further generalize to new, unseen tasks according to\ncustomized prompts. To facilitate context perception, we additionally propose a\ntask-unified prompt, which adaptively learns tasks of different natures, such\nas partial joint-level generation, sequence-level prediction, or 2D-to-3D\nmotion prediction. We conduct extensive experiments to evaluate the\neffectiveness of our SiC on multiple tasks, including motion prediction, pose\nestimation, joint completion, and future pose estimation. We also evaluate its\ngeneralization capability on unseen tasks such as motion-in-between. 
These\nexperiments show that our model achieves state-of-the-art multi-task\nperformance and even outperforms single-task methods on certain tasks.\n","authors":["Xinshun Wang","Zhongbin Fang","Xia Li","Xiangtai Li","Chen Chen","Mengyuan Liu"],"pdf_url":"https://arxiv.org/pdf/2312.03703v1.pdf","comment":"Project page: https://github.com/fanglaosi/Skeleton-in-Context"},{"id":"http://arxiv.org/abs/2312.03701v1","updated":"2023-12-06T18:59:31Z","published":"2023-12-06T18:59:31Z","title":"Self-conditioned Image Generation via Generating Representations","summary":" This paper presents $\\textbf{R}$epresentation-$\\textbf{C}$onditioned image\n$\\textbf{G}$eneration (RCG), a simple yet effective image generation framework\nwhich sets a new benchmark in class-unconditional image generation. RCG does\nnot condition on any human annotations. Instead, it conditions on a\nself-supervised representation distribution which is mapped from the image\ndistribution using a pre-trained encoder. During generation, RCG samples from\nsuch representation distribution using a representation diffusion model (RDM),\nand employs a pixel generator to craft image pixels conditioned on the sampled\nrepresentation. Such a design provides substantial guidance during the\ngenerative process, resulting in high-quality image generation. Tested on\nImageNet 256$\\times$256, RCG achieves a Frechet Inception Distance (FID) of\n3.31 and an Inception Score (IS) of 253.4. These results not only significantly\nimprove the state-of-the-art of class-unconditional image generation but also\nrival the current leading methods in class-conditional image generation,\nbridging the long-standing performance gap between these two tasks. Code is\navailable at https://github.com/LTH14/rcg.\n","authors":["Tianhong Li","Dina Katabi","Kaiming He"],"pdf_url":"https://arxiv.org/pdf/2312.03701v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03700v1","updated":"2023-12-06T18:59:19Z","published":"2023-12-06T18:59:19Z","title":"OneLLM: One Framework to Align All Modalities with Language","summary":" Multimodal large language models (MLLMs) have gained significant attention\ndue to their strong multimodal understanding capability. However, existing\nworks rely heavily on modality-specific encoders, which usually differ in\narchitecture and are limited to common modalities. In this paper, we present\nOneLLM, an MLLM that aligns eight modalities to language using a unified\nframework. We achieve this through a unified multimodal encoder and a\nprogressive multimodal alignment pipeline. In detail, we first train an image\nprojection module to connect a vision encoder with LLM. Then, we build a\nuniversal projection module (UPM) by mixing multiple image projection modules\nand dynamic routing. Finally, we progressively align more modalities to LLM\nwith the UPM. To fully leverage the potential of OneLLM in following\ninstructions, we also curated a comprehensive multimodal instruction dataset,\nincluding 2M items from image, audio, video, point cloud, depth/normal map, IMU\nand fMRI brain activity. OneLLM is evaluated on 25 diverse benchmarks,\nencompassing tasks such as multimodal captioning, question answering and\nreasoning, where it delivers excellent performance. 
Code, data, model and\nonline demo are available at https://github.com/csuhan/OneLLM\n","authors":["Jiaming Han","Kaixiong Gong","Yiyuan Zhang","Jiaqi Wang","Kaipeng Zhang","Dahua Lin","Yu Qiao","Peng Gao","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2312.03700v1.pdf","comment":"Code: https://github.com/csuhan/OneLLM"},{"id":"http://arxiv.org/abs/2312.03698v1","updated":"2023-12-06T18:59:03Z","published":"2023-12-06T18:59:03Z","title":"Intrinsic Harmonization for Illumination-Aware Compositing","summary":" Despite significant advancements in network-based image harmonization\ntechniques, there still exists a domain disparity between typical training\npairs and real-world composites encountered during inference. Most existing\nmethods are trained to reverse global edits made on segmented image regions,\nwhich fail to accurately capture the lighting inconsistencies between the\nforeground and background found in composited images. In this work, we\nintroduce a self-supervised illumination harmonization approach formulated in\nthe intrinsic image domain. First, we estimate a simple global lighting model\nfrom mid-level vision representations to generate a rough shading for the\nforeground region. A network then refines this inferred shading to generate a\nharmonious re-shading that aligns with the background scene. In order to match\nthe color appearance of the foreground and background, we utilize ideas from\nprior harmonization approaches to perform parameterized image edits in the\nalbedo domain. To validate the effectiveness of our approach, we present\nresults from challenging real-world composites and conduct a user study to\nobjectively measure the enhanced realism achieved compared to state-of-the-art\nharmonization methods.\n","authors":["Chris Careaga","Yağız Aksoy","S. Mahdi H. Miangoleh"],"pdf_url":"https://arxiv.org/pdf/2312.03698v1.pdf","comment":"10 pages, 8 figures. Accepted to SIGGRAPH Asia 2023 (Conference\n Track). Project page: https://yaksoy.github.io/intrinsicCompositing/"},{"id":"http://arxiv.org/abs/2312.03692v1","updated":"2023-12-06T18:54:44Z","published":"2023-12-06T18:54:44Z","title":"Memory Triggers: Unveiling Memorization in Text-To-Image Generative\n Models through Word-Level Duplication","summary":" Diffusion-based models, such as the Stable Diffusion model, have\nrevolutionized text-to-image synthesis with their ability to produce\nhigh-quality, high-resolution images. These advancements have prompted\nsignificant progress in image generation and editing tasks. However, these\nmodels also raise concerns due to their tendency to memorize and potentially\nreplicate exact training samples, posing privacy risks and enabling adversarial\nattacks. Duplication in training datasets is recognized as a major factor\ncontributing to memorization, and various forms of memorization have been\nstudied so far. This paper focuses on two distinct and underexplored types of\nduplication that lead to replication during inference in diffusion-based\nmodels, particularly in the Stable Diffusion model. 
We delve into these\nlesser-studied duplication phenomena and their implications through two case\nstudies, aiming to contribute to the safer and more responsible use of\ngenerative models in various applications.\n","authors":["Ali Naseh","Jaechul Roh","Amir Houmansadr"],"pdf_url":"https://arxiv.org/pdf/2312.03692v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02934v2","updated":"2023-12-06T18:45:29Z","published":"2023-12-05T18:05:14Z","title":"WoVoGen: World Volume-aware Diffusion for Controllable Multi-camera\n Driving Scene Generation","summary":" Generating multi-camera street-view videos is critical for augmenting\nautonomous driving datasets, addressing the urgent demand for extensive and\nvaried data. Due to the limitations in diversity and challenges in handling\nlighting conditions, traditional rendering-based methods are increasingly being\nsupplanted by diffusion-based methods. However, a significant challenge in\ndiffusion-based methods is ensuring that the generated sensor data preserve\nboth intra-world consistency and inter-sensor coherence. To address these\nchallenges, we combine an additional explicit world volume and propose the\nWorld Volume-aware Multi-camera Driving Scene Generator (WoVoGen). This system\nis specifically designed to leverage 4D world volume as a foundational element\nfor video generation. Our model operates in two distinct phases: (i)\nenvisioning the future 4D temporal world volume based on vehicle control\nsequences, and (ii) generating multi-camera videos, informed by this envisioned\n4D temporal world volume and sensor interconnectivity. The incorporation of the\n4D world volume empowers WoVoGen not only to generate high-quality street-view\nvideos in response to vehicle control inputs but also to facilitate scene\nediting tasks.\n","authors":["Jiachen Lu","Ze Huang","Jiahui Zhang","Zeyu Yang","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.02934v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03678v1","updated":"2023-12-06T18:41:01Z","published":"2023-12-06T18:41:01Z","title":"Hybrid Functional Maps for Crease-Aware Non-Isometric Shape Matching","summary":" Non-isometric shape correspondence remains a fundamental challenge in\ncomputer vision. Traditional methods using Laplace-Beltrami operator (LBO)\neigenmodes face limitations in characterizing high-frequency extrinsic shape\nchanges like bending and creases. We propose a novel approach of combining the\nnon-orthogonal extrinsic basis of eigenfunctions of the elastic thin-shell\nhessian with the intrinsic ones of the LBO, creating a hybrid spectral space in\nwhich we construct functional maps. To this end, we present a theoretical\nframework to effectively integrate non-orthogonal basis functions into\ndescriptor- and learning-based functional map methods. Our approach can be\nincorporated easily into existing functional map pipelines across varying\napplications and is able to handle complex deformations beyond isometries. We\nshow extensive evaluations across various supervised and unsupervised settings\nand demonstrate significant improvements. 
Notably, our approach achieves up to\n15% better mean geodesic error for non-isometric correspondence settings and up\nto 45% improvement in scenarios with topological noise.\n","authors":["Lennart Bastian","Yizheng Xie","Nassir Navab","Zorah Lähner"],"pdf_url":"https://arxiv.org/pdf/2312.03678v1.pdf","comment":"17 pages, 9 figures"},{"id":"http://arxiv.org/abs/2312.03667v1","updated":"2023-12-06T18:34:32Z","published":"2023-12-06T18:34:32Z","title":"WarpDiffusion: Efficient Diffusion Model for High-Fidelity Virtual\n Try-on","summary":" Image-based Virtual Try-On (VITON) aims to transfer an in-shop garment image\nonto a target person. While existing methods focus on warping the garment to\nfit the body pose, they often overlook the synthesis quality around the\ngarment-skin boundary and realistic effects like wrinkles and shadows on the\nwarped garments. These limitations greatly reduce the realism of the generated\nresults and hinder the practical application of VITON techniques. Leveraging\nthe notable success of diffusion-based models in cross-modal image synthesis,\nsome recent diffusion-based methods have ventured to tackle this issue.\nHowever, they tend to either consume a significant amount of training resources\nor struggle to achieve realistic try-on effects and retain garment details. For\nefficient and high-fidelity VITON, we propose WarpDiffusion, which bridges the\nwarping-based and diffusion-based paradigms via a novel informative and local\ngarment feature attention mechanism. Specifically, WarpDiffusion incorporates\nlocal texture attention to reduce resource consumption and uses a novel\nauto-mask module that effectively retains only the critical areas of the warped\ngarment while disregarding unrealistic or erroneous portions. Notably,\nWarpDiffusion can be integrated as a plug-and-play component into existing\nVITON methodologies, elevating their synthesis quality. Extensive experiments\non high-resolution VITON benchmarks and an in-the-wild test set demonstrate the\nsuperiority of WarpDiffusion, surpassing state-of-the-art methods both\nqualitatively and quantitatively.\n","authors":["xujie zhang","Xiu Li","Michael Kampffmeyer","Xin Dong","Zhenyu Xie","Feida Zhu","Haoye Dong","Xiaodan Liang"],"pdf_url":"https://arxiv.org/pdf/2312.03667v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03661v1","updated":"2023-12-06T18:32:33Z","published":"2023-12-06T18:32:33Z","title":"Reason2Drive: Towards Interpretable and Chain-based Reasoning for\n Autonomous Driving","summary":" Large vision-language models (VLMs) have garnered increasing interest in\nautonomous driving areas, due to their advanced capabilities in complex\nreasoning tasks essential for highly autonomous vehicle behavior. Despite their\npotential, research in autonomous systems is hindered by the lack of datasets\nwith annotated reasoning chains that explain the decision-making processes in\ndriving. To bridge this gap, we present Reason2Drive, a benchmark dataset with\nover 600K video-text pairs, aimed at facilitating the study of interpretable\nreasoning in complex driving environments. We distinctly characterize the\nautonomous driving process as a sequential combination of perception,\nprediction, and reasoning steps, and the question-answer pairs are\nautomatically collected from a diverse range of open-source outdoor driving\ndatasets, including nuScenes, Waymo and ONCE. 
Moreover, we introduce a novel\naggregated evaluation metric to assess chain-based reasoning performance in\nautonomous systems, addressing the semantic ambiguities of existing metrics\nsuch as BLEU and CIDEr. Based on the proposed benchmark, we conduct experiments\nto assess various existing VLMs, revealing insights into their reasoning\ncapabilities. Additionally, we develop an efficient approach to empower VLMs to\nleverage object-level perceptual elements in both feature extraction and\nprediction, further enhancing their reasoning accuracy. The code and dataset\nwill be released.\n","authors":["Ming Nie","Renyuan Peng","Chunwei Wang","Xinyue Cai","Jianhua Han","Hang Xu","Li Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.03661v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.14269v2","updated":"2023-12-06T18:21:17Z","published":"2023-05-23T17:20:47Z","title":"Source-Free Domain Adaptation for RGB-D Semantic Segmentation with\n Vision Transformers","summary":" With the increasing availability of depth sensors, multimodal frameworks that\ncombine color information with depth data are gaining interest. However, ground\ntruth data for semantic segmentation is burdensome to provide, thus making\ndomain adaptation a significant research area. Yet most domain adaptation\nmethods are not able to effectively handle multimodal data. Specifically, we\naddress the challenging source-free domain adaptation setting where the\nadaptation is performed without reusing source data. We propose MISFIT:\nMultImodal Source-Free Information fusion Transformer, a depth-aware framework\nwhich injects depth data into a segmentation module based on vision\ntransformers at multiple stages, namely at the input, feature and output\nlevels. Color and depth style transfer helps early-stage domain alignment while\nre-wiring self-attention between modalities creates mixed features, allowing\nthe extraction of better semantic content. Furthermore, a depth-based entropy\nminimization strategy is also proposed to adaptively weight regions at\ndifferent distances. Our framework, which is also the first approach using\nRGB-D vision transformers for source-free semantic segmentation, shows\nnoticeable performance improvements with respect to standard strategies.\n","authors":["Giulia Rizzoli","Donald Shenaj","Pietro Zanuttigh"],"pdf_url":"https://arxiv.org/pdf/2305.14269v2.pdf","comment":"WACV 2024, 2nd Workshop on Pretraining (WACVW)"},{"id":"http://arxiv.org/abs/2210.16380v3","updated":"2023-12-06T18:06:10Z","published":"2022-10-28T19:39:14Z","title":"Incorporating Crowdsourced Annotator Distributions into Ensemble\n Modeling to Improve Classification Trustworthiness for Ancient Greek Papyri","summary":" Performing classification on noisy, crowdsourced image datasets can prove\nchallenging even for the best neural networks. Two issues which complicate the\nproblem on such datasets are class imbalance and ground-truth uncertainty in\nlabeling. The AL-ALL and AL-PUB datasets - consisting of tightly cropped,\nindividual characters from images of ancient Greek papyri - are strongly\naffected by both issues. The application of ensemble modeling to such datasets\ncan help identify images where the ground-truth is questionable and quantify\nthe trustworthiness of those samples. As such, we apply stacked generalization\nconsisting of nearly identical ResNets with different loss functions: one\nutilizing sparse cross-entropy (CXE) and the other Kullback-Liebler Divergence\n(KLD). 
Both networks use labels drawn from a crowd-sourced consensus. This\nconsensus is derived from a Normalized Distribution of Annotations (NDA) based\non all annotations for a given character in the dataset. For the second\nnetwork, the KLD is calculated with respect to the NDA. For our ensemble model,\nwe apply a k-nearest neighbors model to the outputs of the CXE and KLD\nnetworks. Individually, the ResNet models have approximately 93% accuracy,\nwhile the ensemble model achieves an accuracy of > 95%, increasing the\nclassification trustworthiness. We also perform an analysis of the Shannon\nentropy of the various models' output distributions to measure classification\nuncertainty. Our results suggest that entropy is useful for predicting model\nmisclassifications.\n","authors":["Graham West","Matthew I. Swindall","Ben Keener","Timothy Player","Alex C. Williams","James H. Brusuelas","John F. Wallin"],"pdf_url":"https://arxiv.org/pdf/2210.16380v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03647v1","updated":"2023-12-06T18:05:41Z","published":"2023-12-06T18:05:41Z","title":"Editable Stain Transformation Of Histological Images Using Unpaired GANs","summary":" Double staining in histopathology, particularly for metaplastic breast\ncancer, typically employs H&E and P63 dyes. However, P63's tissue damage and\nhigh cost necessitate alternative methods. This study introduces xAI-CycleGAN,\nan advanced architecture combining Mask CycleGAN with explainability features\nand structure-preserving capabilities for transforming H&E stained breast\ntissue images into P63-like images. The architecture allows for output editing,\nenhancing resemblance to actual images and enabling further model refinement.\nWe showcase xAI-CycleGAN's efficacy in maintaining structural integrity and\ngenerating high-quality images. Additionally, a histopathologist survey\nindicates the generated images' realism is often comparable to actual images,\nvalidating our model's high-quality output.\n","authors":["Tibor Sloboda","Lukáš Hudec","Wanda Benešová"],"pdf_url":"https://arxiv.org/pdf/2312.03647v1.pdf","comment":"13 pages, 5 figures, AIRCAD ICIAP 2023 in Udine"},{"id":"http://arxiv.org/abs/2312.03641v1","updated":"2023-12-06T17:49:57Z","published":"2023-12-06T17:49:57Z","title":"MotionCtrl: A Unified and Flexible Motion Controller for Video\n Generation","summary":" Motions in a video primarily consist of camera motion, induced by camera\nmovement, and object motion, resulting from object movement. Accurate control\nof both camera and object motion is essential for video generation. However,\nexisting works either mainly focus on one type of motion or do not clearly\ndistinguish between the two, limiting their control capabilities and diversity.\nTherefore, this paper presents MotionCtrl, a unified and flexible motion\ncontroller for video generation designed to effectively and independently\ncontrol camera and object motion. The architecture and training strategy of\nMotionCtrl are carefully devised, taking into account the inherent properties\nof camera motion, object motion, and imperfect training data. Compared to\nprevious methods, MotionCtrl offers three main advantages: 1) It effectively\nand independently controls camera motion and object motion, enabling more\nfine-grained motion control and facilitating flexible and diverse combinations\nof both types of motion. 
2) Its motion conditions are determined by camera\nposes and trajectories, which are appearance-free and minimally impact the\nappearance or shape of objects in generated videos. 3) It is a relatively\ngeneralizable model that can adapt to a wide array of camera poses and\ntrajectories once trained. Extensive qualitative and quantitative experiments\nhave been conducted to demonstrate the superiority of MotionCtrl over existing\nmethods.\n","authors":["Zhouxia Wang","Ziyang Yuan","Xintao Wang","Tianshui Chen","Menghan Xia","Ping Luo","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2312.03641v1.pdf","comment":"Project Page: https://wzhouxiff.github.io/projects/MotionCtrl/"},{"id":"http://arxiv.org/abs/2312.03640v1","updated":"2023-12-06T17:47:16Z","published":"2023-12-06T17:47:16Z","title":"Training Neural Networks on RAW and HDR Images for Restoration Tasks","summary":" The vast majority of standard image and video content available online is\nrepresented in display-encoded color spaces, in which pixel values are\nconveniently scaled to a limited range (0-1) and the color distribution is\napproximately perceptually uniform. In contrast, both camera RAW and high\ndynamic range (HDR) images are often represented in linear color spaces, in\nwhich color values are linearly related to colorimetric quantities of light.\nWhile training on commonly available display-encoded images is a\nwell-established practice, there is no consensus on how neural networks should\nbe trained for tasks on RAW and HDR images in linear color spaces. In this\nwork, we test several approaches on three popular image restoration\napplications: denoising, deblurring, and single-image super-resolution. We\nexamine whether HDR/RAW images need to be display-encoded using popular\ntransfer functions (PQ, PU21, mu-law), or whether it is better to train in\nlinear color spaces, but use loss functions that correct for perceptual\nnon-uniformity. Our results indicate that neural networks train significantly\nbetter on HDR and RAW images represented in display-encoded color spaces, which\noffer better perceptual uniformity than linear spaces. This small change to the\ntraining strategy can bring a very substantial gain in performance, up to 10-15\ndB.\n","authors":["Lei Luo","Alexandre Chapiro","Xiaoyu Xiang","Yuchen Fan","Rakesh Ranjan","Rafal Mantiuk"],"pdf_url":"https://arxiv.org/pdf/2312.03640v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03631v1","updated":"2023-12-06T17:28:03Z","published":"2023-12-06T17:28:03Z","title":"MOCHa: Multi-Objective Reinforcement Mitigating Caption Hallucinations","summary":" While recent years have seen rapid progress in image-conditioned text\ngeneration, image captioning still suffers from the fundamental issue of\nhallucinations, the generation of spurious details that cannot be inferred from\nthe given image. Dedicated methods for reducing hallucinations in image\ncaptioning largely focus on closed-vocabulary object tokens, ignoring most\ntypes of hallucinations that occur in practice. In this work, we propose MOCHa,\nan approach that harnesses advancements in reinforcement learning (RL) to\naddress the sequence-level nature of hallucinations in an open-world setup. To\noptimize for caption fidelity to the input image, we leverage ground-truth\nreference captions as proxies to measure the logical consistency of generated\ncaptions. 
However, optimizing for caption fidelity alone fails to preserve the\nsemantic adequacy of generations; therefore, we propose a multi-objective\nreward function that jointly targets these qualities, without requiring any\nstrong supervision. We demonstrate that these goals can be simultaneously\noptimized with our framework, enhancing performance for various captioning\nmodels of different scales. Our qualitative and quantitative results\ndemonstrate MOCHa's superior performance across various established metrics. We\nalso demonstrate the benefit of our method in the open-vocabulary setting. To\nthis end, we contribute OpenCHAIR, a new benchmark for quantifying\nopen-vocabulary hallucinations in image captioning models, constructed using\ngenerative foundation models. We will release our code, benchmark, and trained\nmodels.\n","authors":["Assaf Ben-Kish","Moran Yanuka","Morris Alper","Raja Giryes","Hadar Averbuch-Elor"],"pdf_url":"https://arxiv.org/pdf/2312.03631v1.pdf","comment":"Website Link: https://assafbk.github.io/mocha/"},{"id":"http://arxiv.org/abs/2312.03628v1","updated":"2023-12-06T17:19:00Z","published":"2023-12-06T17:19:00Z","title":"Boosting Segment Anything Model Towards Open-Vocabulary Learning","summary":" The recent Segment Anything Model (SAM) has emerged as a new paradigmatic\nvision foundation model, showcasing potent zero-shot generalization and\nflexible prompting. Despite SAM finding applications and adaptations in various\ndomains, its primary limitation lies in the inability to grasp object\nsemantics. In this paper, we present Sambor to seamlessly integrate SAM with\nthe open-vocabulary object detector in an end-to-end framework. While retaining\nall the remarkable capabilities inherent to SAM, we enhance it with the\ncapacity to detect arbitrary objects based on human inputs like category names\nor reference expressions. To accomplish this, we introduce a novel SideFormer\nmodule that extracts SAM features to facilitate zero-shot object localization\nand inject comprehensive semantic information for open-vocabulary recognition.\nIn addition, we devise an open-set region proposal network (Open-set RPN),\nenabling the detector to acquire the open-set proposals generated by SAM.\nSambor demonstrates superior zero-shot performance across benchmarks, including\nCOCO and LVIS, proving highly competitive against previous SoTA methods. We\naspire for this work to serve as a meaningful endeavor in endowing SAM to\nrecognize diverse object categories and advancing open-vocabulary learning with\nthe support of vision foundation models.\n","authors":["Xumeng Han","Longhui Wei","Xuehui Yu","Zhiyang Dou","Xin He","Kuiran Wang","Zhenjun Han","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2312.03628v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18260v2","updated":"2023-12-06T17:16:07Z","published":"2023-11-30T05:38:34Z","title":"Consensus, dissensus and synergy between clinicians and specialist\n foundation models in radiology report generation","summary":" Radiology reports are an instrumental part of modern medicine, informing key\nclinical decisions such as diagnosis and treatment. 
The worldwide shortage of\nradiologists, however, restricts access to expert care and imposes heavy\nworkloads, contributing to avoidable errors and delays in report delivery.\nWhile recent progress in automated report generation with vision-language\nmodels offer clear potential in ameliorating the situation, the path to\nreal-world adoption has been stymied by the challenge of evaluating the\nclinical quality of AI-generated reports. In this study, we build a\nstate-of-the-art report generation system for chest radiographs,\n\\textit{Flamingo-CXR}, by fine-tuning a well-known vision-language foundation\nmodel on radiology data. To evaluate the quality of the AI-generated reports, a\ngroup of 16 certified radiologists provide detailed evaluations of AI-generated\nand human written reports for chest X-rays from an intensive care setting in\nthe United States and an inpatient setting in India. At least one radiologist\n(out of two per case) preferred the AI report to the ground truth report in\nover 60$\\%$ of cases for both datasets. Amongst the subset of AI-generated\nreports that contain errors, the most frequently cited reasons were related to\nthe location and finding, whereas for human written reports, most mistakes were\nrelated to severity and finding. This disparity suggested potential\ncomplementarity between our AI system and human experts, prompting us to\ndevelop an assistive scenario in which \\textit{Flamingo-CXR} generates a\nfirst-draft report, which is subsequently revised by a clinician. This is the\nfirst demonstration of clinician-AI collaboration for report writing, and the\nresultant reports are assessed to be equivalent or preferred by at least one\nradiologist to reports written by experts alone in 80$\\%$ of in-patient cases\nand 60$\\%$ of intensive care cases.\n","authors":["Ryutaro Tanno","David G. T. Barrett","Andrew Sellergren","Sumedh Ghaisas","Sumanth Dathathri","Abigail See","Johannes Welbl","Karan Singhal","Shekoofeh Azizi","Tao Tu","Mike Schaekermann","Rhys May","Roy Lee","SiWai Man","Zahra Ahmed","Sara Mahdavi","Danielle Belgrave","Vivek Natarajan","Shravya Shetty","Pushmeet Kohli","Po-Sen Huang","Alan Karthikesalingam","Ira Ktena"],"pdf_url":"https://arxiv.org/pdf/2311.18260v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03626v1","updated":"2023-12-06T17:13:15Z","published":"2023-12-06T17:13:15Z","title":"TokenCompose: Grounding Diffusion with Token-level Supervision","summary":" We present TokenCompose, a Latent Diffusion Model for text-to-image\ngeneration that achieves enhanced consistency between user-specified text\nprompts and model-generated images. Despite its tremendous success, the\nstandard denoising process in the Latent Diffusion Model takes text prompts as\nconditions only, absent explicit constraint for the consistency between the\ntext prompts and the image contents, leading to unsatisfactory results for\ncomposing multiple object categories. TokenCompose aims to improve\nmulti-category instance composition by introducing the token-wise consistency\nterms between the image content and object segmentation maps in the finetuning\nstage. 
TokenCompose can be applied directly to the existing training pipeline\nof text-conditioned diffusion models without extra human labeling information.\nBy finetuning Stable Diffusion, the model exhibits significant improvements in\nmulti-category instance composition and enhanced photorealism for its generated\nimages.\n","authors":["Zirui Wang","Zhizhou Sha","Zheng Ding","Yilin Wang","Zhuowen Tu"],"pdf_url":"https://arxiv.org/pdf/2312.03626v1.pdf","comment":"Preprint. Project link: https://mlpc-ucsd.github.io/TokenCompose"},{"id":"http://arxiv.org/abs/2312.03611v1","updated":"2023-12-06T16:55:53Z","published":"2023-12-06T16:55:53Z","title":"DreamComposer: Controllable 3D Object Generation via Multi-View\n Conditions","summary":" Utilizing pre-trained 2D large-scale generative models, recent works are\ncapable of generating high-quality novel views from a single in-the-wild image.\nHowever, due to the lack of information from multiple views, these works\nencounter difficulties in generating controllable novel views. In this paper,\nwe present DreamComposer, a flexible and scalable framework that can enhance\nexisting view-aware diffusion models by injecting multi-view conditions.\nSpecifically, DreamComposer first uses a view-aware 3D lifting module to obtain\n3D representations of an object from multiple views. Then, it renders the\nlatent features of the target view from 3D representations with the multi-view\nfeature fusion module. Finally the target view features extracted from\nmulti-view inputs are injected into a pre-trained diffusion model. Experiments\nshow that DreamComposer is compatible with state-of-the-art diffusion models\nfor zero-shot novel view synthesis, further enhancing them to generate\nhigh-fidelity novel view images with multi-view conditions, ready for\ncontrollable 3D object reconstruction and various other applications.\n","authors":["Yunhan Yang","Yukun Huang","Xiaoyang Wu","Yuan-Chen Guo","Song-Hai Zhang","Hengshuang Zhao","Tong He","Xihui Liu"],"pdf_url":"https://arxiv.org/pdf/2312.03611v1.pdf","comment":"Project Page: https://yhyang-myron.github.io/DreamComposer/"},{"id":"http://arxiv.org/abs/2312.03608v1","updated":"2023-12-06T16:54:24Z","published":"2023-12-06T16:54:24Z","title":"Automated Multimodal Data Annotation via Calibration With Indoor\n Positioning System","summary":" Learned object detection methods based on fusion of LiDAR and camera data\nrequire labeled training samples, but niche applications, such as warehouse\nrobotics or automated infrastructure, require semantic classes not available in\nlarge existing datasets. Therefore, to facilitate the rapid creation of\nmultimodal object detection datasets and alleviate the burden of human\nlabeling, we propose a novel automated annotation pipeline. Our method uses an\nindoor positioning system (IPS) to produce accurate detection labels for both\npoint clouds and images and eliminates manual annotation entirely. In an\nexperiment, the system annotates objects of interest 261.8 times faster than a\nhuman baseline and speeds up end-to-end dataset creation by 61.5%.\n","authors":["Ryan Rubel","Andrew Dudash","Mohammad Goli","James O'Hara","Karl Wunderlich"],"pdf_url":"https://arxiv.org/pdf/2312.03608v1.pdf","comment":"Contains eight pages and 10 figures. 
A version of this document was\n accepted to IEEE IRC 2023"},{"id":"http://arxiv.org/abs/2312.03606v1","updated":"2023-12-06T16:53:17Z","published":"2023-12-06T16:53:17Z","title":"DiffusionSat: A Generative Foundation Model for Satellite Imagery","summary":" Diffusion models have achieved state-of-the-art results on many modalities\nincluding images, speech, and video. However, existing models are not tailored\nto support remote sensing data, which is widely used in important applications\nincluding environmental monitoring and crop-yield prediction. Satellite images\nare significantly different from natural images -- they can be multi-spectral,\nirregularly sampled across time -- and existing diffusion models trained on\nimages from the Web do not support them. Furthermore, remote sensing data is\ninherently spatio-temporal, requiring conditional generation tasks not\nsupported by traditional methods based on captions or images. In this paper, we\npresent DiffusionSat, to date the largest generative foundation model trained\non a collection of publicly available large, high-resolution remote sensing\ndatasets. As text-based captions are sparsely available for satellite images,\nwe incorporate the associated metadata such as geolocation as conditioning\ninformation. Our method produces realistic samples and can be used to solve\nmultiple generative tasks including temporal generation, superresolution given\nmulti-spectral inputs and in-painting. Our method outperforms previous\nstate-of-the-art methods for satellite image generation and is the first\nlarge-scale $\\textit{generative}$ foundation model for satellite imagery.\n","authors":["Samar Khanna","Patrick Liu","Linqi Zhou","Chenlin Meng","Robin Rombach","Marshall Burke","David Lobell","Stefano Ermon"],"pdf_url":"https://arxiv.org/pdf/2312.03606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03596v1","updated":"2023-12-06T16:35:59Z","published":"2023-12-06T16:35:59Z","title":"MMM: Generative Masked Motion Model","summary":" Recent advances in text-to-motion generation using diffusion and\nautoregressive models have shown promising results. However, these models often\nsuffer from a trade-off between real-time performance, high fidelity, and\nmotion editability. To address this gap, we introduce MMM, a novel yet simple\nmotion generation paradigm based on Masked Motion Model. MMM consists of two\nkey components: (1) a motion tokenizer that transforms 3D human motion into a\nsequence of discrete tokens in latent space, and (2) a conditional masked\nmotion transformer that learns to predict randomly masked motion tokens,\nconditioned on the pre-computed text tokens. By attending to motion and text\ntokens in all directions, MMM explicitly captures inherent dependency among\nmotion tokens and semantic mapping between motion and text tokens. During\ninference, this allows parallel and iterative decoding of multiple motion\ntokens that are highly consistent with fine-grained text descriptions,\ntherefore simultaneously achieving high-fidelity and high-speed motion\ngeneration. In addition, MMM has innate motion editability. 
By simply placing\nmask tokens in the place that needs editing, MMM automatically fills the gaps\nwhile guaranteeing smooth transitions between editing and non-editing parts.\nExtensive experiments on the HumanML3D and KIT-ML datasets demonstrate that MMM\nsurpasses current leading methods in generating high-quality motion (evidenced\nby superior FID scores of 0.08 and 0.429), while offering advanced editing\nfeatures such as body-part modification, motion in-betweening, and the\nsynthesis of long motion sequences. In addition, MMM is two orders of magnitude\nfaster on a single mid-range GPU than editable motion diffusion models. Our\nproject page is available at \\url{https://exitudio.github.io/MMM-page}.\n","authors":["Ekkasit Pinyoanuntapong","Pu Wang","Minwoo Lee","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2312.03596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03594v1","updated":"2023-12-06T16:34:46Z","published":"2023-12-06T16:34:46Z","title":"A Task is Worth One Word: Learning with Task Prompts for High-Quality\n Versatile Image Inpainting","summary":" Achieving high-quality versatile image inpainting, where user-specified\nregions are filled with plausible content according to user intent, presents a\nsignificant challenge. Existing methods face difficulties in simultaneously\naddressing context-aware image inpainting and text-guided object inpainting due\nto the distinct optimal training strategies required. To overcome this\nchallenge, we introduce PowerPaint, the first high-quality and versatile\ninpainting model that excels in both tasks. First, we introduce learnable task\nprompts along with tailored fine-tuning strategies to guide the model's focus\non different inpainting targets explicitly. This enables PowerPaint to\naccomplish various inpainting tasks by utilizing different task prompts,\nresulting in state-of-the-art performance. Second, we demonstrate the\nversatility of the task prompt in PowerPaint by showcasing its effectiveness as\na negative prompt for object removal. Additionally, we leverage prompt\ninterpolation techniques to enable controllable shape-guided object inpainting.\nFinally, we extensively evaluate PowerPaint on various inpainting benchmarks to\ndemonstrate its superior performance for versatile image inpainting. We release\nour codes and models on our project page: https://powerpaint.github.io/.\n","authors":["Junhao Zhuang","Yanhong Zeng","Wenran Liu","Chun Yuan","Kai Chen"],"pdf_url":"https://arxiv.org/pdf/2312.03594v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03587v1","updated":"2023-12-06T16:24:47Z","published":"2023-12-06T16:24:47Z","title":"Language-Informed Visual Concept Learning","summary":" Our understanding of the visual world is centered around various concept\naxes, characterizing different aspects of visual entities. While different\nconcept axes can be easily specified by language, e.g. color, the exact visual\nnuances along each axis often exceed the limitations of linguistic\narticulations, e.g. a particular style of painting. In this work, our goal is\nto learn a language-informed visual concept representation, by simply\ndistilling large pre-trained vision-language models. Specifically, we train a\nset of concept encoders to encode the information pertinent to a set of\nlanguage-informed concept axes, with an objective of reproducing the input\nimage through a pre-trained Text-to-Image (T2I) model. 
To encourage better\ndisentanglement of different concept encoders, we anchor the concept embeddings\nto a set of text embeddings obtained from a pre-trained Visual Question\nAnswering (VQA) model. At inference time, the model extracts concept embeddings\nalong various axes from new test images, which can be remixed to generate\nimages with novel compositions of visual concepts. With a lightweight test-time\nfinetuning procedure, it can also generalize to novel concepts unseen at\ntraining.\n","authors":["Sharon Lee","Yunzhi Zhang","Shangzhe Wu","Jiajun Wu"],"pdf_url":"https://arxiv.org/pdf/2312.03587v1.pdf","comment":"The first two authors contributed equally and are alphabetically\n ordered. Project page:\n https://ai.stanford.edu/~yzzhang/projects/concept-axes/"},{"id":"http://arxiv.org/abs/2312.03585v1","updated":"2023-12-06T16:21:06Z","published":"2023-12-06T16:21:06Z","title":"Foundation Model Assisted Weakly Supervised Semantic Segmentation","summary":" This work aims to leverage pre-trained foundation models, such as contrastive\nlanguage-image pre-training (CLIP) and segment anything model (SAM), to address\nweakly supervised semantic segmentation (WSSS) using image-level labels. To\nthis end, we propose a coarse-to-fine framework based on CLIP and SAM for\ngenerating high-quality segmentation seeds. Specifically, we construct an image\nclassification task and a seed segmentation task, which are jointly performed\nby CLIP with frozen weights and two sets of learnable task-specific prompts. A\nSAM-based seeding (SAMS) module is designed and applied to each task to produce\neither coarse or fine seed maps. Moreover, we design a multi-label contrastive\nloss supervised by image-level labels and a CAM activation loss supervised by\nthe generated coarse seed map. These losses are used to learn the prompts,\nwhich are the only parts need to be learned in our framework. Once the prompts\nare learned, we input each image along with the learned segmentation-specific\nprompts into CLIP and the SAMS module to produce high-quality segmentation\nseeds. These seeds serve as pseudo labels to train an off-the-shelf\nsegmentation network like other two-stage WSSS methods. Experiments show that\nour method achieves the state-of-the-art performance on PASCAL VOC 2012 and\ncompetitive results on MS COCO 2014.\n","authors":["Xiaobo Yang","Xiaojin Gong"],"pdf_url":"https://arxiv.org/pdf/2312.03585v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03584v1","updated":"2023-12-06T16:19:51Z","published":"2023-12-06T16:19:51Z","title":"Context Diffusion: In-Context Aware Image Generation","summary":" We propose Context Diffusion, a diffusion-based framework that enables image\ngeneration models to learn from visual examples presented in context. Recent\nwork tackles such in-context learning for image generation, where a query image\nis provided alongside context examples and text prompts. However, the quality\nand fidelity of the generated images deteriorate when the prompt is not\npresent, demonstrating that these models are unable to truly learn from the\nvisual context. To address this, we propose a novel framework that separates\nthe encoding of the visual context and preserving the structure of the query\nimages. This results in the ability to learn from the visual context and text\nprompts, but also from either one of them. Furthermore, we enable our model to\nhandle few-shot settings, to effectively address diverse in-context learning\nscenarios. 
Our experiments and user study demonstrate that Context Diffusion\nexcels in both in-domain and out-of-domain tasks, resulting in an overall\nenhancement in image quality and fidelity compared to counterpart models.\n","authors":["Ivona Najdenkoska","Animesh Sinha","Abhimanyu Dubey","Dhruv Mahajan","Vignesh Ramanathan","Filip Radenovic"],"pdf_url":"https://arxiv.org/pdf/2312.03584v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03059v4","updated":"2023-12-06T16:16:38Z","published":"2023-10-04T16:49:36Z","title":"Point-PEFT: Parameter-Efficient Fine-Tuning for 3D Pre-trained Models","summary":" The popularity of pre-trained large models has revolutionized downstream\ntasks across diverse fields, such as language, vision, and multi-modality. To\nminimize the adaption cost for downstream tasks, many Parameter-Efficient\nFine-Tuning (PEFT) techniques are proposed for language and 2D image\npre-trained models. However, the specialized PEFT method for 3D pre-trained\nmodels is still under-explored. To this end, we introduce Point-PEFT, a novel\nframework for adapting point cloud pre-trained models with minimal learnable\nparameters. Specifically, for a pre-trained 3D model, we freeze most of its\nparameters, and only tune the newly added PEFT modules on downstream tasks,\nwhich consist of a Point-prior Prompt and a Geometry-aware Adapter. The\nPoint-prior Prompt adopts a set of learnable prompt tokens, for which we\npropose to construct a memory bank with domain-specific knowledge, and utilize\na parameter-free attention to enhance the prompt tokens. The Geometry-aware\nAdapter aims to aggregate point cloud features within spatial neighborhoods to\ncapture fine-grained geometric information through local interactions.\nExtensive experiments indicate that our Point-PEFT can achieve better\nperformance than the full fine-tuning on various downstream tasks, while using\nonly 5% of the trainable parameters, demonstrating the efficiency and\neffectiveness of our approach. Code will be released at\nhttps://github.com/Even-JK/PEFT-3D.\n","authors":["Ivan Tang","Ray Zhang","Zoey Guo","Xianzheng Ma","Dong Wang","Zhigang Wang","Bin Zhao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2310.03059v4.pdf","comment":"10 pages. The specialized PEFT framework for 3D pre-trained models,\n which achieves competitive performance to full fine-tuning, and significantly\n reduces the computational resources. Project page:\n https://github.com/Even-JK/PEFT-3D"},{"id":"http://arxiv.org/abs/2312.03568v1","updated":"2023-12-06T16:01:29Z","published":"2023-12-06T16:01:29Z","title":"DocBinFormer: A Two-Level Transformer Network for Effective Document\n Image Binarization","summary":" In real life, various degradation scenarios exist that might damage document\nimages, making it harder to recognize and analyze them, thus binarization is a\nfundamental and crucial step for achieving the most optimal performance in any\ndocument analysis task. We propose DocBinFormer (Document Binarization\nTransformer), a novel two-level vision transformer (TL-ViT) architecture based\non vision transformers for effective document image binarization. The presented\narchitecture employs a two-level transformer encoder to effectively capture\nboth global and local feature representation from the input images. These\ncomplimentary bi-level features are exploited for efficient document image\nbinarization, resulting in improved results for system-generated as well as\nhandwritten document images in a comprehensive approach. 
With the absence of\nconvolutional layers, the transformer encoder uses the pixel patches and\nsub-patches along with their positional information to operate directly on\nthem, while the decoder generates a clean (binarized) output image from the\nlatent representation of the patches. Instead of using a simple vision\ntransformer block to extract information from the image patches, the proposed\narchitecture uses two transformer blocks for greater coverage of the extracted\nfeature space on a global and local scale. The encoded feature representation\nis used by the decoder block to generate the corresponding binarized output.\nExtensive experiments on a variety of DIBCO and H-DIBCO benchmarks show that\nthe proposed model outperforms state-of-the-art techniques on four metrics. The\nsource code will be made available at\nhttps://github.com/RisabBiswas/DocBinFormer.\n","authors":["Risab Biswas","Swalpa Kumar Roy","Ning Wang","Umapada Pal","Guang-Bin Huang"],"pdf_url":"https://arxiv.org/pdf/2312.03568v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03562v1","updated":"2023-12-06T15:52:31Z","published":"2023-12-06T15:52:31Z","title":"Enhancing Kinship Verification through Multiscale Retinex and Combined\n Deep-Shallow features","summary":" The challenge of kinship verification from facial images represents a\ncutting-edge and formidable frontier in the realms of pattern recognition and\ncomputer vision. This area of study holds a myriad of potential applications,\nspanning from image annotation and forensic analysis to social media research.\nOur research stands out by integrating a preprocessing method named Multiscale\nRetinex (MSR), which elevates image quality and amplifies contrast, ultimately\nbolstering the end results. Strategically, our methodology capitalizes on the\nharmonious blend of deep and shallow texture descriptors, merging them\nproficiently at the score level through the Logistic Regression (LR) method. To\nelucidate, we employ the Local Phase Quantization (LPQ) descriptor to extract\nshallow texture characteristics. For deep feature extraction, we turn to the\nprowess of the VGG16 model, which is pre-trained on a convolutional neural\nnetwork (CNN). The robustness and efficacy of our method have been put to the\ntest through meticulous experiments on three rigorous kinship datasets, namely:\nCornell Kin Face, UB Kin Face, and TS Kin Face.\n","authors":["El Ouanas Belabbaci","Mohammed Khammari","Ammar Chouchane","Mohcene Bessaoudi","Abdelmalik Ouamane","Yassine Himeur","Shadi Atalla","Wathiq Mansoor"],"pdf_url":"https://arxiv.org/pdf/2312.03562v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.17903v3","updated":"2023-12-06T15:52:03Z","published":"2023-05-29T06:26:52Z","title":"Deeply Coupled Cross-Modal Prompt Learning","summary":" Recent advancements in multimodal foundation models (e.g., CLIP) have\nexcelled in zero-shot generalization. Prompt tuning involved in the knowledge\ntransfer from foundation models to downstream tasks has gained significant\nattention recently. Existing prompt-tuning methods in cross-modal learning,\nhowever, either solely focus on language branch, or learn vision-language\ninteraction in a shallow mechanism. In this context, we propose a Deeply\ncoupled Cross-modal Prompt learning (DCP) method based on CLIP. 
DCP flexibly\naccommodates the interplay between vision and language with a Cross-Modal\nPrompt Attention (CMPA) mechanism, which enables the mutual exchange of\nrespective representation through a well-connected multi-head attention module\nprogressively and strongly. We then conduct comprehensive few-shot learning\nexperiments on 11 image classification datasets and analyze the robustness to\ndomain shift as well. Thorough experimental analysis evidently demonstrates the\nsuperb few-shot generalization and compelling domain adaption capacity of a\nwell-executed DCP. The code can be found at https://github.com/GingL/CMPA.\n","authors":["Xuejing Liu","Wei Tang","Jinghui Lu","Rui Zhao","Zhaojun Guo","Fei Tan"],"pdf_url":"https://arxiv.org/pdf/2305.17903v3.pdf","comment":"Accepted by ACL 2023 findings"},{"id":"http://arxiv.org/abs/2312.03558v1","updated":"2023-12-06T15:40:28Z","published":"2023-12-06T15:40:28Z","title":"When an Image is Worth 1,024 x 1,024 Words: A Case Study in\n Computational Pathology","summary":" This technical report presents LongViT, a vision Transformer that can process\ngigapixel images in an end-to-end manner. Specifically, we split the gigapixel\nimage into a sequence of millions of patches and project them linearly into\nembeddings. LongNet is then employed to model the extremely long sequence,\ngenerating representations that capture both short-range and long-range\ndependencies. The linear computation complexity of LongNet, along with its\ndistributed algorithm, enables us to overcome the constraints of both\ncomputation and memory. We apply LongViT in the field of computational\npathology, aiming for cancer diagnosis and prognosis within gigapixel\nwhole-slide images. Experimental results demonstrate that LongViT effectively\nencodes gigapixel images and outperforms previous state-of-the-art methods on\ncancer subtyping and survival prediction. Code and models will be available at\nhttps://aka.ms/LongViT.\n","authors":["Wenhui Wang","Shuming Ma","Hanwen Xu","Naoto Usuyama","Jiayu Ding","Hoifung Poon","Furu Wei"],"pdf_url":"https://arxiv.org/pdf/2312.03558v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03556v1","updated":"2023-12-06T15:39:03Z","published":"2023-12-06T15:39:03Z","title":"Personalized Face Inpainting with Diffusion Models by Parallel Visual\n Attention","summary":" Face inpainting is important in various applications, such as photo\nrestoration, image editing, and virtual reality. Despite the significant\nadvances in face generative models, ensuring that a person's unique facial\nidentity is maintained during the inpainting process is still an elusive goal.\nCurrent state-of-the-art techniques, exemplified by MyStyle, necessitate\nresource-intensive fine-tuning and a substantial number of images for each new\nidentity. Furthermore, existing methods often fall short in accommodating\nuser-specified semantic attributes, such as beard or expression. To improve\ninpainting results, and reduce the computational complexity during inference,\nthis paper proposes the use of Parallel Visual Attention (PVA) in conjunction\nwith diffusion models. Specifically, we insert parallel attention matrices to\neach cross-attention module in the denoising network, which attends to features\nextracted from reference images by an identity encoder. We train the added\nattention modules and identity encoder on CelebAHQ-IDI, a dataset proposed for\nidentity-preserving face inpainting. 
Experiments demonstrate that PVA attains\nunparalleled identity resemblance in both face inpainting and face inpainting\nwith language guidance tasks, in comparison to various benchmarks, including\nMyStyle, Paint by Example, and Custom Diffusion. Our findings reveal that PVA\nensures good identity preservation while offering effective\nlanguage-controllability. Additionally, in contrast to Custom Diffusion, PVA\nrequires just 40 fine-tuning steps for each new identity, which translates to a\nsignificant speed increase of over 20 times.\n","authors":["Jianjin Xu","Saman Motamed","Praneetha Vaddamanu","Chen Henry Wu","Christian Haene","Jean-Charles Bazin","Fernando de la Torre"],"pdf_url":"https://arxiv.org/pdf/2312.03556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03548v1","updated":"2023-12-06T15:26:38Z","published":"2023-12-06T15:26:38Z","title":"Texture-Semantic Collaboration Network for ORSI Salient Object Detection","summary":" Salient object detection (SOD) in optical remote sensing images (ORSIs) has\nbecome increasingly popular recently. Due to the characteristics of ORSIs,\nORSI-SOD is full of challenges, such as multiple objects, small objects, low\nilluminations, and irregular shapes. To address these challenges, we propose a\nconcise yet effective Texture-Semantic Collaboration Network (TSCNet) to\nexplore the collaboration of texture cues and semantic cues for ORSI-SOD.\nSpecifically, TSCNet is based on the generic encoder-decoder structure. In\naddition to the encoder and decoder, TSCNet includes a vital Texture-Semantic\nCollaboration Module (TSCM), which performs valuable feature modulation and\ninteraction on basic features extracted from the encoder. The main idea of our\nTSCM is to make full use of the texture features at the lowest level and the\nsemantic features at the highest level to achieve the expression enhancement of\nsalient regions on features. In the TSCM, we first enhance the position of\npotential salient regions using semantic features. Then, we render and restore\nthe object details using the texture features. Meanwhile, we also perceive\nregions of various scales, and construct interactions between different\nregions. Thanks to the perfect combination of TSCM and generic structure, our\nTSCNet can take care of both the position and details of salient objects,\neffectively handling various scenes. Extensive experiments on three datasets\ndemonstrate that our TSCNet achieves competitive performance compared to 14\nstate-of-the-art methods. The code and results of our method are available at\nhttps://github.com/MathLee/TSCNet.\n","authors":["Gongyang Li","Zhen Bai","Zhi Liu"],"pdf_url":"https://arxiv.org/pdf/2312.03548v1.pdf","comment":"5 pages, 3 figures, Accepted by IEEE Transactions on Circuits and\n Systems II: Express Briefs 2023"},{"id":"http://arxiv.org/abs/2206.04425v2","updated":"2023-12-06T15:20:41Z","published":"2022-06-09T11:27:26Z","title":"Multiple Instance Learning for Digital Pathology: A Review on the\n State-of-the-Art, Limitations & Future Potential","summary":" Digital whole slides images contain an enormous amount of information\nproviding a strong motivation for the development of automated image analysis\ntools. Particularly deep neural networks show high potential with respect to\nvarious tasks in the field of digital pathology. 
However, a limitation is given\nby the fact that typical deep learning algorithms require (manual) annotations\nin addition to the large amounts of image data, to enable effective training.\nMultiple instance learning exhibits a powerful tool for learning deep neural\nnetworks in a scenario without fully annotated data. These methods are\nparticularly effective in this domain, due to the fact that labels for a\ncomplete whole slide image are often captured routinely, whereas labels for\npatches, regions or pixels are not. This potential already resulted in a\nconsiderable number of publications, with the majority published in the last\nthree years. Besides the availability of data and a high motivation from the\nmedical perspective, the availability of powerful graphics processing units\nexhibits an accelerator in this field. In this paper, we provide an overview of\nwidely and effectively used concepts of used deep multiple instance learning\napproaches, recent advances and also critically discuss remaining challenges\nand future potential.\n","authors":["Michael Gadermayr","Maximilian Tschuchnig"],"pdf_url":"https://arxiv.org/pdf/2206.04425v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03543v1","updated":"2023-12-06T15:14:30Z","published":"2023-12-06T15:14:30Z","title":"GPT-4 Enhanced Multimodal Grounding for Autonomous Driving: Leveraging\n Cross-Modal Attention with Large Language Models","summary":" In the field of autonomous vehicles (AVs), accurately discerning commander\nintent and executing linguistic commands within a visual context presents a\nsignificant challenge. This paper introduces a sophisticated encoder-decoder\nframework, developed to address visual grounding in AVs.Our Context-Aware\nVisual Grounding (CAVG) model is an advanced system that integrates five core\nencoders-Text, Image, Context, and Cross-Modal-with a Multimodal decoder. This\nintegration enables the CAVG model to adeptly capture contextual semantics and\nto learn human emotional features, augmented by state-of-the-art Large Language\nModels (LLMs) including GPT-4. The architecture of CAVG is reinforced by the\nimplementation of multi-head cross-modal attention mechanisms and a\nRegion-Specific Dynamic (RSD) layer for attention modulation. This\narchitectural design enables the model to efficiently process and interpret a\nrange of cross-modal inputs, yielding a comprehensive understanding of the\ncorrelation between verbal commands and corresponding visual scenes. Empirical\nevaluations on the Talk2Car dataset, a real-world benchmark, demonstrate that\nCAVG establishes new standards in prediction accuracy and operational\nefficiency. Notably, the model exhibits exceptional performance even with\nlimited training data, ranging from 50% to 75% of the full dataset. This\nfeature highlights its effectiveness and potential for deployment in practical\nAV applications. Moreover, CAVG has shown remarkable robustness and\nadaptability in challenging scenarios, including long-text command\ninterpretation, low-light conditions, ambiguous command contexts, inclement\nweather conditions, and densely populated urban environments. 
The code for the\nproposed model is available at our Github.\n","authors":["Haicheng Liao","Huanming Shen","Zhenning Li","Chengyue Wang","Guofa Li","Yiming Bie","Chengzhong Xu"],"pdf_url":"https://arxiv.org/pdf/2312.03543v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03540v1","updated":"2023-12-06T15:07:12Z","published":"2023-12-06T15:07:12Z","title":"FoodFusion: A Latent Diffusion Model for Realistic Food Image Generation","summary":" Current state-of-the-art image generation models such as Latent Diffusion\nModels (LDMs) have demonstrated the capacity to produce visually striking\nfood-related images. However, these generated images often exhibit an artistic\nor surreal quality that diverges from the authenticity of real-world food\nrepresentations. This inadequacy renders them impractical for applications\nrequiring realistic food imagery, such as training models for image-based\ndietary assessment. To address these limitations, we introduce FoodFusion, a\nLatent Diffusion model engineered specifically for the faithful synthesis of\nrealistic food images from textual descriptions. The development of the\nFoodFusion model involves harnessing an extensive array of open-source food\ndatasets, resulting in over 300,000 curated image-caption pairs. Additionally,\nwe propose and employ two distinct data cleaning methodologies to ensure that\nthe resulting image-text pairs maintain both realism and accuracy. The\nFoodFusion model, thus trained, demonstrates a remarkable ability to generate\nfood images that exhibit a significant improvement in terms of both realism and\ndiversity over the publicly available image generation models. We openly share\nthe dataset and fine-tuned models to support advancements in this critical\nfield of food image synthesis at https://bit.ly/genai4good.\n","authors":["Olivia Markham","Yuhao Chen","Chi-en Amy Tai","Alexander Wong"],"pdf_url":"https://arxiv.org/pdf/2312.03540v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.06930v3","updated":"2023-12-06T14:59:53Z","published":"2022-11-13T15:41:50Z","title":"PaintNet: Unstructured Multi-Path Learning from 3D Point Clouds for\n Robotic Spray Painting","summary":" Popular industrial robotic problems such as spray painting and welding\nrequire (i) conditioning on free-shape 3D objects and (ii) planning of multiple\ntrajectories to solve the task. Yet, existing solutions make strong assumptions\non the form of input surfaces and the nature of output paths, resulting in\nlimited approaches unable to cope with real-data variability. By leveraging on\nrecent advances in 3D deep learning, we introduce a novel framework capable of\ndealing with arbitrary 3D surfaces, and handling a variable number of unordered\noutput paths (i.e. unstructured). Our approach predicts local path segments,\nwhich can be later concatenated to reconstruct long-horizon paths. We\nextensively validate the proposed method in the context of robotic spray\npainting by releasing PaintNet, the first public dataset of expert\ndemonstrations on free-shape 3D objects collected in a real industrial\nscenario. 
A thorough experimental analysis demonstrates the capabilities of our\nmodel to promptly predict smooth output paths that cover up to 95% of\npreviously unseen object surfaces, even without explicitly optimizing for paint\ncoverage.\n","authors":["Gabriele Tiboni","Raffaello Camoriano","Tatiana Tommasi"],"pdf_url":"https://arxiv.org/pdf/2211.06930v3.pdf","comment":"Presented as conference paper at IEEE/RSJ IROS 2023, Detroit, USA.\n Project website at https://gabrieletiboni.github.io/paintnet"},{"id":"http://arxiv.org/abs/2312.03533v1","updated":"2023-12-06T14:54:10Z","published":"2023-12-06T14:54:10Z","title":"Low-shot Object Learning with Mutual Exclusivity Bias","summary":" This paper introduces Low-shot Object Learning with Mutual Exclusivity Bias\n(LSME), the first computational framing of mutual exclusivity bias, a\nphenomenon commonly observed in infants during word learning. We provide a\nnovel dataset, comprehensive baselines, and a state-of-the-art method to enable\nthe ML community to tackle this challenging learning task. The goal of LSME is\nto analyze an RGB image of a scene containing multiple objects and correctly\nassociate a previously-unknown object instance with a provided category label.\nThis association is then used to perform low-shot learning to test category\ngeneralization. We provide a data generation pipeline for the LSME problem and\nconduct a thorough analysis of the factors that contribute to its difficulty.\nAdditionally, we evaluate the performance of multiple baselines, including\nstate-of-the-art foundation models. Finally, we present a baseline approach\nthat outperforms state-of-the-art models in terms of low-shot accuracy.\n","authors":["Anh Thai","Ahmad Humayun","Stefan Stojanov","Zixuan Huang","Bikram Boote","James M. Rehg"],"pdf_url":"https://arxiv.org/pdf/2312.03533v1.pdf","comment":"Accepted at NeurIPS 2023, Datasets and Benchmarks Track. Project\n website https://ngailapdi.github.io/projects/lsme/"},{"id":"http://arxiv.org/abs/2312.03528v1","updated":"2023-12-06T14:43:38Z","published":"2023-12-06T14:43:38Z","title":"Personalized Pose Forecasting","summary":" Human pose forecasting is the task of predicting articulated human motion\ngiven past human motion. There exists a number of popular benchmarks that\nevaluate an array of different models performing human pose forecasting. These\nbenchmarks do not reflect that a human interacting system, such as a delivery\nrobot, observes and plans for the motion of the same individual over an\nextended period of time. Every individual has unique and distinct movement\npatterns. This is however not reflected in existing benchmarks that evaluate a\nmodel's ability to predict an average human's motion rather than a particular\nindividual's. We reformulate the human motion forecasting problem and present a\nmodel-agnostic personalization method. 
Motion forecasting personalization can\nbe performed efficiently online by utilizing a low-parametric time-series\nanalysis model that personalizes neural network pose predictions.\n","authors":["Maria Priisalu","Ted Kronvall","Cristian Sminchisescu"],"pdf_url":"https://arxiv.org/pdf/2312.03528v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03526v1","updated":"2023-12-06T14:40:05Z","published":"2023-12-06T14:40:05Z","title":"On the Diversity and Realism of Distilled Dataset: An Efficient Dataset\n Distillation Paradigm","summary":" Contemporary machine learning requires training large neural networks on\nmassive datasets and thus faces the challenges of high computational demands.\nDataset distillation, as a recent emerging strategy, aims to compress\nreal-world datasets for efficient training. However, this line of research\ncurrently struggle with large-scale and high-resolution datasets, hindering its\npracticality and feasibility. To this end, we re-examine the existing dataset\ndistillation methods and identify three properties required for large-scale\nreal-world applications, namely, realism, diversity, and efficiency. As a\nremedy, we propose RDED, a novel computationally-efficient yet effective data\ndistillation paradigm, to enable both diversity and realism of the distilled\ndata. Extensive empirical results over various neural architectures and\ndatasets demonstrate the advancement of RDED: we can distill the full\nImageNet-1K to a small dataset comprising 10 images per class within 7 minutes,\nachieving a notable 42% top-1 accuracy with ResNet-18 on a single RTX-4090 GPU\n(while the SOTA only achieves 21% but requires 6 hours).\n","authors":["Peng Sun","Bei Shi","Daiwei Yu","Tao Lin"],"pdf_url":"https://arxiv.org/pdf/2312.03526v1.pdf","comment":"17 pages, 20 figures"},{"id":"http://arxiv.org/abs/2312.03520v1","updated":"2023-12-06T14:29:16Z","published":"2023-12-06T14:29:16Z","title":"Defense Against Adversarial Attacks using Convolutional Auto-Encoders","summary":" Deep learning models, while achieving state-of-the-art performance on many\ntasks, are susceptible to adversarial attacks that exploit inherent\nvulnerabilities in their architectures. Adversarial attacks manipulate the\ninput data with imperceptible perturbations, causing the model to misclassify\nthe data or produce erroneous outputs. This work is based on enhancing the\nrobustness of targeted classifier models against adversarial attacks. To\nachieve this, an convolutional autoencoder-based approach is employed that\neffectively counters adversarial perturbations introduced to the input images.\nBy generating images closely resembling the input images, the proposed\nmethodology aims to restore the model's accuracy.\n","authors":["Shreyasi Mandal"],"pdf_url":"https://arxiv.org/pdf/2312.03520v1.pdf","comment":"9 pages, 6 figures, 3 tables"},{"id":"http://arxiv.org/abs/2312.03519v1","updated":"2023-12-06T14:25:47Z","published":"2023-12-06T14:25:47Z","title":"Active Wildfires Detection and Dynamic Escape Routes Planning for Humans\n through Information Fusion between Drones and Satellites","summary":" UAVs are playing an increasingly important role in the field of wilderness\nrescue by virtue of their flexibility. This paper proposes a fusion of UAV\nvision technology and satellite image analysis technology for active wildfires\ndetection and road networks extraction of wildfire areas and real-time dynamic\nescape route planning for people in distress. 
Firstly, the fire source location\nand the segmentation of smoke and flames are targeted based on Sentinel 2\nsatellite imagery. Secondly, the road segmentation and the road condition\nassessment are performed by D-linkNet and NDVI values in the central area of\nthe fire source by UAV. Finally, the dynamic optimal route planning for humans\nin real time is performed by the weighted A* algorithm in the road network with\nthe dynamic fire spread model. Taking the Chongqing wildfire on August 24,\n2022, as a case study, the results demonstrate that the dynamic escape route\nplanning algorithm can provide an optimal real-time navigation path for humans\nin the presence of fire through the information fusion of UAVs and satellites.\n","authors":["Chang Liu","Tamas Sziranyi"],"pdf_url":"https://arxiv.org/pdf/2312.03519v1.pdf","comment":"6 pages, 10 figures, conference"},{"id":"http://arxiv.org/abs/2312.03517v1","updated":"2023-12-06T14:24:26Z","published":"2023-12-06T14:24:26Z","title":"FRDiff: Feature Reuse for Exquisite Zero-shot Acceleration of Diffusion\n Models","summary":" The substantial computational costs of diffusion models, particularly due to\nthe repeated denoising steps crucial for high-quality image generation, present\na major obstacle to their widespread adoption. While several studies have\nattempted to address this issue by reducing the number of score function\nevaluations using advanced ODE solvers without fine-tuning, the decreased\nnumber of denoising iterations misses the opportunity to update fine details,\nresulting in noticeable quality degradation. In our work, we introduce an\nadvanced acceleration technique that leverages the temporal redundancy inherent\nin diffusion models. Reusing feature maps with high temporal similarity opens\nup a new opportunity to save computation without sacrificing output quality. To\nrealize the practical benefits of this intuition, we conduct an extensive\nanalysis and propose a novel method, FRDiff. FRDiff is designed to harness the\nadvantages of both reduced NFE and feature reuse, achieving a Pareto frontier\nthat balances fidelity and latency trade-offs in various generative tasks.\n","authors":["Junhyuk So","Jungwon Lee","Eunhyeok Park"],"pdf_url":"https://arxiv.org/pdf/2312.03517v1.pdf","comment":"under review"},{"id":"http://arxiv.org/abs/2312.03511v1","updated":"2023-12-06T14:13:38Z","published":"2023-12-06T14:13:38Z","title":"Kandinsky 3.0 Technical Report","summary":" We present Kandinsky 3.0, a large-scale text-to-image generation model based\non latent diffusion, continuing the series of text-to-image Kandinsky models\nand reflecting our progress to achieve higher quality and realism of image\ngeneration. Compared to previous versions of Kandinsky 2.x, Kandinsky 3.0\nleverages a two times larger U-Net backbone, a ten times larger text encoder\nand removes diffusion mapping. We describe the architecture of the model, the\ndata collection procedure, the training technique, and the production system of\nuser interaction. We focus on the key components that, as we have identified as\na result of a large number of experiments, had the most significant impact on\nimproving the quality of our model compared to the others. By our side-by-side\ncomparisons, Kandinsky becomes better in text understanding and works better on\nspecific domains. 
Project page: https://ai-forever.github.io/Kandinsky-3\n","authors":["Vladimir Arkhipkin","Andrei Filatov","Viacheslav Vasilev","Anastasia Maltseva","Said Azizov","Igor Pavlov","Julia Agafonova","Andrey Kuznetsov","Denis Dimitrov"],"pdf_url":"https://arxiv.org/pdf/2312.03511v1.pdf","comment":"Project page: https://ai-forever.github.io/Kandinsky-3"},{"id":"http://arxiv.org/abs/2312.03509v1","updated":"2023-12-06T14:08:05Z","published":"2023-12-06T14:08:05Z","title":"Gravitational cell detection and tracking in fluorescence microscopy\n data","summary":" Automatic detection and tracking of cells in microscopy images are major\napplications of computer vision technologies in both biomedical research and\nclinical practice. Though machine learning methods are increasingly common in\nthese fields, classical algorithms still offer significant advantages for both\ntasks, including better explainability, faster computation, lower hardware\nrequirements and more consistent performance. In this paper, we present a novel\napproach based on gravitational force fields that can compete with, and\npotentially outperform modern machine learning models when applied to\nfluorescence microscopy images. This method includes detection, segmentation,\nand tracking elements, with the results demonstrated on a Cell Tracking\nChallenge dataset.\n","authors":["Nikomidisz Eftimiu","Michal Kozubek"],"pdf_url":"https://arxiv.org/pdf/2312.03509v1.pdf","comment":"5 pages, 2 figures, 1 formula, 1 table, submitted to the 21st\n International Symposium on Biomedical Imaging (ISBI 2024)"},{"id":"http://arxiv.org/abs/2312.03502v1","updated":"2023-12-06T13:59:22Z","published":"2023-12-06T13:59:22Z","title":"Improving the Generalization of Segmentation Foundation Model under\n Distribution Shift via Weakly Supervised Adaptation","summary":" The success of large language models has inspired the computer vision\ncommunity to explore image segmentation foundation model that is able to\nzero/few-shot generalize through prompt engineering. Segment-Anything(SAM),\namong others, is the state-of-the-art image segmentation foundation model\ndemonstrating strong zero/few-shot generalization. Despite the success, recent\nstudies reveal the weakness of SAM under strong distribution shift. In\nparticular, SAM performs awkwardly on corrupted natural images, camouflaged\nimages, medical images, etc. Motivated by the observations, we aim to develop a\nself-training based strategy to adapt SAM to target distribution. Given the\nunique challenges of large source dataset, high computation cost and incorrect\npseudo label, we propose a weakly supervised self-training architecture with\nanchor regularization and low-rank finetuning to improve the robustness and\ncomputation efficiency of adaptation. We validate the effectiveness on 5 types\nof downstream segmentation tasks including natural clean/corrupted images,\nmedical images, camouflaged images and robotic images. 
Our proposed method is\ntask-agnostic in nature and outperforms pre-trained SAM and state-of-the-art\ndomain adaptation methods on almost all downstream tasks with the same testing\nprompt inputs.\n","authors":["Haojie Zhang","Yongyi Su","Xun Xu","Kui Jia"],"pdf_url":"https://arxiv.org/pdf/2312.03502v1.pdf","comment":"20 pages, 12 figures"},{"id":"http://arxiv.org/abs/2308.12462v3","updated":"2023-12-06T13:46:05Z","published":"2023-08-23T22:55:45Z","title":"Overcoming Generic Knowledge Loss with Selective Parameter Update","summary":" Foundation models encompass an extensive knowledge base and offer remarkable\ntransferability. However, this knowledge becomes outdated or insufficient over\ntime. The challenge lies in continuously updating foundation models to\naccommodate novel information while retaining their original capabilities.\nLeveraging the fact that foundation models have initial knowledge on various\ntasks and domains, we propose a novel approach that, instead of updating all\nparameters equally, localizes the updates to a sparse set of parameters\nrelevant to the task being learned. We strike a balance between efficiency and\nnew task performance, while maintaining the transferability and\ngeneralizability of foundation models. We extensively evaluate our method on\nfoundational vision-language models with a diverse spectrum of continual\nlearning tasks. Our method improves the accuracy of the newly\nlearned tasks by up to 7% while preserving the pretraining knowledge with a\nnegligible decrease of 0.9% in accuracy on a representative control set.\n","authors":["Wenxuan Zhang","Paul Janson","Rahaf Aljundi","Mohamed Elhoseiny"],"pdf_url":"https://arxiv.org/pdf/2308.12462v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03490v1","updated":"2023-12-06T13:31:52Z","published":"2023-12-06T13:31:52Z","title":"PneumoLLM: Harnessing the Power of Large Language Model for\n Pneumoconiosis Diagnosis","summary":" The conventional pretraining-and-finetuning paradigm, while effective for\ncommon diseases with ample data, faces challenges in diagnosing data-scarce\noccupational diseases like pneumoconiosis. Recently, large language models\n(LLMs) have exhibited unprecedented ability when conducting multiple tasks in\ndialogue, bringing opportunities for diagnosis. A common strategy might involve\nusing adapter layers for vision-language alignment and diagnosis in a dialogic\nmanner. Yet, this approach often requires optimization of extensive learnable\nparameters in the text branch and the dialogue head, potentially diminishing\nthe LLMs' efficacy, especially with limited training data. In our work, we\ninnovate by eliminating the text branch and substituting the dialogue head with\na classification head. This approach presents a more effective method for\nharnessing LLMs in diagnosis with fewer learnable parameters. Furthermore, to\nbalance the retention of detailed image information with progression towards\naccurate diagnosis, we introduce the contextual multi-token engine. This engine\nis specialized in adaptively generating diagnostic tokens. Additionally, we\npropose the information emitter module, which unidirectionally emits\ninformation from image tokens to diagnosis tokens. Comprehensive experiments\nvalidate the superiority of our methods and the effectiveness of the proposed\nmodules. 
Our codes can be found at\nhttps://github.com/CodeMonsterPHD/PneumoLLM/tree/main.\n","authors":["Meiyue Song","Zhihua Yu","Jiaxin Wang","Jiarui Wang","Yuting Lu","Baicun Li","Xiaoxu Wang","Qinghua Huang","Zhijun Li","Nikolaos I. Kanellakis","Jiangfeng Liu","Jing Wang","Binglu Wang","Juntao Yang"],"pdf_url":"https://arxiv.org/pdf/2312.03490v1.pdf","comment":"submitted to Medical Image Analysis"},{"id":"http://arxiv.org/abs/2310.00500v2","updated":"2023-12-06T13:16:52Z","published":"2023-09-30T21:41:21Z","title":"Self-Supervised Open-Ended Classification with Small Visual Language\n Models","summary":" We present Self-Context Adaptation (SeCAt), a self-supervised approach that\nunlocks few-shot abilities for open-ended classification with small visual\nlanguage models. Our approach imitates image captions in a self-supervised way\nbased on clustering a large pool of images followed by assigning\nsemantically-unrelated names to clusters. By doing so, we construct a training\nsignal consisting of interleaved sequences of image and pseudocaption pairs and\na query image, which we denote as the 'self-context' sequence. Based on this\nsignal the model is trained to produce the right pseudo-caption. We demonstrate\nthe performance and flexibility of SeCAt on several multimodal few-shot\ndatasets, spanning various granularities. By using models with approximately 1B\nparameters we outperform the few-shot abilities of much larger models, such as\nFrozen and FROMAGe. SeCAt opens new possibilities for research and applications\nin open-ended few-shot learning that otherwise requires access to large or\nproprietary models.\n","authors":["Mohammad Mahdi Derakhshani","Ivona Najdenkoska","Cees G. M. Snoek","Marcel Worring","Yuki M. Asano"],"pdf_url":"https://arxiv.org/pdf/2310.00500v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03477v1","updated":"2023-12-06T13:10:02Z","published":"2023-12-06T13:10:02Z","title":"From Detection to Action Recognition: An Edge-Based Pipeline for Robot\n Human Perception","summary":" Mobile service robots are proving to be increasingly effective in a range of\napplications, such as healthcare, monitoring Activities of Daily Living (ADL),\nand facilitating Ambient Assisted Living (AAL). These robots heavily rely on\nHuman Action Recognition (HAR) to interpret human actions and intentions.\nHowever, for HAR to function effectively on service robots, it requires prior\nknowledge of human presence (human detection) and identification of individuals\nto monitor (human tracking). In this work, we propose an end-to-end pipeline\nthat encompasses the entire process, starting from human detection and\ntracking, leading to action recognition. The pipeline is designed to operate in\nnear real-time while ensuring all stages of processing are performed on the\nedge, reducing the need for centralised computation. To identify the most\nsuitable models for our mobile robot, we conducted a series of experiments\ncomparing state-of-the-art solutions based on both their detection performance\nand efficiency. To evaluate the effectiveness of our proposed pipeline, we\nproposed a dataset comprising daily household activities. 
By presenting our\nfindings and analysing the results, we demonstrate the efficacy of our approach\nin enabling mobile robots to understand and respond to human behaviour in\nreal-world scenarios relying mainly on the data from their RGB cameras.\n","authors":["Petros Toupas","Georgios Tsamis","Dimitrios Giakoumis","Konstantinos Votis","Dimitrios Tzovaras"],"pdf_url":"https://arxiv.org/pdf/2312.03477v1.pdf","comment":"7 pages, 10 figures, 2 tables"},{"id":"http://arxiv.org/abs/2312.03461v1","updated":"2023-12-06T12:36:53Z","published":"2023-12-06T12:36:53Z","title":"HiFi4G: High-Fidelity Human Performance Rendering via Compact Gaussian\n Splatting","summary":" We have recently seen tremendous progress in photo-real human modeling and\nrendering. Yet, efficiently rendering realistic human performance and\nintegrating it into the rasterization pipeline remains challenging. In this\npaper, we present HiFi4G, an explicit and compact Gaussian-based approach for\nhigh-fidelity human performance rendering from dense footage. Our core\nintuition is to marry the 3D Gaussian representation with non-rigid tracking,\nachieving a compact and compression-friendly representation. We first propose a\ndual-graph mechanism to obtain motion priors, with a coarse deformation graph\nfor effective initialization and a fine-grained Gaussian graph to enforce\nsubsequent constraints. Then, we utilize a 4D Gaussian optimization scheme with\nadaptive spatial-temporal regularizers to effectively balance the non-rigid\nprior and Gaussian updating. We also present a companion compression scheme\nwith residual compensation for immersive experiences on various platforms. It\nachieves a substantial compression rate of approximately 25 times, with less\nthan 2MB of storage per frame. 
Extensive experiments demonstrate the\neffectiveness of our approach, which significantly outperforms existing\napproaches in terms of optimization speed, rendering quality, and storage\noverhead.\n","authors":["Yuheng Jiang","Zhehao Shen","Penghao Wang","Zhuo Su","Yu Hong","Yingliang Zhang","Jingyi Yu","Lan Xu"],"pdf_url":"https://arxiv.org/pdf/2312.03461v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03459v1","updated":"2023-12-06T12:34:47Z","published":"2023-12-06T12:34:47Z","title":"F3-Pruning: A Training-Free and Generalized Pruning Strategy towards\n Faster and Finer Text-to-Video Synthesis","summary":" Recently, Text-to-Video (T2V) synthesis has undergone a breakthrough by\ntraining transformers or diffusion models on large-scale datasets.\nNevertheless, inferring such large models incurs huge costs. Previous inference\nacceleration works either require costly retraining or are model-specific. To\naddress this issue, instead of retraining, we explore the inference process of\ntwo mainstream T2V models using transformers and diffusion models. The\nexploration reveals the redundancy in temporal attention modules of both\nmodels, which are commonly utilized to establish temporal relations among\nframes. Consequently, we propose a training-free and generalized pruning\nstrategy called F3-Pruning to prune redundant temporal attention\nweights. Specifically, when aggregate temporal attention values are ranked below\na certain ratio, corresponding weights will be pruned. Extensive experiments on\nthree datasets using a classic transformer-based model CogVideo and a typical\ndiffusion-based model Tune-A-Video verify the effectiveness of F3-Pruning in\ninference acceleration, quality assurance and broad applicability.\n","authors":["Sitong Su","Jianzhi Liu","Lianli Gao","Jingkuan Song"],"pdf_url":"https://arxiv.org/pdf/2312.03459v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08577v3","updated":"2023-12-06T12:34:16Z","published":"2023-10-12T17:59:30Z","title":"Visual Data-Type Understanding does not emerge from Scaling\n Vision-Language Models","summary":" Recent advances in the development of vision-language models (VLMs) are\nyielding remarkable success in recognizing visual semantic content, including\nimpressive instances of compositional image understanding. Here, we introduce\nthe novel task of Visual Data-Type Identification, a basic perceptual skill\nwith implications for data curation (e.g., noisy data-removal from large\ndatasets, domain-specific retrieval) and autonomous vision (e.g.,\ndistinguishing changing weather conditions from camera lens staining). We\ndevelop two datasets consisting of animal images altered across a diverse set\nof 27 visual data-types, spanning four broad categories. An extensive zero-shot\nevaluation of 39 VLMs, ranging from 100M to 80B parameters, shows a nuanced\nperformance landscape. While VLMs are reasonably good at identifying certain\nstylistic \\textit{data-types}, such as cartoons and sketches, they struggle\nwith simpler data-types arising from basic manipulations like image rotations\nor additive noise. Our findings reveal that (i) model scaling alone yields\nmarginal gains for contrastively-trained models like CLIP, and (ii) there is a\npronounced drop in performance for the largest auto-regressively trained VLMs\nlike OpenFlamingo. This finding points to a blind spot in current frontier\nVLMs: they excel in recognizing semantic content but fail to acquire an\nunderstanding of visual data-types through scaling. 
By analyzing the\npre-training distributions of these models and incorporating data-type\ninformation into the captions during fine-tuning, we achieve a significant\nenhancement in performance. By exploring this previously uncharted task, we aim\nto set the stage for further advancing VLMs to equip them with visual data-type\nunderstanding. Code and datasets are released at\nhttps://github.com/bethgelab/DataTypeIdentification.\n","authors":["Vishaal Udandarao","Max F. Burg","Samuel Albanie","Matthias Bethge"],"pdf_url":"https://arxiv.org/pdf/2310.08577v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03455v1","updated":"2023-12-06T12:27:25Z","published":"2023-12-06T12:27:25Z","title":"Data is Overrated: Perceptual Metrics Can Lead Learning in the Absence\n of Training Data","summary":" Perceptual metrics are traditionally used to evaluate the quality of natural\nsignals, such as images and audio. They are designed to mimic the perceptual\nbehaviour of human observers and usually reflect structures found in natural\nsignals. This motivates their use as loss functions for training generative\nmodels such that models will learn to capture the structure held in the metric.\nWe take this idea to the extreme in the audio domain by training a compressive\nautoencoder to reconstruct uniform noise, in lieu of natural data. We show that\ntraining with perceptual losses improves the reconstruction of spectrograms and\nre-synthesized audio at test time over models trained with a standard Euclidean\nloss. This demonstrates better generalisation to unseen natural signals when\nusing perceptual metrics.\n","authors":["Tashi Namgyal","Alexander Hepburn","Raul Santos-Rodriguez","Valero Laparra","Jesus Malo"],"pdf_url":"https://arxiv.org/pdf/2312.03455v1.pdf","comment":"Machine Learning for Audio Workshop, NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.02663v2","updated":"2023-12-06T12:23:36Z","published":"2023-12-05T11:02:45Z","title":"FaceStudio: Put Your Face Everywhere in Seconds","summary":" This study investigates identity-preserving image synthesis, an intriguing\ntask in image generation that seeks to maintain a subject's identity while\nadding a personalized, stylistic touch. Traditional methods, such as Textual\nInversion and DreamBooth, have made strides in custom image creation, but they\ncome with significant drawbacks. These include the need for extensive resources\nand time for fine-tuning, as well as the requirement for multiple reference\nimages. To overcome these challenges, our research introduces a novel approach\nto identity-preserving synthesis, with a particular focus on human images. Our\nmodel leverages a direct feed-forward mechanism, circumventing the need for\nintensive fine-tuning, thereby facilitating quick and efficient image\ngeneration. Central to our innovation is a hybrid guidance framework, which\ncombines stylized images, facial images, and textual prompts to guide the image\ngeneration process. This unique combination enables our model to produce a\nvariety of applications, such as artistic portraits and identity-blended\nimages. 
Our experimental results, including both qualitative and quantitative\nevaluations, demonstrate the superiority of our method over existing baseline\nmodels and previous works, particularly in its remarkable efficiency and\nability to preserve the subject's identity with high fidelity.\n","authors":["Yuxuan Yan","Chi Zhang","Rui Wang","Yichao Zhou","Gege Zhang","Pei Cheng","Gang Yu","Bin Fu"],"pdf_url":"https://arxiv.org/pdf/2312.02663v2.pdf","comment":"Project homepage: https://icoz69.github.io/facestudio/"},{"id":"http://arxiv.org/abs/2312.03447v1","updated":"2023-12-06T12:10:56Z","published":"2023-12-06T12:10:56Z","title":"Quantum-Inspired Neural Network Model of Optical Illusions","summary":" Ambiguous optical illusions have been a paradigmatic object of fascination,\nresearch and inspiration in arts, psychology and video games. However, accurate\ncomputational models of perception of ambiguous figures have been elusive. In\nthis paper, we design and train a deep neural network model to simulate the\nhuman's perception of the Necker cube, an ambiguous drawing with several\nalternating possible interpretations. Defining the weights of the neural\nnetwork connection using a quantum generator of truly random numbers, in\nagreement with the emerging concepts of quantum artificial intelligence and\nquantum cognition we reveal that the actual perceptual state of the Necker cube\nis a qubit-like superposition of the two fundamental perceptual states\npredicted by classical theories. Our results will find applications in video\ngames and virtual reality systems employed for training of astronauts and\noperators of unmanned aerial vehicles. They will also be useful for researchers\nworking in the fields of machine learning and vision, psychology of perception\nand quantum-mechanical models of human mind and decision-making.\n","authors":["Ivan S. Maksymov"],"pdf_url":"https://arxiv.org/pdf/2312.03447v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03443v1","updated":"2023-12-06T11:54:50Z","published":"2023-12-06T11:54:50Z","title":"Data-driven Crop Growth Simulation on Time-varying Generated Images\n using Multi-conditional Generative Adversarial Networks","summary":" Image-based crop growth modeling can substantially contribute to precision\nagriculture by revealing spatial crop development over time, which allows an\nearly and location-specific estimation of relevant future plant traits, such as\nleaf area or biomass. A prerequisite for realistic and sharp crop image\ngeneration is the integration of multiple growth-influencing conditions in a\nmodel, such as an image of an initial growth stage, the associated growth time,\nand further information about the field treatment. We present a two-stage\nframework consisting first of an image prediction model and second of a growth\nestimation model, which both are independently trained. The image prediction\nmodel is a conditional Wasserstein generative adversarial network (CWGAN). In\nthe generator of this model, conditional batch normalization (CBN) is used to\nintegrate different conditions along with the input image. This allows the\nmodel to generate time-varying artificial images dependent on multiple\ninfluencing factors of different kinds. These images are used by the second\npart of the framework for plant phenotyping by deriving plant-specific traits\nand comparing them with those of non-artificial (real) reference images. 
For\nvarious crop datasets, the framework allows realistic, sharp image predictions\nwith a slight loss of quality from short-term to long-term predictions.\nSimulations of varying growth-influencing conditions performed with the trained\nframework provide valuable insights into how such factors relate to crop\nappearances, which is particularly useful in complex, less explored crop\nmixture systems. Further results show that adding process-based simulated\nbiomass as a condition increases the accuracy of the derived phenotypic traits\nfrom the predicted images. This demonstrates the potential of our framework to\nserve as an interface between an image- and process-based crop growth model.\n","authors":["Lukas Drees","Dereje T. Demie","Madhuri R. Paul","Johannes Leonhardt","Sabine J. Seidel","Thomas F. Döring","Ribana Roscher"],"pdf_url":"https://arxiv.org/pdf/2312.03443v1.pdf","comment":"26 pages, 16 figures, code available at\n https://github.com/luked12/crop-growth-cgan"},{"id":"http://arxiv.org/abs/2312.03442v1","updated":"2023-12-06T11:51:06Z","published":"2023-12-06T11:51:06Z","title":"High-Quality Facial Geometry and Appearance Capture at Home","summary":" Facial geometry and appearance capture have demonstrated tremendous success\nin 3D scanning real humans in studios. Recent works propose to democratize this\ntechnique while keeping the results high quality. However, they are still\ninconvenient for daily usage. In addition, they focus on an easier problem of\nonly capturing facial skin. This paper proposes a novel method for high-quality\nface capture, featuring an easy-to-use system and the capability to model the\ncomplete face with skin, mouth interior, hair, and eyes. We reconstruct facial\ngeometry and appearance from a single co-located smartphone flashlight sequence\ncaptured in a dim room where the flashlight is the dominant light source (e.g.\nrooms with curtains or at night). To model the complete face, we propose a\nnovel hybrid representation to effectively model both eyes and other facial\nregions, along with novel techniques to learn it from images. We apply a\ncombined lighting model to compactly represent real illuminations and exploit a\nmorphable face albedo model as a reflectance prior to disentangle diffuse and\nspecular. Experiments show that our method can capture high-quality 3D\nrelightable scans.\n","authors":["Yuxuan Han","Junfeng Lyu","Feng Xu"],"pdf_url":"https://arxiv.org/pdf/2312.03442v1.pdf","comment":"Project page: https://yxuhan.github.io/CoRA/index.html ; Github repo:\n https://github.com/yxuhan/CoRA"},{"id":"http://arxiv.org/abs/2312.03441v1","updated":"2023-12-06T11:50:14Z","published":"2023-12-06T11:50:14Z","title":"UFineBench: Towards Text-based Person Retrieval with Ultra-fine\n Granularity","summary":" Existing text-based person retrieval datasets often have relatively\ncoarse-grained text annotations. This hinders the model to comprehend the\nfine-grained semantics of query texts in real scenarios. To address this\nproblem, we contribute a new benchmark named \\textbf{UFineBench} for text-based\nperson retrieval with ultra-fine granularity.\n Firstly, we construct a new \\textbf{dataset} named UFine6926. We collect a\nlarge number of person images and manually annotate each image with two\ndetailed textual descriptions, averaging 80.8 words each. The average word\ncount is three to four times that of the previous datasets. 
In addition of\nstandard in-domain evaluation, we also propose a special \\textbf{evaluation\nparadigm} more representative of real scenarios. It contains a new evaluation\nset with cross domains, cross textual granularity and cross textual styles,\nnamed UFine3C, and a new evaluation metric for accurately measuring retrieval\nability, named mean Similarity Distribution (mSD). Moreover, we propose CFAM, a\nmore efficient \\textbf{algorithm} especially designed for text-based person\nretrieval with ultra fine-grained texts. It achieves fine granularity mining by\nadopting a shared cross-modal granularity decoder and hard negative match\nmechanism.\n With standard in-domain evaluation, CFAM establishes competitive performance\nacross various datasets, especially on our ultra fine-grained UFine6926.\nFurthermore, by evaluating on UFine3C, we demonstrate that training on our\nUFine6926 significantly improves generalization to real scenarios compared with\nother coarse-grained datasets. The dataset and code will be made publicly\navailable at \\url{https://github.com/Zplusdragon/UFineBench}.\n","authors":["Jialong Zuo","Hanyu Zhou","Ying Nie","Feng Zhang","Tianyu Guo","Nong Sang","Yunhe Wang","Changxin Gao"],"pdf_url":"https://arxiv.org/pdf/2312.03441v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03437v1","updated":"2023-12-06T11:38:26Z","published":"2023-12-06T11:38:26Z","title":"Data-Centric Digital Agriculture: A Perspective","summary":" In response to the increasing global demand for food, feed, fiber, and fuel,\ndigital agriculture is rapidly evolving to meet these demands while reducing\nenvironmental impact. This evolution involves incorporating data science,\nmachine learning, sensor technologies, robotics, and new management strategies\nto establish a more sustainable agricultural framework. So far, machine\nlearning research in digital agriculture has predominantly focused on\nmodel-centric approaches, focusing on model design and evaluation. These\nefforts aim to optimize model accuracy and efficiency, often treating data as a\nstatic benchmark. Despite the availability of agricultural data and\nmethodological advancements, a saturation point has been reached, with many\nestablished machine learning methods achieving comparable levels of accuracy\nand facing similar limitations. To fully realize the potential of digital\nagriculture, it is crucial to have a comprehensive understanding of the role of\ndata in the field and to adopt data-centric machine learning. This involves\ndeveloping strategies to acquire and curate valuable data and implementing\neffective learning and evaluation strategies that utilize the intrinsic value\nof data. This approach has the potential to create accurate, generalizable, and\nadaptable machine learning methods that effectively and sustainably address\nagricultural tasks such as yield prediction, weed detection, and early disease\nidentification\n","authors":["Ribana Roscher","Lukas Roth","Cyrill Stachniss","Achim Walter"],"pdf_url":"https://arxiv.org/pdf/2312.03437v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03431v1","updated":"2023-12-06T11:25:52Z","published":"2023-12-06T11:25:52Z","title":"Gaussian-Flow: 4D Reconstruction with Dynamic 3D Gaussian Particle","summary":" We introduce Gaussian-Flow, a novel point-based approach for fast dynamic\nscene reconstruction and real-time rendering from both multi-view and monocular\nvideos. 
In contrast to the prevalent NeRF-based approaches hampered by slow\ntraining and rendering speeds, our approach harnesses recent advancements in\npoint-based 3D Gaussian Splatting (3DGS). Specifically, a novel Dual-Domain\nDeformation Model (DDDM) is proposed to explicitly model attribute deformations\nof each Gaussian point, where the time-dependent residual of each attribute is\ncaptured by a polynomial fitting in the time domain, and a Fourier series\nfitting in the frequency domain. The proposed DDDM is capable of modeling\ncomplex scene deformations across long video footage, eliminating the need for\ntraining separate 3DGS for each frame or introducing an additional implicit\nneural field to model 3D dynamics. Moreover, the explicit deformation modeling\nfor discretized Gaussian points ensures ultra-fast training and rendering of a\n4D scene, which is comparable to the original 3DGS designed for static 3D\nreconstruction. Our proposed approach showcases a substantial efficiency\nimprovement, achieving a $5\\times$ faster training speed compared to the\nper-frame 3DGS modeling. In addition, quantitative results demonstrate that the\nproposed Gaussian-Flow significantly outperforms previous leading methods in\nnovel view rendering quality. Project page:\nhttps://nju-3dv.github.io/projects/Gaussian-Flow\n","authors":["Youtian Lin","Zuozhuo Dai","Siyu Zhu","Yao Yao"],"pdf_url":"https://arxiv.org/pdf/2312.03431v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03430v1","updated":"2023-12-06T11:25:40Z","published":"2023-12-06T11:25:40Z","title":"ShareCMP: Polarization-Aware RGB-P Semantic Segmentation","summary":" Multimodal semantic segmentation is developing rapidly, but the modality of\nRGB-Polarization remains underexplored. To delve into this problem, we\nconstruct a UPLight RGB-P segmentation benchmark with 12 typical underwater\nsemantic classes which provides data support for Autonomous Underwater Vehicles\n(AUVs) to perform special perception tasks. In this work, we design the\nShareCMP, an RGB-P semantic segmentation framework with a shared dual-branch\narchitecture, which reduces the number of parameters by about 26-33% compared\nto previous dual-branch models. It encompasses a Polarization Generate\nAttention (PGA) module designed to generate polarization modal images with\nricher polarization properties for the encoder. In addition, we introduce the\nClass Polarization-Aware Loss (CPALoss) to improve the learning and\nunderstanding of the encoder for polarization modal information and to optimize\nthe PGA module. With extensive experiments on a total of three RGB-P\nbenchmarks, our ShareCMP achieves state-of-the-art performance in mIoU with\nfewer parameters on the UPLight (92.45%), ZJU (92.7%), and MCubeS (50.99%)\ndatasets. The code is available at https://github.com/LEFTeyex/ShareCMP.\n","authors":["Zhuoyan Liu","Bo Wang","Lizhi Wang","Chenyu Mao","Ye Li"],"pdf_url":"https://arxiv.org/pdf/2312.03430v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.03420v1","updated":"2023-12-06T11:06:46Z","published":"2023-12-06T11:06:46Z","title":"Artist-Friendly Relightable and Animatable Neural Heads","summary":" An increasingly common approach for creating photo-realistic digital avatars\nis through the use of volumetric neural fields. 
The original neural radiance\nfield (NeRF) allowed for impressive novel view synthesis of static heads when\ntrained on a set of multi-view images, and follow up methods showed that these\nneural representations can be extended to dynamic avatars. Recently, new\nvariants also surpassed the usual drawback of baked-in illumination in neural\nrepresentations, showing that static neural avatars can be relit in any\nenvironment. In this work we simultaneously tackle both the motion and\nillumination problem, proposing a new method for relightable and animatable\nneural heads. Our method builds on a proven dynamic avatar approach based on a\nmixture of volumetric primitives, combined with a recently-proposed lightweight\nhardware setup for relightable neural fields, and includes a novel architecture\nthat allows relighting dynamic neural avatars performing unseen expressions in\nany environment, even with nearfield illumination and viewpoints.\n","authors":["Yingyan Xu","Prashanth Chandran","Sebastian Weiss","Markus Gross","Gaspard Zoss","Derek Bradley"],"pdf_url":"https://arxiv.org/pdf/2312.03420v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.06791v2","updated":"2023-12-06T11:06:06Z","published":"2023-11-12T09:58:16Z","title":"InfMLLM: A Unified Framework for Visual-Language Tasks","summary":" Large language models (LLMs) have proven their remarkable versatility in\nhandling a comprehensive range of language-centric applications. To expand\nLLMs' capabilities to a broader spectrum of modal inputs, multimodal large\nlanguage models (MLLMs) have attracted growing interest. This work delves into\nenabling LLMs to tackle more vision-language-related tasks, particularly image\ncaptioning, visual question answering (VQA,) and visual grounding. To this end,\nwe implemented a three-stage training scheme: starting with lightweight\nalignment pretraining, then moderate-weight multitask hybrid training, and\nfinally, LLM fine-tuning to improve instruction following capability.\nThroughout the training process, the requirements on GPU memory gradually\nincrease. To effectively manage the number of visual embeddings passed to the\nLLM while preserving their positional information, we introduce a\nstraightforward visual adapter module dubbed pool-adapter. Our experiments\ndemonstrate that preserving the positional information of visual embeddings\nthrough the pool-adapter is particularly beneficial for tasks like visual\ngrounding. We name our proposed approach InfMLLM and have evaluated it\nextensively on various benchmark datasets. Our results demonstrate that InfMLLM\nachieves either state-of-the-art (SOTA) performance or performance comparable\nto recent MLLMs. The code and model will be made open-source at:\n\\url{https://github.com/mightyzau/InfMLLM}.\n","authors":["Qiang Zhou","Zhibin Wang","Wei Chu","Yinghui Xu","Hao Li","Yuan Qi"],"pdf_url":"https://arxiv.org/pdf/2311.06791v2.pdf","comment":"8"},{"id":"http://arxiv.org/abs/2212.07495v2","updated":"2023-12-06T10:55:40Z","published":"2022-12-14T20:28:50Z","title":"SAIF: Sparse Adversarial and Imperceptible Attack Framework","summary":" Adversarial attacks hamper the decision-making ability of neural networks by\nperturbing the input signal. The addition of calculated small distortion to\nimages, for instance, can deceive a well-trained image classification network.\nIn this work, we propose a novel attack technique called Sparse Adversarial and\nInterpretable Attack Framework (SAIF). 
Specifically, we design imperceptible\nattacks that contain low-magnitude perturbations at a small number of pixels\nand leverage these sparse attacks to reveal the vulnerability of classifiers.\nWe use the Frank-Wolfe (conditional gradient) algorithm to simultaneously\noptimize the attack perturbations for bounded magnitude and sparsity with\n$O(1/\\sqrt{T})$ convergence. Empirical results show that SAIF computes highly\nimperceptible and interpretable adversarial examples, and outperforms\nstate-of-the-art sparse attack methods on the ImageNet dataset.\n","authors":["Tooba Imtiaz","Morgan Kohler","Jared Miller","Zifeng Wang","Mario Sznaier","Octavia Camps","Jennifer Dy"],"pdf_url":"https://arxiv.org/pdf/2212.07495v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03409v1","updated":"2023-12-06T10:47:11Z","published":"2023-12-06T10:47:11Z","title":"DeepPyramid+: Medical Image Segmentation using Pyramid View Fusion and\n Deformable Pyramid Reception","summary":" Semantic Segmentation plays a pivotal role in many applications related to\nmedical image and video analysis. However, designing a neural network\narchitecture for medical image and surgical video segmentation is challenging\ndue to the diverse features of relevant classes, including heterogeneity,\ndeformability, transparency, blunt boundaries, and various distortions. We\npropose a network architecture, DeepPyramid+, which addresses diverse\nchallenges encountered in medical image and surgical video segmentation. The\nproposed DeepPyramid+ incorporates two major modules, namely \"Pyramid View\nFusion\" (PVF) and \"Deformable Pyramid Reception,\" (DPR), to address the\noutlined challenges. PVF replicates a deduction process within the neural\nnetwork, aligning with the human visual system, thereby enhancing the\nrepresentation of relative information at each pixel position. Complementarily,\nDPR introduces shape- and scale-adaptive feature extraction techniques using\ndilated deformable convolutions, enhancing accuracy and robustness in handling\nheterogeneous classes and deformable shapes. Extensive experiments conducted on\ndiverse datasets, including endometriosis videos, MRI images, OCT scans, and\ncataract and laparoscopy videos, demonstrate the effectiveness of DeepPyramid+\nin handling various challenges such as shape and scale variation, reflection,\nand blur degradation. DeepPyramid+ demonstrates significant improvements in\nsegmentation performance, achieving up to a 3.65% increase in Dice coefficient\nfor intra-domain segmentation and up to a 17% increase in Dice coefficient for\ncross-domain segmentation. DeepPyramid+ consistently outperforms\nstate-of-the-art networks across diverse modalities considering different\nbackbone networks, showcasing its versatility.\n","authors":["Negin Ghamsarian","Sebastian Wolf","Martin Zinkernagel","Klaus Schoeffmann","Raphael Sznitman"],"pdf_url":"https://arxiv.org/pdf/2312.03409v1.pdf","comment":"13 pages, 3 figures"},{"id":"http://arxiv.org/abs/2312.03408v1","updated":"2023-12-06T10:46:53Z","published":"2023-12-06T10:46:53Z","title":"Open-sourced Data Ecosystem in Autonomous Driving: the Present and\n Future","summary":" With the continuous maturation and application of autonomous driving\ntechnology, a systematic examination of open-source autonomous driving datasets\nbecomes instrumental in fostering the robust evolution of the industry\necosystem. Current autonomous driving datasets can broadly be categorized into\ntwo generations. 
The first-generation autonomous driving datasets are\ncharacterized by relatively simpler sensor modalities, smaller data scale, and\nis limited to perception-level tasks. KITTI, introduced in 2012, serves as a\nprominent representative of this initial wave. In contrast, the\nsecond-generation datasets exhibit heightened complexity in sensor modalities,\ngreater data scale and diversity, and an expansion of tasks from perception to\nencompass prediction and control. Leading examples of the second generation\ninclude nuScenes and Waymo, introduced around 2019. This comprehensive review,\nconducted in collaboration with esteemed colleagues from both academia and\nindustry, systematically assesses over seventy open-source autonomous driving\ndatasets from domestic and international sources. It offers insights into\nvarious aspects, such as the principles underlying the creation of high-quality\ndatasets, the pivotal role of data engine systems, and the utilization of\ngenerative foundation models to facilitate scalable data generation.\nFurthermore, this review undertakes an exhaustive analysis and discourse\nregarding the characteristics and data scales that future third-generation\nautonomous driving datasets should possess. It also delves into the scientific\nand technical challenges that warrant resolution. These endeavors are pivotal\nin advancing autonomous innovation and fostering technological enhancement in\ncritical domains. For further details, please refer to\nhttps://github.com/OpenDriveLab/DriveAGI.\n","authors":["Hongyang Li","Yang Li","Huijie Wang","Jia Zeng","Pinlong Cai","Huilin Xu","Dahua Lin","Junchi Yan","Feng Xu","Lu Xiong","Jingdong Wang","Futang Zhu","Kai Yan","Chunjing Xu","Tiancai Wang","Beipeng Mu","Shaoqing Ren","Zhihui Peng","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2312.03408v1.pdf","comment":"This article is a simplified English translation of corresponding\n Chinese article. Please refer to Chinese version for the complete content"},{"id":"http://arxiv.org/abs/2312.03406v1","updated":"2023-12-06T10:42:40Z","published":"2023-12-06T10:42:40Z","title":"SVQ: Sparse Vector Quantization for Spatiotemporal Forecasting","summary":" Spatiotemporal forecasting tasks, such as weather forecasting and traffic\nprediction, offer significant societal benefits. These tasks can be effectively\napproached as image forecasting problems using computer vision models. Vector\nquantization (VQ) is a well-known method for discrete representation that\nimproves the latent space, leading to enhanced generalization and transfer\nlearning capabilities. One of the main challenges in using VQ for\nspatiotemporal forecasting is how to balance between keeping enough details and\nremoving noises from the original patterns for better generalization. We\naddress this challenge by developing sparse vector quantization, or {\\bf SVQ}\nfor short, that leverages sparse regression to make better trade-off between\nthe two objectives. The main innovation of this work is to approximate sparse\nregression by a two-layer MLP and a randomly fixed or learnable matrix,\ndramatically improving its computational efficiency. 
Through experiments\nconducted on diverse datasets in multiple fields including weather forecasting,\ntraffic flow prediction, and video forecasting, we unequivocally demonstrate\nthat our proposed method consistently enhances the performance of base models\nand achieves state-of-the-art results across all benchmarks.\n","authors":["Chao Chen","Tian Zhou","Yanjun Zhao","Hui Liu","Liang Sun","Rong Jin"],"pdf_url":"https://arxiv.org/pdf/2312.03406v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03401v1","updated":"2023-12-06T10:27:15Z","published":"2023-12-06T10:27:15Z","title":"Predicting Postoperative Intraocular Lens Dislocation in Cataract\n Surgery via Deep Learning","summary":" A critical yet unpredictable complication following cataract surgery is\nintraocular lens dislocation. Postoperative stability is imperative, as even a\ntiny decentration of multifocal lenses or inadequate alignment of the torus in\ntoric lenses due to postoperative rotation can lead to a significant drop in\nvisual acuity. Investigating possible intraoperative indicators that can\npredict post-surgical instabilities of intraocular lenses can help prevent this\ncomplication. In this paper, we develop and evaluate the first fully-automatic\nframework for the computation of lens unfolding delay, rotation, and\ninstability during surgery. Adopting a combination of three types of CNNs,\nnamely recurrent, region-based, and pixel-based, the proposed framework is\nemployed to assess the possibility of predicting post-operative lens\ndislocation during cataract surgery. This is achieved via performing a\nlarge-scale study on the statistical differences between the behavior of\ndifferent brands of intraocular lenses and aligning the results with expert\nsurgeons' hypotheses and observations about the lenses. We exploit a\nlarge-scale dataset of cataract surgery videos featuring four intraocular lens\nbrands. Experimental results confirm the reliability of the proposed framework\nin evaluating the lens' statistics during the surgery. The Pearson correlation\nand t-test results reveal significant correlations between lens unfolding delay\nand lens rotation and significant differences between the intra-operative\nrotations stability of four groups of lenses. These results suggest that the\nproposed framework can help surgeons select the lenses based on the patient's\neye conditions and predict post-surgical lens dislocation.\n","authors":["Negin Ghamsarian","Doris Putzgruber-Adamitsch","Stephanie Sarny","Raphael Sznitman","Klaus Schoeffmann","Yosuf El-Shabrawi"],"pdf_url":"https://arxiv.org/pdf/2312.03401v1.pdf","comment":"12 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.03391v1","updated":"2023-12-06T10:01:43Z","published":"2023-12-06T10:01:43Z","title":"Action Scene Graphs for Long-Form Understanding of Egocentric Videos","summary":" We present Egocentric Action Scene Graphs (EASGs), a new representation for\nlong-form understanding of egocentric videos. EASGs extend standard\nmanually-annotated representations of egocentric videos, such as verb-noun\naction labels, by providing a temporally evolving graph-based description of\nthe actions performed by the camera wearer, including interacted objects, their\nrelationships, and how actions unfold in time. Through a novel annotation\nprocedure, we extend the Ego4D dataset by adding manually labeled Egocentric\nAction Scene Graphs offering a rich set of annotations designed for long-from\negocentric video understanding. 
We hence define the EASG generation task and\nprovide a baseline approach, establishing preliminary benchmarks. Experiments\non two downstream tasks, egocentric action anticipation and egocentric activity\nsummarization, highlight the effectiveness of EASGs for long-form egocentric\nvideo understanding. We will release the dataset and the code to replicate\nexperiments and annotations.\n","authors":["Ivan Rodin","Antonino Furnari","Kyle Min","Subarna Tripathi","Giovanni Maria Farinella"],"pdf_url":"https://arxiv.org/pdf/2312.03391v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01546v2","updated":"2023-12-06T09:43:41Z","published":"2023-06-02T13:50:30Z","title":"Publicly available datasets of breast histopathology H&E whole-slide\n images: A scoping review","summary":" Advancements in digital pathology and computing resources have made a\nsignificant impact in the field of computational pathology for breast cancer\ndiagnosis and treatment. However, access to high-quality labeled\nhistopathological images of breast cancer is a big challenge that limits the\ndevelopment of accurate and robust deep learning models. In this scoping\nreview, we identified the publicly available datasets of breast H&E stained\nwhole-slide images (WSI) that can be used to develop deep learning algorithms.\nWe systematically searched nine scientific literature databases and nine\nresearch data repositories and found 17 publicly available datasets containing\n10385 H&E WSIs of breast cancer. Moreover, we reported image metadata and\ncharacteristics for each dataset to assist researchers in selecting proper\ndatasets for specific tasks in breast cancer computational pathology. In\naddition, we compiled two lists of breast H&E patches and private datasets as\nsupplementary resources for researchers. Notably, only 28% of the included\narticles utilized multiple datasets, and only 14% used an external validation\nset, suggesting that the performance of other developed models may be\nsusceptible to overestimation. The TCGA-BRCA was used in 52% of the selected\nstudies. This dataset has a considerable selection bias that can impact the\nrobustness and generalizability of the trained algorithms. There is also a lack\nof consistent metadata reporting of breast WSI datasets that can be an issue in\ndeveloping accurate deep learning models, indicating the necessity of\nestablishing explicit guidelines for documenting breast WSI dataset\ncharacteristics and metadata.\n","authors":["Masoud Tafavvoghi","Lars Ailo Bongo","Nikita Shvetsov","Lill-Tove Rasmussen Busund","Kajsa Møllersen"],"pdf_url":"https://arxiv.org/pdf/2306.01546v2.pdf","comment":"27 pages (including references), 8 figures, 3 tables, 5 supporting\n information materials"},{"id":"http://arxiv.org/abs/2204.01278v2","updated":"2023-12-06T09:40:37Z","published":"2022-04-04T07:04:50Z","title":"Technical Report on Subspace Pyramid Fusion Network for Semantic\n Segmentation","summary":" The following is a technical report to test the validity of the proposed\nSubspace Pyramid Fusion Module (SPFM) to capture multi-scale feature\nrepresentations, which is more useful for semantic segmentation. In this\ninvestigation, we have proposed the Efficient Shuffle Attention Module(ESAM) to\nreconstruct the skip-connections paths by fusing multi-level global context\nfeatures. Experimental results on two well-known semantic segmentation\ndatasets, including Camvid and Cityscapes, show the effectiveness of our\nproposed method.\n","authors":["Mohammed A. M. 
Elhassan","Chenhui Yang","Chenxi Huang","Tewodros Legesse Munea"],"pdf_url":"https://arxiv.org/pdf/2204.01278v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03378v1","updated":"2023-12-06T09:33:33Z","published":"2023-12-06T09:33:33Z","title":"Riemannian Complex Matrix Convolution Network for PolSAR Image\n Classification","summary":" Recently, deep learning methods have achieved superior performance for\nPolarimetric Synthetic Aperture Radar(PolSAR) image classification. Existing\ndeep learning methods learn PolSAR data by converting the covariance matrix\ninto a feature vector or complex-valued vector as the input. However, all these\nmethods cannot learn the structure of complex matrix directly and destroy the\nchannel correlation. To learn geometric structure of complex matrix, we propose\na Riemannian complex matrix convolution network for PolSAR image classification\nin Riemannian space for the first time, which directly utilizes the complex\nmatrix as the network input and defines the Riemannian operations to learn\ncomplex matrix's features. The proposed Riemannian complex matrix convolution\nnetwork considers PolSAR complex matrix endowed in Riemannian manifold, and\ndefines a series of new Riemannian convolution, ReLu and LogEig operations in\nRiemannian space, which breaks through the Euclidean constraint of conventional\nnetworks. Then, a CNN module is appended to enhance contextual Riemannian\nfeatures. Besides, a fast kernel learning method is developed for the proposed\nmethod to learn class-specific features and reduce the computation time\neffectively. Experiments are conducted on three sets of real PolSAR data with\ndifferent bands and sensors. Experiments results demonstrates the proposed\nmethod can obtain superior performance than the state-of-the-art methods.\n","authors":["Junfei Shi","Wei Wang","Haiyan Jin","Mengmeng Nie","Shanshan Ji"],"pdf_url":"https://arxiv.org/pdf/2312.03378v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.19378v2","updated":"2023-12-06T09:30:09Z","published":"2023-10-30T09:35:43Z","title":"Few-shot Hybrid Domain Adaptation of Image Generators","summary":" Can a pre-trained generator be adapted to the hybrid of multiple target\ndomains and generate images with integrated attributes of them? In this work,\nwe introduce a new task -- Few-shot Hybrid Domain Adaptation (HDA). Given a\nsource generator and several target domains, HDA aims to acquire an adapted\ngenerator that preserves the integrated attributes of all target domains,\nwithout overriding the source domain's characteristics. Compared with Domain\nAdaptation (DA), HDA offers greater flexibility and versatility to adapt\ngenerators to more composite and expansive domains. Simultaneously, HDA also\npresents more challenges than DA as we have access only to images from\nindividual target domains and lack authentic images from the hybrid domain. To\naddress this issue, we introduce a discriminator-free framework that directly\nencodes different domains' images into well-separable subspaces. To achieve\nHDA, we propose a novel directional subspace loss comprised of a distance loss\nand a direction loss. Concretely, the distance loss blends the attributes of\nall target domains by reducing the distances from generated images to all\ntarget subspaces. 
The direction loss preserves the characteristics from the\nsource domain by guiding the adaptation along the perpendicular to subspaces.\nExperiments show that our method can obtain numerous domain-specific attributes\nin a single adapted generator, which surpasses the baseline methods in semantic\nsimilarity, image fidelity, and cross-domain consistency.\n","authors":["Hengjia Li","Yang Liu","Linxuan Xia","Yuqi Lin","Tu Zheng","Zheng Yang","Wenxiao Wang","Xiaohui Zhong","Xiaobo Ren","Xiaofei He"],"pdf_url":"https://arxiv.org/pdf/2310.19378v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03372v1","updated":"2023-12-06T09:13:34Z","published":"2023-12-06T09:13:34Z","title":"Evaluating the point cloud of individual trees generated from images\n based on Neural Radiance fields (NeRF) method","summary":" Three-dimensional (3D) reconstruction of trees has always been a key task in\nprecision forestry management and research. Due to the complex branch\nmorphological structure of trees themselves and the occlusions from tree stems,\nbranches and foliage, it is difficult to recreate a complete three-dimensional\ntree model from a two-dimensional image by conventional photogrammetric\nmethods. In this study, based on tree images collected by various cameras in\ndifferent ways, the Neural Radiance Fields (NeRF) method was used for\nindividual tree reconstruction and the exported point cloud models are compared\nwith point cloud derived from photogrammetric reconstruction and laser scanning\nmethods. The results show that the NeRF method performs well in individual tree\n3D reconstruction, as it has higher successful reconstruction rate, better\nreconstruction in the canopy area, it requires less amount of images as input.\nCompared with photogrammetric reconstruction method, NeRF has significant\nadvantages in reconstruction efficiency and is adaptable to complex scenes, but\nthe generated point cloud tends to be noisy and low resolution. The accuracy of\ntree structural parameters (tree height and diameter at breast height)\nextracted from the photogrammetric point cloud is still higher than those of\nderived from the NeRF point cloud. The results of this study illustrate the\ngreat potential of NeRF method for individual tree reconstruction, and it\nprovides new ideas and research directions for 3D reconstruction and\nvisualization of complex forest scenes.\n","authors":["Hongyu Huang","Guoji Tian","Chongcheng Chen"],"pdf_url":"https://arxiv.org/pdf/2312.03372v1.pdf","comment":"25 pages; 6 figures"},{"id":"http://arxiv.org/abs/2312.03368v1","updated":"2023-12-06T09:09:27Z","published":"2023-12-06T09:09:27Z","title":"Bottom-Up Instance Segmentation of Catheters for Chest X-Rays","summary":" Chest X-ray (CXR) is frequently employed in emergency departments and\nintensive care units to verify the proper placement of central lines and tubes\nand to rule out related complications. The automation of the X-ray reading\nprocess can be a valuable support tool for non-specialist technicians and\nminimize reporting delays due to non-availability of experts. While existing\nsolutions for automated catheter segmentation and malposition detection show\npromising results, the disentanglement of individual catheters remains an open\nchallenge, especially in complex cases where multiple devices appear\nsuperimposed in the X-ray projection. Moreover, conventional top-down instance\nsegmentation methods are ineffective on such thin and long devices, that often\nextend through the entire image. 
In this paper, we propose a deep learning\napproach based on associative embeddings for catheter instance segmentation,\nable to overcome those limitations and effectively handle device intersections.\n","authors":["Francesca Boccardi","Axel Saalbach","Heinrich Schulz","Samuele Salti","Ilyas Sirazitdinov"],"pdf_url":"https://arxiv.org/pdf/2312.03368v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03357v1","updated":"2023-12-06T08:54:04Z","published":"2023-12-06T08:54:04Z","title":"RING-NeRF: A Versatile Architecture based on Residual Implicit Neural\n Grids","summary":" Since their introduction, Neural Fields have become very popular for 3D\nreconstruction and new view synthesis. Recent researches focused on\naccelerating the process, as well as improving the robustness to variation of\nthe observation distance and limited number of supervised viewpoints. However,\nthose approaches often led to dedicated solutions that cannot be easily\ncombined. To tackle this issue, we introduce a new simple but efficient\narchitecture named RING-NeRF, based on Residual Implicit Neural Grids, that\nprovides a control on the level of detail of the mapping function between the\nscene and the latent spaces. Associated with a distance-aware forward mapping\nmechanism and a continuous coarse-to-fine reconstruction process, our versatile\narchitecture demonstrates both fast training and state-of-the-art performances\nin terms of: (1) anti-aliased rendering, (2) reconstruction quality from few\nsupervised viewpoints, and (3) robustness in the absence of appropriate\nscene-specific initialization for SDF-based NeRFs. We also demonstrate that our\narchitecture can dynamically add grids to increase the details of the\nreconstruction, opening the way to adaptive reconstruction.\n","authors":["Doriand Petit","Steve Bourgeois","Dumitru Pavel","Vincent Gay-Bellile","Florian Chabot","Loic Barthe"],"pdf_url":"https://arxiv.org/pdf/2312.03357v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03350v1","updated":"2023-12-06T08:49:55Z","published":"2023-12-06T08:49:55Z","title":"PointMoment:Mixed-Moment-based Self-Supervised Representation Learning\n for 3D Point Clouds","summary":" Large and rich data is a prerequisite for effective training of deep neural\nnetworks. However, the irregularity of point cloud data makes manual annotation\ntime-consuming and laborious. Self-supervised representation learning, which\nleverages the intrinsic structure of large-scale unlabelled data to learn\nmeaningful feature representations, has attracted increasing attention in the\nfield of point cloud research. However, self-supervised representation learning\noften suffers from model collapse, resulting in reduced information and\ndiversity of the learned representation, and consequently degrading the\nperformance of downstream tasks. To address this problem, we propose\nPointMoment, a novel framework for point cloud self-supervised representation\nlearning that utilizes a high-order mixed moment loss function rather than the\nconventional contrastive loss function. Moreover, our framework does not\nrequire any special techniques such as asymmetric network architectures,\ngradient stopping, etc. Specifically, we calculate the high-order mixed moment\nof the feature variables and force them to decompose into products of their\nindividual moment, thereby making multiple variables more independent and\nminimizing the feature redundancy. 
We also incorporate a contrastive learning\napproach to maximize the feature invariance under different data augmentations\nof the same point cloud. Experimental results show that our approach\noutperforms previous unsupervised learning methods on the downstream task of 3D\npoint cloud classification and segmentation.\n","authors":["Xin Cao","Xinxin Han","Yifan Wang","Mengna Yang","Kang Li"],"pdf_url":"https://arxiv.org/pdf/2312.03350v1.pdf","comment":"15 pages,3 figures"},{"id":"http://arxiv.org/abs/2312.03345v1","updated":"2023-12-06T08:36:29Z","published":"2023-12-06T08:36:29Z","title":"GraNet: A Multi-Level Graph Network for 6-DoF Grasp Pose Generation in\n Cluttered Scenes","summary":" 6-DoF object-agnostic grasping in unstructured environments is a critical yet\nchallenging task in robotics. Most current works use non-optimized approaches\nto sample grasp locations and learn spatial features without concerning the\ngrasping task. This paper proposes GraNet, a graph-based grasp pose generation\nframework that translates a point cloud scene into multi-level graphs and\npropagates features through graph neural networks. By building graphs at the\nscene level, object level, and grasp point level, GraNet enhances feature\nembedding at multiple scales while progressively converging to the ideal\ngrasping locations by learning. Our pipeline can thus characterize the spatial\ndistribution of grasps in cluttered scenes, leading to a higher rate of\neffective grasping. Furthermore, we enhance the representation ability of\nscalable graph networks by a structure-aware attention mechanism to exploit\nlocal relations in graphs. Our method achieves state-of-the-art performance on\nthe large-scale GraspNet-1Billion benchmark, especially in grasping unseen\nobjects (+11.62 AP). The real robot experiment shows a high success rate in\ngrasping scattered objects, verifying the effectiveness of the proposed\napproach in unstructured environments.\n","authors":["Haowen Wang","Wanhao Niu","Chungang Zhuang"],"pdf_url":"https://arxiv.org/pdf/2312.03345v1.pdf","comment":"IROS 2023"},{"id":"http://arxiv.org/abs/2208.14085v3","updated":"2023-12-06T08:26:50Z","published":"2022-08-30T08:59:41Z","title":"Evaluating Point Cloud from Moving Camera Videos: A No-Reference Metric","summary":" Point cloud is one of the most widely used digital representation formats for\nthree-dimensional (3D) contents, the visual quality of which may suffer from\nnoise and geometric shift distortions during the production procedure as well\nas compression and downsampling distortions during the transmission process. To\ntackle the challenge of point cloud quality assessment (PCQA), many PCQA\nmethods have been proposed to evaluate the visual quality levels of point\nclouds by assessing the rendered static 2D projections. Although such\nprojection-based PCQA methods achieve competitive performance with the\nassistance of mature image quality assessment (IQA) methods, they neglect that\nthe 3D model is also perceived in a dynamic viewing manner, where the viewpoint\nis continually changed according to the feedback of the rendering device.\nTherefore, in this paper, we evaluate the point clouds from moving camera\nvideos and explore the way of dealing with PCQA tasks via using video quality\nassessment (VQA) methods. First, we generate the captured videos by rotating\nthe camera around the point clouds through several circular pathways. 
Then we\nextract both spatial and temporal quality-aware features from the selected key\nframes and the video clips through using trainable 2D-CNN and pre-trained\n3D-CNN models respectively. Finally, the visual quality of point clouds is\nrepresented by the video quality values. The experimental results reveal that\nthe proposed method is effective for predicting the visual quality levels of\nthe point clouds and even competitive with full-reference (FR) PCQA methods.\nThe ablation studies further verify the rationality of the proposed framework\nand confirm the contributions made by the quality-aware features extracted via\nthe dynamic viewing manner. The code is available at\nhttps://github.com/zzc-1998/VQA_PC.\n","authors":["Zicheng Zhang","Wei Sun","Yucheng Zhu","Xiongkuo Min","Wei Wu","Ying Chen","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2208.14085v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03341v1","updated":"2023-12-06T08:26:26Z","published":"2023-12-06T08:26:26Z","title":"Online Vectorized HD Map Construction using Geometry","summary":" The construction of online vectorized High-Definition (HD) maps is critical\nfor downstream prediction and planning. Recent efforts have built strong\nbaselines for this task, however, shapes and relations of instances in urban\nroad systems are still under-explored, such as parallelism, perpendicular, or\nrectangle-shape. In our work, we propose GeMap ($\\textbf{Ge}$ometry\n$\\textbf{Map}$), which end-to-end learns Euclidean shapes and relations of map\ninstances beyond basic perception. Specifically, we design a geometric loss\nbased on angle and distance clues, which is robust to rigid transformations. We\nalso decouple self-attention to independently handle Euclidean shapes and\nrelations. Our method achieves new state-of-the-art performance on the NuScenes\nand Argoverse 2 datasets. Remarkably, it reaches a 71.8% mAP on the large-scale\nArgoverse 2 dataset, outperforming MapTR V2 by +4.4% and surpassing the 70% mAP\nthreshold for the first time. Code is available at\nhttps://github.com/cnzzx/GeMap\n","authors":["Zhixin Zhang","Yiyuan Zhang","Xiaohan Ding","Fusheng Jin","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2312.03341v1.pdf","comment":"Project website https://invictus717.github.io/GeMap/"},{"id":"http://arxiv.org/abs/2312.03339v1","updated":"2023-12-06T08:21:42Z","published":"2023-12-06T08:21:42Z","title":"PointJEM: Self-supervised Point Cloud Understanding for Reducing Feature\n Redundancy via Joint Entropy Maximization","summary":" Most deep learning-based point cloud processing methods are supervised and\nrequire large scale of labeled data. However, manual labeling of point cloud\ndata is laborious and time-consuming. Self-supervised representation learning\ncan address the aforementioned issue by learning robust and generalized\nrepresentations from unlabeled datasets. Nevertheless, the embedded features\nobtained by representation learning usually contain redundant information, and\nmost current methods reduce feature redundancy by linear correlation\nconstraints. In this paper, we propose PointJEM, a self-supervised\nrepresentation learning method applied to the point cloud field. PointJEM\ncomprises an embedding scheme and a loss function based on joint entropy. The\nembedding scheme divides the embedding vector into different parts, each part\ncan learn a distinctive feature. 
To reduce redundant information in the\nfeatures, PointJEM maximizes the joint entropy between the different parts,\nthereby rendering the learned feature variables pairwise independent. To\nvalidate the effectiveness of our method, we conducted experiments on multiple\ndatasets. The results demonstrate that our method can significantly reduce\nfeature redundancy beyond linear correlation. Furthermore, PointJEM achieves\ncompetitive performance in downstream tasks such as classification and\nsegmentation.\n","authors":["Xin Cao","Huan Xia","Xinxin Han","Yifan Wang","Kang Li","Linzhi Su"],"pdf_url":"https://arxiv.org/pdf/2312.03339v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02010v2","updated":"2023-12-06T08:13:28Z","published":"2023-12-04T16:32:51Z","title":"Towards Learning a Generalist Model for Embodied Navigation","summary":" Building a generalist agent that can interact with the world is the\nintriguing target of AI systems, thus spurring the research for embodied\nnavigation, where an agent is required to navigate according to instructions or\nrespond to queries. Despite the major progress attained, previous works\nprimarily focus on task-specific agents and lack generalizability to unseen\nscenarios. Recently, LLMs have presented remarkable capabilities across various\nfields, and provided a promising opportunity for embodied navigation. Drawing\non this, we propose the first generalist model for embodied navigation,\nNaviLLM. It adapts LLMs to embodied navigation by introducing schema-based\ninstruction. The schema-based instruction flexibly casts various tasks into\ngeneration problems, thereby unifying a wide range of tasks. This approach\nallows us to integrate diverse data sources from various datasets into the\ntraining, equipping NaviLLM with a wide range of capabilities required by\nembodied navigation. We conduct extensive experiments to evaluate the\nperformance and generalizability of our model. The experimental results\ndemonstrate that our unified model achieves state-of-the-art performance on\nCVDN, SOON, and ScanQA. Specifically, it surpasses the previous\nstate-of-the-art method by a significant margin of 29% in goal progress on\nCVDN. Moreover, our model also demonstrates strong generalizability and\npresents impressive results on unseen tasks, e.g., embodied question answering\nand 3D captioning.\n","authors":["Duo Zheng","Shijia Huang","Lin Zhao","Yiwu Zhong","Liwei Wang"],"pdf_url":"https://arxiv.org/pdf/2312.02010v2.pdf","comment":"13 pages, 3 figures. Official code:\n https://github.com/zd11024/NaviLLM"},{"id":"http://arxiv.org/abs/2312.01623v2","updated":"2023-12-06T07:46:42Z","published":"2023-12-04T04:47:48Z","title":"Universal Segmentation at Arbitrary Granularity with Language\n Instruction","summary":" This paper aims to achieve universal segmentation of arbitrary semantic\nlevel. Despite significant progress in recent years, specialist segmentation\napproaches are limited to specific tasks and data distribution. Retraining a\nnew model for adaptation to new scenarios or settings takes expensive\ncomputation and time cost, which raises the demand for versatile and universal\nsegmentation model that can cater to various granularity. Although some\nattempts have been made for unifying different segmentation tasks or\ngeneralization to various scenarios, limitations in the definition of paradigms\nand input-output spaces make it difficult for them to achieve accurate\nunderstanding of content at arbitrary granularity. 
To this end, we present\nUniLSeg, a universal segmentation model that can perform segmentation at any\nsemantic level with the guidance of language instructions. For training\nUniLSeg, we reorganize a group of tasks from original diverse distributions\ninto a unified data format, where images with texts describing segmentation\ntargets as input and corresponding masks are output. Combined with a automatic\nannotation engine for utilizing numerous unlabeled data, UniLSeg achieves\nexcellent performance on various tasks and settings, surpassing both specialist\nand unified segmentation models.\n","authors":["Yong Liu","Cairong Zhang","Yitong Wang","Jiahao Wang","Yujiu Yang","Yansong Tang"],"pdf_url":"https://arxiv.org/pdf/2312.01623v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11312v2","updated":"2023-12-06T07:30:40Z","published":"2023-11-19T12:25:59Z","title":"Optimizing rgb-d semantic segmentation through multi-modal interaction\n and pooling attention","summary":" Semantic segmentation of RGB-D images involves understanding the appearance\nand spatial relationships of objects within a scene, which requires careful\nconsideration of various factors. However, in indoor environments, the simple\ninput of RGB and depth images often results in a relatively limited acquisition\nof semantic and spatial information, leading to suboptimal segmentation\noutcomes. To address this, we propose the Multi-modal Interaction and Pooling\nAttention Network (MIPANet), a novel approach designed to harness the\ninteractive synergy between RGB and depth modalities, optimizing the\nutilization of complementary information. Specifically, we incorporate a\nMulti-modal Interaction Fusion Module (MIM) into the deepest layers of the\nnetwork. This module is engineered to facilitate the fusion of RGB and depth\ninformation, allowing for mutual enhancement and correction. Additionally, we\nintroduce a Pooling Attention Module (PAM) at various stages of the encoder.\nThis module serves to amplify the features extracted by the network and\nintegrates the module's output into the decoder in a targeted manner,\nsignificantly improving semantic segmentation performance. Our experimental\nresults demonstrate that MIPANet outperforms existing methods on two indoor\nscene datasets, NYUDv2 and SUN-RGBD, underscoring its effectiveness in\nenhancing RGB-D semantic segmentation.\n","authors":["Shuai Zhang","Minghong Xie"],"pdf_url":"https://arxiv.org/pdf/2311.11312v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03327v1","updated":"2023-12-06T07:28:43Z","published":"2023-12-06T07:28:43Z","title":"Building Category Graphs Representation with Spatial and Temporal\n Attention for Visual Navigation","summary":" Given an object of interest, visual navigation aims to reach the object's\nlocation based on a sequence of partial observations. To this end, an agent\nneeds to 1) learn a piece of certain knowledge about the relations of object\ncategories in the world during training and 2) look for the target object based\non the pre-learned object category relations and its moving trajectory in the\ncurrent unseen environment. In this paper, we propose a Category Relation Graph\n(CRG) to learn the knowledge of object category layout relations and a\nTemporal-Spatial-Region (TSR) attention architecture to perceive the long-term\nspatial-temporal dependencies of objects helping the navigation. 
We learn prior\nknowledge of object layout, establishing a category relationship graph to\ndeduce the positions of specific objects. Subsequently, we introduced TSR to\ncapture the relationships of objects in temporal, spatial, and regions within\nthe observation trajectories. Specifically, we propose a Temporal attention\nmodule (T) to model the temporal structure of the observation sequence, which\nimplicitly encodes the historical moving or trajectory information. Then, a\nSpatial attention module (S) is used to uncover the spatial context of the\ncurrent observation objects based on the category relation graph and past\nobservations. Last, a Region attention module (R) shifts the attention to the\ntarget-relevant region. Based on the visual representation extracted by our\nmethod, the agent can better perceive the environment and easily learn superior\nnavigation policy. Experiments on AI2-THOR demonstrate our CRG-TSR method\nsignificantly outperforms existing methods regarding both effectiveness and\nefficiency. The code has been included in the supplementary material and will\nbe publicly available.\n","authors":["Xiaobo Hu","Youfang Lin","HeHe Fan","Shuo Wang","Zhihao Wu","Kai Lv"],"pdf_url":"https://arxiv.org/pdf/2312.03327v1.pdf","comment":"18 pages; 7 figures"},{"id":"http://arxiv.org/abs/2312.03325v1","updated":"2023-12-06T07:26:02Z","published":"2023-12-06T07:26:02Z","title":"GCFA:Geodesic Curve Feature Augmentation via Shape Space Theory","summary":" Deep learning has yielded remarkable outcomes in various domains. However,\nthe challenge of requiring large-scale labeled samples still persists in deep\nlearning. Thus, data augmentation has been introduced as a critical strategy to\ntrain deep learning models. However, data augmentation suffers from information\nloss and poor performance in small sample environments. To overcome these\ndrawbacks, we propose a feature augmentation method based on shape space\ntheory, i.e., Geodesic curve feature augmentation, called GCFA in brevity.\nFirst, we extract features from the image with the neural network model. Then,\nthe multiple image features are projected into a pre-shape space as features.\nIn the pre-shape space, a Geodesic curve is built to fit the features. Finally,\nthe many generated features on the Geodesic curve are used to train the various\nmachine learning models. The GCFA module can be seamlessly integrated with most\nmachine learning methods. And the proposed method is simple, effective and\ninsensitive for the small sample datasets. Several examples demonstrate that\nthe GCFA method can greatly improve the performance of the data preprocessing\nmodel in a small sample environment.\n","authors":["Yuexing Han","Guanxin Wan","Bing Wang"],"pdf_url":"https://arxiv.org/pdf/2312.03325v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.07394v2","updated":"2023-12-06T07:23:52Z","published":"2022-01-19T03:05:24Z","title":"KappaFace: Adaptive Additive Angular Margin Loss for Deep Face\n Recognition","summary":" Feature learning is a widely used method employed for large-scale face\nrecognition. Recently, large-margin softmax loss methods have demonstrated\nsignificant enhancements on deep face recognition. These methods propose fixed\npositive margins in order to enforce intra-class compactness and inter-class\ndiversity. However, the majority of the proposed methods do not consider the\nclass imbalance issue, which is a major challenge in practice for developing\ndeep face recognition models. 
We hypothesize that it significantly affects the\ngeneralization ability of the deep face models. Inspired by this observation,\nwe introduce a novel adaptive strategy, called KappaFace, to modulate the\nrelative importance based on class difficultness and imbalance. With the\nsupport of the von Mises-Fisher distribution, our proposed KappaFace loss can\nintensify the margin's magnitude for hard learning or low concentration classes\nwhile relaxing it for counter classes. Experiments conducted on popular facial\nbenchmarks demonstrate that our proposed method achieves superior performance\nto the state-of-the-art.\n","authors":["Chingis Oinar","Binh M. Le","Simon S. Woo"],"pdf_url":"https://arxiv.org/pdf/2201.07394v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.02367v5","updated":"2023-12-06T07:18:23Z","published":"2023-02-05T12:13:27Z","title":"FastPillars: A Deployment-friendly Pillar-based 3D Detector","summary":" The deployment of 3D detectors strikes one of the major challenges in\nreal-world self-driving scenarios. Existing BEV-based (i.e., Bird Eye View)\ndetectors favor sparse convolutions (known as SPConv) to speed up training and\ninference, which puts a hard barrier for deployment, especially for on-device\napplications. In this paper, to tackle the challenge of efficient 3D object\ndetection from an industry perspective, we devise a deployment-friendly\npillar-based 3D detector, termed FastPillars. First, we introduce a novel\nlightweight Max-and-Attention Pillar Encoding (MAPE) module specially for\nenhancing small 3D objects. Second, we propose a simple yet effective principle\nfor designing a backbone in pillar-based 3D detection. We construct FastPillars\nbased on these designs, achieving high performance and low latency without\nSPConv. Extensive experiments on two large-scale datasets demonstrate the\neffectiveness and efficiency of FastPillars for on-device 3D detection\nregarding both performance and speed. Specifically, FastPillars delivers\nstate-of-the-art accuracy on Waymo Open Dataset with 1.8X speed up and 3.8\nmAPH/L2 improvement over CenterPoint (SPConv-based). Our code is publicly\navailable at: https://github.com/StiphyJay/FastPillars.\n","authors":["Sifan Zhou","Zhi Tian","Xiangxiang Chu","Xinyu Zhang","Bo Zhang","Xiaobo Lu","Chengjian Feng","Zequn Jie","Patrick Yin Chiang","Lin Ma"],"pdf_url":"https://arxiv.org/pdf/2302.02367v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03322v1","updated":"2023-12-06T07:16:32Z","published":"2023-12-06T07:16:32Z","title":"Background Clustering Pre-training for Few-shot Segmentation","summary":" Recent few-shot segmentation (FSS) methods introduce an extra pre-training\nstage before meta-training to obtain a stronger backbone, which has become a\nstandard step in few-shot learning. Despite the effectiveness, current\npre-training scheme suffers from the merged background problem: only base\nclasses are labelled as foregrounds, making it hard to distinguish between\nnovel classes and actual background. In this paper, we propose a new\npre-training scheme for FSS via decoupling the novel classes from background,\ncalled Background Clustering Pre-Training (BCPT). Specifically, we adopt online\nclustering to the pixel embeddings of merged background to explore the\nunderlying semantic structures, bridging the gap between pre-training and\nadaptation to novel classes. 
Given the clustering results, we further propose\nthe background mining loss and leverage base classes to guide the clustering\nprocess, improving the quality and stability of clustering results. Experiments\non PASCAL-5i and COCO-20i show that BCPT yields advanced performance. Code will\nbe available.\n","authors":["Zhimiao Yu","Tiancheng Lin","Yi Xu"],"pdf_url":"https://arxiv.org/pdf/2312.03322v1.pdf","comment":"6 pages, 2 figures, ICIP 2023"},{"id":"http://arxiv.org/abs/2311.17082v2","updated":"2023-12-06T07:15:58Z","published":"2023-11-28T01:28:58Z","title":"DreamPropeller: Supercharge Text-to-3D Generation with Parallel Sampling","summary":" Recent methods such as Score Distillation Sampling (SDS) and Variational\nScore Distillation (VSD) using 2D diffusion models for text-to-3D generation\nhave demonstrated impressive generation quality. However, the long generation\ntime of such algorithms significantly degrades the user experience. To tackle\nthis problem, we propose DreamPropeller, a drop-in acceleration algorithm that\ncan be wrapped around any existing text-to-3D generation pipeline based on\nscore distillation. Our framework generalizes Picard iterations, a classical\nalgorithm for parallel sampling an ODE path, and can account for non-ODE paths\nsuch as momentum-based gradient updates and changes in dimensions during the\noptimization process as in many cases of 3D generation. We show that our\nalgorithm trades parallel compute for wallclock time and empirically achieves\nup to 4.7x speedup with a negligible drop in generation quality for all tested\nframeworks.\n","authors":["Linqi Zhou","Andy Shih","Chenlin Meng","Stefano Ermon"],"pdf_url":"https://arxiv.org/pdf/2311.17082v2.pdf","comment":"Github repo: https://github.com/alexzhou907/DreamPropeller; Project\n page: https://alexzhou907.github.io/dreampropeller_page/"},{"id":"http://arxiv.org/abs/2306.13394v3","updated":"2023-12-06T07:09:31Z","published":"2023-06-23T09:22:36Z","title":"MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language\n Models","summary":" Multimodal Large Language Model (MLLM) relies on the powerful LLM to perform\nmultimodal tasks, showing amazing emergent abilities in recent studies, such as\nwriting poems based on an image. However, it is difficult for these case\nstudies to fully reflect the performance of MLLM, lacking a comprehensive\nevaluation. In this paper, we fill in this blank, presenting the first\ncomprehensive MLLM Evaluation benchmark MME. It measures both perception and\ncognition abilities on a total of 14 subtasks. In order to avoid data leakage\nthat may arise from direct use of public datasets for evaluation, the\nannotations of instruction-answer pairs are all manually designed. The concise\ninstruction design allows us to fairly compare MLLMs, instead of struggling in\nprompt engineering. Besides, with such an instruction, we can also easily carry\nout quantitative statistics. 
A total of 30 advanced MLLMs are comprehensively\nevaluated on our MME, which not only suggests that existing MLLMs still have a\nlarge room for improvement, but also reveals the potential directions for the\nsubsequent model optimization.\n","authors":["Chaoyou Fu","Peixian Chen","Yunhang Shen","Yulei Qin","Mengdan Zhang","Xu Lin","Jinrui Yang","Xiawu Zheng","Ke Li","Xing Sun","Yunsheng Wu","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2306.13394v3.pdf","comment":"Project page:\n https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models"},{"id":"http://arxiv.org/abs/2312.03318v1","updated":"2023-12-06T07:02:22Z","published":"2023-12-06T07:02:22Z","title":"Complementary Benefits of Contrastive Learning and Self-Training Under\n Distribution Shift","summary":" Self-training and contrastive learning have emerged as leading techniques for\nincorporating unlabeled data, both under distribution shift (unsupervised\ndomain adaptation) and when it is absent (semi-supervised learning). However,\ndespite the popularity and compatibility of these techniques, their efficacy in\ncombination remains unexplored. In this paper, we undertake a systematic\nempirical investigation of this combination, finding that (i) in domain\nadaptation settings, self-training and contrastive learning offer significant\ncomplementary gains; and (ii) in semi-supervised learning settings,\nsurprisingly, the benefits are not synergistic. Across eight distribution shift\ndatasets (e.g., BREEDs, WILDS), we demonstrate that the combined method obtains\n3--8% higher accuracy than either approach independently. We then theoretically\nanalyze these techniques in a simplified model of distribution shift,\ndemonstrating scenarios under which the features produced by contrastive\nlearning can yield a good initialization for self-training to further amplify\ngains and achieve optimal performance, even when either method alone would\nfail.\n","authors":["Saurabh Garg","Amrith Setlur","Zachary Chase Lipton","Sivaraman Balakrishnan","Virginia Smith","Aditi Raghunathan"],"pdf_url":"https://arxiv.org/pdf/2312.03318v1.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2305.18731v3","updated":"2023-12-06T06:09:19Z","published":"2023-05-30T04:10:15Z","title":"Epistemic Graph: A Plug-And-Play Module For Hybrid Representation\n Learning","summary":" In recent years, deep models have achieved remarkable success in various\nvision tasks. However, their performance heavily relies on large training\ndatasets. In contrast, humans exhibit hybrid learning, seamlessly integrating\nstructured knowledge for cross-domain recognition or relying on a smaller\namount of data samples for few-shot learning. Motivated by this human-like\nepistemic process, we aim to extend hybrid learning to computer vision tasks by\nintegrating structured knowledge with data samples for more effective\nrepresentation learning. Nevertheless, this extension faces significant\nchallenges due to the substantial gap between structured knowledge and deep\nfeatures learned from data samples, encompassing both dimensions and knowledge\ngranularity. In this paper, a novel Epistemic Graph Layer (EGLayer) is\nintroduced to enable hybrid learning, enhancing the exchange of information\nbetween deep features and a structured knowledge graph. Our EGLayer is composed\nof three major parts, including a local graph module, a query aggregation\nmodel, and a novel correlation alignment loss function to emulate human\nepistemic ability. 
Serving as a plug-and-play module that can replace the\nstandard linear classifier, EGLayer significantly improves the performance of\ndeep models. Extensive experiments demonstrates that EGLayer can greatly\nenhance representation learning for the tasks of cross-domain recognition and\nfew-shot learning, and the visualization of knowledge graphs can aid in model\ninterpretation.\n","authors":["Jin Yuan","Yang Zhang","Yangzhou Du","Zhongchao Shi","Xin Geng","Jianping Fan","Yong Rui"],"pdf_url":"https://arxiv.org/pdf/2305.18731v3.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2312.03298v1","updated":"2023-12-06T05:39:00Z","published":"2023-12-06T05:39:00Z","title":"DiffPMAE: Diffusion Masked Autoencoders for Point Cloud Reconstruction","summary":" Point cloud streaming is increasingly getting popular, evolving into the norm\nfor interactive service delivery and the future Metaverse. However, the\nsubstantial volume of data associated with point clouds presents numerous\nchallenges, particularly in terms of high bandwidth consumption and large\nstorage capacity. Despite various solutions proposed thus far, with a focus on\npoint cloud compression, upsampling, and completion, these\nreconstruction-related methods continue to fall short in delivering high\nfidelity point cloud output. As a solution, in DiffPMAE, we propose an\neffective point cloud reconstruction architecture. Inspired by self-supervised\nlearning concepts, we combine Masked Auto-Encoding and Diffusion Model\nmechanism to remotely reconstruct point cloud data. By the nature of this\nreconstruction process, DiffPMAE can be extended to many related downstream\ntasks including point cloud compression, upsampling and completion. Leveraging\nShapeNet-55 and ModelNet datasets with over 60000 objects, we validate the\nperformance of DiffPMAE exceeding many state-of-the-art methods in-terms of\nauto-encoding and downstream tasks considered.\n","authors":["Yanlong Li","Chamara Madarasingha","Kanchana Thilakarathna"],"pdf_url":"https://arxiv.org/pdf/2312.03298v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03296v1","updated":"2023-12-06T05:36:52Z","published":"2023-12-06T05:36:52Z","title":"Cooperative Probabilistic Trajectory Forecasting under Occlusion","summary":" Perception and planning under occlusion is essential for safety-critical\ntasks. Occlusion-aware planning often requires communicating the information of\nthe occluded object to the ego agent for safe navigation. However,\ncommunicating rich sensor information under adverse conditions during\ncommunication loss and limited bandwidth may not be always feasible. Further,\nin GPS denied environments and indoor navigation, localizing and sharing of\noccluded objects can be challenging. To overcome this, relative pose estimation\nbetween connected agents sharing a common field of view can be a\ncomputationally effective way of communicating information about surrounding\nobjects. In this paper, we design an end-to-end network that cooperatively\nestimates the current states of occluded pedestrian in the reference frame of\nego agent and then predicts the trajectory with safety guarantees.\nExperimentally, we show that the uncertainty-aware trajectory prediction of\noccluded pedestrian by the ego agent is almost similar to the ground truth\ntrajectory assuming no occlusion. 
The current research holds promise for\nuncertainty-aware navigation among multiple connected agents under occlusion.\n","authors":["Anshul Nayak","Azim Eskandarian"],"pdf_url":"https://arxiv.org/pdf/2312.03296v1.pdf","comment":"10 pages, 13 figures, 1 table"},{"id":"http://arxiv.org/abs/2311.13750v2","updated":"2023-12-06T05:20:16Z","published":"2023-11-23T00:53:11Z","title":"Towards Transferable Multi-modal Perception Representation Learning for\n Autonomy: NeRF-Supervised Masked AutoEncoder","summary":" This work proposes a unified self-supervised pre-training framework for\ntransferable multi-modal perception representation learning via masked\nmulti-modal reconstruction in Neural Radiance Field (NeRF), namely\nNeRF-Supervised Masked AutoEncoder (NS-MAE). Specifically, conditioned on\ncertain view directions and locations, multi-modal embeddings extracted from\ncorrupted multi-modal input signals, i.e., Lidar point clouds and images, are\nrendered into projected multi-modal feature maps via neural rendering. Then,\noriginal multi-modal signals serve as reconstruction targets for the rendered\nmulti-modal feature maps to enable self-supervised representation learning.\nExtensive experiments show that the representation learned via NS-MAE shows\npromising transferability for diverse multi-modal and single-modal (camera-only\nand Lidar-only) perception models on diverse 3D perception downstream tasks (3D\nobject detection and BEV map segmentation) with diverse amounts of fine-tuning\nlabeled data. Moreover, we empirically find that NS-MAE enjoys the synergy of\nboth the mechanism of masked autoencoder and neural radiance field. We hope\nthis study can inspire exploration of more general multi-modal representation\nlearning for autonomous agents.\n","authors":["Xiaohao Xu"],"pdf_url":"https://arxiv.org/pdf/2311.13750v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02168v2","updated":"2023-12-06T05:16:37Z","published":"2023-10-30T15:38:31Z","title":"The SVHN Dataset Is Deceptive for Probabilistic Generative Models Due to\n a Distribution Mismatch","summary":" The Street View House Numbers (SVHN) dataset is a popular benchmark dataset\nin deep learning. Originally designed for digit classification tasks, the SVHN\ndataset has been widely used as a benchmark for various other tasks including\ngenerative modeling. However, with this work, we aim to warn the community\nabout an issue of the SVHN dataset as a benchmark for generative modeling\ntasks: we discover that the official split into training set and test set of\nthe SVHN dataset are not drawn from the same distribution. We empirically show\nthat this distribution mismatch has little impact on the classification task\n(which may explain why this issue has not been detected before), but it\nseverely affects the evaluation of probabilistic generative models, such as\nVariational Autoencoders and diffusion models. As a workaround, we propose to\nmix and re-split the official training and test set when SVHN is used for tasks\nother than classification. We publish a new split and the indices we used to\ncreate it at https://jzenn.github.io/svhn-remix/ .\n","authors":["Tim Z. 
Xiao","Johannes Zenn","Robert Bamler"],"pdf_url":"https://arxiv.org/pdf/2312.02168v2.pdf","comment":"Accepted at NeurIPS 2023 Workshop on Distribution Shifts; 4 pages +\n appendix; proposed data set at https://jzenn.github.io/svhn-remix/"},{"id":"http://arxiv.org/abs/2312.03289v1","updated":"2023-12-06T04:38:02Z","published":"2023-12-06T04:38:02Z","title":"Class Incremental Learning for Adversarial Robustness","summary":" Adversarial training integrates adversarial examples during model training to\nenhance robustness. However, its application in fixed dataset settings differs\nfrom real-world dynamics, where data accumulates incrementally. In this study,\nwe investigate Adversarially Robust Class Incremental Learning (ARCIL), a\nmethod that combines adversarial robustness with incremental learning. We\nobserve that combining incremental learning with naive adversarial training\neasily leads to a loss of robustness. We discover that this is attributed to\nthe disappearance of the flatness of the loss function, a characteristic of\nadversarial training. To address this issue, we propose the Flatness Preserving\nDistillation (FPD) loss that leverages the output difference between\nadversarial and clean examples. Additionally, we introduce the Logit Adjustment\nDistillation (LAD) loss, which adapts the model's knowledge to perform well on\nnew tasks. Experimental results demonstrate the superiority of our method over\napproaches that apply adversarial training to existing incremental learning\nmethods, which provides a strong baseline for incremental learning on\nadversarial robustness in the future. Our method achieves AutoAttack accuracy\nthat is 5.99\\%p, 5.27\\%p, and 3.90\\%p higher on average than the baseline on\nsplit CIFAR-10, CIFAR-100, and Tiny ImageNet, respectively. The code will be\nmade available.\n","authors":["Seungju Cho","Hongshin Lee","Changick Kim"],"pdf_url":"https://arxiv.org/pdf/2312.03289v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03288v1","updated":"2023-12-06T04:36:58Z","published":"2023-12-06T04:36:58Z","title":"STEP CATFormer: Spatial-Temporal Effective Body-Part Cross Attention\n Transformer for Skeleton-based Action Recognition","summary":" Graph convolutional networks (GCNs) have been widely used and achieved\nremarkable results in skeleton-based action recognition. We think the key to\nskeleton-based action recognition is a skeleton hanging in frames, so we focus\non how the Graph Convolutional Convolution networks learn different topologies\nand effectively aggregate joint features in the global temporal and local\ntemporal. In this work, we propose three Channel-wise Tolopogy Graph\nConvolution based on Channel-wise Topology Refinement Graph Convolution\n(CTR-GCN). Combining CTR-GCN with two joint cross-attention modules can capture\nthe upper-lower body part and hand-foot relationship skeleton features. After\nthat, to capture features of human skeletons changing in frames we design the\nTemporal Attention Transformers to extract skeletons effectively. The Temporal\nAttention Transformers can learn the temporal features of human skeleton\nsequences. Finally, we fuse the temporal features output scale with MLP and\nclassification. We develop a powerful graph convolutional network named Spatial\nTemporal Effective Body-part Cross Attention Transformer which notably\nhigh-performance on the NTU RGB+D, NTU RGB+D 120 datasets. 
Our code and models\nare available at https://github.com/maclong01/STEP-CATFormer\n","authors":["Nguyen Huu Bao Long"],"pdf_url":"https://arxiv.org/pdf/2312.03288v1.pdf","comment":"Accepted to BMVC 2023: Computer Vision for Games and Games for\n Computer Vision (CVG). 9 pages"},{"id":"http://arxiv.org/abs/2312.03286v1","updated":"2023-12-06T04:32:38Z","published":"2023-12-06T04:32:38Z","title":"Indirect Gradient Matching for Adversarial Robust Distillation","summary":" Adversarial training significantly improves adversarial robustness, but\nsuperior performance is primarily attained with large models. This substantial\nperformance gap for smaller models has spurred active research into adversarial\ndistillation (AD) to mitigate the difference. Existing AD methods leverage the\nteacher's logits as a guide. In contrast to these approaches, we aim to\ntransfer another piece of knowledge from the teacher, the input gradient. In\nthis paper, we propose a distillation module termed Indirect Gradient\nDistillation Module (IGDM) that indirectly matches the student's input gradient\nwith that of the teacher. We hypothesize that students can better acquire the\nteacher's knowledge by matching the input gradient. Leveraging the observation\nthat adversarial training renders the model locally linear on the input space,\nwe employ Taylor approximation to effectively align gradients without directly\ncalculating them. Experimental results show that IGDM seamlessly integrates\nwith existing AD methods, significantly enhancing the performance of all AD\nmethods. Particularly, utilizing IGDM on the CIFAR-100 dataset improves the\nAutoAttack accuracy from 28.06% to 30.32% with the ResNet-18 model and from\n26.18% to 29.52% with the MobileNetV2 model when integrated into the SOTA\nmethod without additional data augmentation. The code will be made available.\n","authors":["Hongsin Lee","Seungju Cho","Changick Kim"],"pdf_url":"https://arxiv.org/pdf/2312.03286v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.01923v2","updated":"2023-12-06T04:19:29Z","published":"2023-06-02T21:26:20Z","title":"The Surprising Effectiveness of Diffusion Models for Optical Flow and\n Monocular Depth Estimation","summary":" Denoising diffusion probabilistic models have transformed image generation\nwith their impressive fidelity and diversity. We show that they also excel in\nestimating optical flow and monocular depth, surprisingly, without\ntask-specific architectures and loss functions that are predominant for these\ntasks. Compared to the point estimates of conventional regression-based\nmethods, diffusion models also enable Monte Carlo inference, e.g., capturing\nuncertainty and ambiguity in flow and depth. With self-supervised pre-training,\nthe combined use of synthetic and real data for supervised training, and\ntechnical innovations (infilling and step-unrolled denoising diffusion\ntraining) to handle noisy-incomplete training data, and a simple form of\ncoarse-to-fine refinement, one can train state-of-the-art diffusion models for\ndepth and optical flow estimation. Extensive experiments focus on quantitative\nperformance against benchmarks, ablations, and the model's ability to capture\nuncertainty and multimodality, and impute missing values. Our model, DDVM\n(Denoising Diffusion Vision Model), obtains a state-of-the-art relative depth\nerror of 0.074 on the indoor NYU benchmark and an Fl-all outlier rate of 3.26\\%\non the KITTI optical flow benchmark, about 25\\% better than the best published\nmethod. 
For an overview see https://diffusion-vision.github.io.\n","authors":["Saurabh Saxena","Charles Herrmann","Junhwa Hur","Abhishek Kar","Mohammad Norouzi","Deqing Sun","David J. Fleet"],"pdf_url":"https://arxiv.org/pdf/2306.01923v2.pdf","comment":"NeurIPS 2023 (Oral)"},{"id":"http://arxiv.org/abs/2306.02000v2","updated":"2023-12-06T04:19:27Z","published":"2023-06-03T04:47:05Z","title":"Context-PIPs: Persistent Independent Particles Demands Spatial Context\n Features","summary":" We tackle the problem of Persistent Independent Particles (PIPs), also called\nTracking Any Point (TAP), in videos, which specifically aims at estimating\npersistent long-term trajectories of query points in videos. Previous methods\nattempted to estimate these trajectories independently to incorporate longer\nimage sequences, therefore, ignoring the potential benefits of incorporating\nspatial context features. We argue that independent video point tracking also\ndemands spatial context features. To this end, we propose a novel framework\nContext-PIPs, which effectively improves point trajectory accuracy by\naggregating spatial context features in videos. Context-PIPs contains two main\nmodules: 1) a SOurse Feature Enhancement (SOFE) module, and 2) a TArget Feature\nAggregation (TAFA) module. Context-PIPs significantly improves PIPs all-sided,\nreducing 11.4% Average Trajectory Error of Occluded Points (ATE-Occ) on CroHD\nand increasing 11.8% Average Percentage of Correct Keypoint (A-PCK) on\nTAP-Vid-Kinectics. Demos are available at\nhttps://wkbian.github.io/Projects/Context-PIPs/.\n","authors":["Weikang Bian","Zhaoyang Huang","Xiaoyu Shi","Yitong Dong","Yijin Li","Hongsheng Li"],"pdf_url":"https://arxiv.org/pdf/2306.02000v2.pdf","comment":"Project Page: https://wkbian.github.io/Projects/Context-PIPs/"},{"id":"http://arxiv.org/abs/2307.06472v2","updated":"2023-12-06T04:04:11Z","published":"2023-07-12T22:08:22Z","title":"Early Autism Diagnosis based on Path Signature and Siamese Unsupervised\n Feature Compressor","summary":" Autism Spectrum Disorder (ASD) has been emerging as a growing public health\nthreat. Early diagnosis of ASD is crucial for timely, effective intervention\nand treatment. However, conventional diagnosis methods based on communications\nand behavioral patterns are unreliable for children younger than 2 years of\nage. Given evidences of neurodevelopmental abnormalities in ASD infants, we\nresort to a novel deep learning-based method to extract key features from the\ninherently scarce, class-imbalanced, and heterogeneous structural MR images for\nearly autism diagnosis. Specifically, we propose a Siamese verification\nframework to extend the scarce data, and an unsupervised compressor to\nalleviate data imbalance by extracting key features. We also proposed weight\nconstraints to cope with sample heterogeneity by giving different samples\ndifferent voting weights during validation, and we used Path Signature to\nunravel meaningful developmental features from the two-time point data\nlongitudinally. We further extracted machine learning focused brain regions for\nautism diagnosis. 
Extensive experiments have shown that our method performed\nwell under practical scenarios, transcending existing machine learning methods\nand providing anatomical insights for autism early diagnosis.\n","authors":["Zhuowen Yin","Xinyao Ding","Xin Zhang","Zhengwang Wu","Li Wang","Gang Li"],"pdf_url":"https://arxiv.org/pdf/2307.06472v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.07096v2","updated":"2023-12-06T03:49:10Z","published":"2023-06-12T13:20:29Z","title":"Global and Local Semantic Completion Learning for Vision-Language\n Pre-training","summary":" Cross-modal alignment plays a crucial role in vision-language pre-training\n(VLP) models, enabling them to capture meaningful associations across different\nmodalities. For this purpose, numerous masked modeling tasks have been proposed\nfor VLP to further promote cross-modal interactions. The core idea of previous\nmasked modeling tasks is to focus on reconstructing the masked tokens based on\nvisible context for learning local-local alignment. However, most of them pay\nlittle attention to the global semantic features generated for the masked data,\nresulting in a limited cross-modal alignment ability of global representations\nto local features of the other modality. Therefore, in this paper, we propose a\nnovel Global and Local Semantic Completion Learning (GLSCL) task to facilitate\nglobal-local alignment and local-local alignment simultaneously. Specifically,\nthe GLSCL task complements the missing semantics of masked data and recovers\nglobal and local features by cross-modal interactions. Our GLSCL consists of\nmasked global semantic completion (MGSC) and masked local token completion\n(MLTC). MGSC promotes learning more representative global features, which have\na great impact on the performance of downstream tasks, while MLTC reconstructs\nmodal-fusion local tokens, further enhancing accurate comprehension of\nmultimodal data. To evaluate the proposed approaches on cross-modal alignment,\nwe develop a validation benchmark called ALIGN-BENCH. Moreover, we present a\nflexible vision encoder, enabling our model to simultaneously perform\nimage-text and video-text multimodal tasks. Experimental results show that our\nproposed method obtains state-of-the-art performance on various vision-language\nbenchmarks, such as visual question answering, image-text retrieval, and\nvideo-text retrieval.\n","authors":["Rong-Cheng Tu","Yatai Ji","Jie Jiang","Weijie Kong","Chengfei Cai","Wenzhe Zhao","Hongfa Wang","Yujiu Yang","Wei Liu"],"pdf_url":"https://arxiv.org/pdf/2306.07096v2.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2211.13437"},{"id":"http://arxiv.org/abs/2312.02896v2","updated":"2023-12-06T03:46:47Z","published":"2023-12-05T17:06:59Z","title":"BenchLMM: Benchmarking Cross-style Visual Capability of Large Multimodal\n Models","summary":" Large Multimodal Models (LMMs) such as GPT-4V and LLaVA have shown remarkable\ncapabilities in visual reasoning with common image styles. However, their\nrobustness against diverse style shifts, crucial for practical applications,\nremains largely unexplored. In this paper, we propose a new benchmark,\nBenchLMM, to assess the robustness of LMMs against three different styles:\nartistic image style, imaging sensor style, and application style, where each\nstyle has five sub-styles. 
Utilizing BenchLMM, we comprehensively evaluate\nstate-of-the-art LMMs and reveal: 1) LMMs generally suffer performance\ndegradation when working with other styles; 2) An LMM performs better than\nanother model in common style does not guarantee its superior performance in\nother styles; 3) LMMs' reasoning capability can be enhanced by prompting LMMs\nto predict the style first, based on which we propose a versatile and\ntraining-free method for improving LMMs; 4) An intelligent LMM is expected to\ninterpret the causes of its errors when facing stylistic variations. We hope\nthat our benchmark and analysis can shed new light on developing more\nintelligent and versatile LMMs.\n","authors":["Rizhao Cai","Zirui Song","Dayan Guan","Zhenhao Chen","Xing Luo","Chenyu Yi","Alex Kot"],"pdf_url":"https://arxiv.org/pdf/2312.02896v2.pdf","comment":"Code is available at https://github.com/AIFEG/BenchLMM"},{"id":"http://arxiv.org/abs/2304.07444v2","updated":"2023-12-06T03:45:23Z","published":"2023-04-15T01:33:14Z","title":"The Art of Camouflage: Few-shot Learning for Animal Detection and\n Segmentation","summary":" Camouflaged object detection and segmentation is a new and challenging\nresearch topic in computer vision. There is a serious issue of lacking data of\ncamouflaged objects such as camouflaged animals in natural scenes. In this\npaper, we address the problem of few-shot learning for camouflaged object\ndetection and segmentation. To this end, we first collect a new dataset,\nCAMO-FS, for the benchmark. We then propose a novel method to efficiently\ndetect and segment the camouflaged objects in the images. In particular, we\nintroduce the instance triplet loss and the instance memory storage. The\nextensive experiments demonstrated that our proposed method achieves\nstate-of-the-art performance on the newly collected dataset.\n","authors":["Thanh-Danh Nguyen","Anh-Khoa Nguyen Vu","Nhat-Duy Nguyen","Vinh-Tiep Nguyen","Thanh Duc Ngo","Thanh-Toan Do","Minh-Triet Tran","Tam V. Nguyen"],"pdf_url":"https://arxiv.org/pdf/2304.07444v2.pdf","comment":"Under-review Journal"},{"id":"http://arxiv.org/abs/2306.01531v2","updated":"2023-12-06T03:39:15Z","published":"2023-06-02T13:35:07Z","title":"PanoGRF: Generalizable Spherical Radiance Fields for Wide-baseline\n Panoramas","summary":" Achieving an immersive experience enabling users to explore virtual\nenvironments with six degrees of freedom (6DoF) is essential for various\napplications such as virtual reality (VR). Wide-baseline panoramas are commonly\nused in these applications to reduce network bandwidth and storage\nrequirements. However, synthesizing novel views from these panoramas remains a\nkey challenge. Although existing neural radiance field methods can produce\nphotorealistic views under narrow-baseline and dense image captures, they tend\nto overfit the training views when dealing with \\emph{wide-baseline} panoramas\ndue to the difficulty in learning accurate geometry from sparse $360^{\\circ}$\nviews. To address this problem, we propose PanoGRF, Generalizable Spherical\nRadiance Fields for Wide-baseline Panoramas, which construct spherical radiance\nfields incorporating $360^{\\circ}$ scene priors. Unlike generalizable radiance\nfields trained on perspective images, PanoGRF avoids the information loss from\npanorama-to-perspective conversion and directly aggregates geometry and\nappearance features of 3D sample points from each panoramic view based on\nspherical projection. 
Moreover, as some regions of the panorama are only\nvisible from one view while invisible from others under wide baseline settings,\nPanoGRF incorporates $360^{\\circ}$ monocular depth priors into spherical depth\nestimation to improve the geometry features. Experimental results on multiple\npanoramic datasets demonstrate that PanoGRF significantly outperforms\nstate-of-the-art generalizable view synthesis methods for wide-baseline\npanoramas (e.g., OmniSyn) and perspective images (e.g., IBRNet, NeuRay).\n","authors":["Zheng Chen","Yan-Pei Cao","Yuan-Chen Guo","Chen Wang","Ying Shan","Song-Hai Zhang"],"pdf_url":"https://arxiv.org/pdf/2306.01531v2.pdf","comment":"accepted to NeurIPS2023; Project Page:\n https://thucz.github.io/PanoGRF/"},{"id":"http://arxiv.org/abs/2312.03266v1","updated":"2023-12-06T03:31:13Z","published":"2023-12-06T03:31:13Z","title":"SO-NeRF: Active View Planning for NeRF using Surrogate Objectives","summary":" Despite the great success of Neural Radiance Fields (NeRF), its\ndata-gathering process remains vague with only a general rule of thumb of\nsampling as densely as possible. The lack of understanding of what actually\nconstitutes good views for NeRF makes it difficult to actively plan a sequence\nof views that yield the maximal reconstruction quality. We propose Surrogate\nObjectives for Active Radiance Fields (SOAR), which is a set of interpretable\nfunctions that evaluates the goodness of views using geometric and photometric\nvisual cues - surface coverage, geometric complexity, textural complexity, and\nray diversity. Moreover, by learning to infer the SOAR scores from a deep\nnetwork, SOARNet, we are able to effectively select views in mere seconds\ninstead of hours, without the need for prior visits to all the candidate views\nor training any radiance field during such planning. Our experiments show\nSOARNet outperforms the baselines with $\\sim$80x speed-up while achieving\nbetter or comparable reconstruction qualities. We finally show that SOAR is\nmodel-agnostic, thus it generalizes across fully neural-implicit to fully\nexplicit approaches.\n","authors":["Keifer Lee","Shubham Gupta","Sunglyoung Kim","Bhargav Makwana","Chao Chen","Chen Feng"],"pdf_url":"https://arxiv.org/pdf/2312.03266v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2312.02439v2","updated":"2023-12-06T03:20:29Z","published":"2023-12-05T02:41:57Z","title":"Let's Think Outside the Box: Exploring Leap-of-Thought in Large Language\n Models with Creative Humor Generation","summary":" Chain-of-Thought (CoT) guides large language models (LLMs) to reason\nstep-by-step, and can motivate their logical reasoning ability. While effective\nfor logical tasks, CoT is not conducive to creative problem-solving which often\nrequires out-of-box thoughts and is crucial for innovation advancements. In\nthis paper, we explore the Leap-of-Thought (LoT) abilities within LLMs -- a\nnon-sequential, creative paradigm involving strong associations and knowledge\nleaps. To this end, we study LLMs on the popular Oogiri game which needs\nparticipants to have good creativity and strong associative thinking for\nresponding unexpectedly and humorously to the given image, text, or both, and\nthus is suitable for LoT study. 
Then to investigate LLMs' LoT ability in the\nOogiri game, we first build a multimodal and multilingual Oogiri-GO dataset\nwhich contains over 130,000 samples from the Oogiri game, and observe the\ninsufficient LoT ability or failures of most existing LLMs on the Oogiri game.\nAccordingly, we introduce a creative Leap-of-Thought (CLoT) paradigm to improve\nLLM's LoT ability. CLoT first formulates the Oogiri-GO dataset into\nLoT-oriented instruction tuning data to train pretrained LLM for achieving\ncertain LoT humor generation and discrimination abilities. Then CLoT designs an\nexplorative self-refinement that encourages the LLM to generate more creative\nLoT data via exploring parallels between seemingly unrelated concepts and\nselects high-quality data to train itself for self-refinement. CLoT not only\nexcels in humor generation in the Oogiri game but also boosts creative\nabilities in various tasks like cloud guessing game and divergent association\ntask. These findings advance our understanding and offer a pathway to improve\nLLMs' creative capacities for innovative applications across domains. The\ndataset, code, and models will be released online.\nhttps://zhongshsh.github.io/CLoT/.\n","authors":["Shanshan Zhong","Zhongzhan Huang","Shanghua Gao","Wushao Wen","Liang Lin","Marinka Zitnik","Pan Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.02439v2.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2308.06444v3","updated":"2023-12-06T02:11:15Z","published":"2023-08-12T02:38:43Z","title":"TongueSAM: An Universal Tongue Segmentation Model Based on SAM with\n Zero-Shot","summary":" Tongue segmentation serves as the primary step in automated TCM tongue\ndiagnosis, which plays a significant role in the diagnostic results. Currently,\nnumerous deep learning based methods have achieved promising results. However,\nwhen confronted with tongue images that differ from the training set or possess\nchallenging backgrounds, these methods demonstrate limited performance. To\naddress this issue, this paper proposes a universal tongue segmentation model\nnamed TongueSAM based on SAM (Segment Anything Model). SAM is a large-scale\npretrained interactive segmentation model known for its powerful zero-shot\ngeneralization capability. Applying SAM to tongue segmentation leverages its\nlearned prior knowledge from natural images, enabling the achievement of\nzero-shot segmentation for various types of tongue images. In this study, a\nPrompt Generator based on object detection is integrated into SAM to enable an\nend-to-end automated tongue segmentation method. Experiments demonstrate that\nTongueSAM achieves exceptional performance across various of tongue\nsegmentation datasets, particularly under zero-shot. Even when dealing with\nchallenging background tongue images, TongueSAM achieves a mIoU of 95.23\\%\nunder zero-shot conditions, surpassing other segmentation methods. As far as we\nknow, this is the first application of large-scale pretrained model for tongue\nsegmentation. The project mentioned in this paper is currently publicly\navailable.\n","authors":["Shan Cao","Qunsheng Ruan","Linjian Ma"],"pdf_url":"https://arxiv.org/pdf/2308.06444v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.00564v3","updated":"2023-12-06T02:10:09Z","published":"2022-12-01T15:11:21Z","title":"Leveraging Single-View Images for Unsupervised 3D Point Cloud Completion","summary":" Point clouds captured by scanning devices are often incomplete due to\nocclusion. 
To overcome this limitation, point cloud completion methods have\nbeen developed to predict the complete shape of an object based on its partial\ninput. These methods can be broadly classified as supervised or unsupervised.\nHowever, both categories require a large number of 3D complete point clouds,\nwhich may be difficult to capture. In this paper, we propose Cross-PCC, an\nunsupervised point cloud completion method without requiring any 3D complete\npoint clouds. We only utilize 2D images of the complete objects, which are\neasier to capture than 3D complete and clean point clouds. Specifically, to\ntake advantage of the complementary information from 2D images, we use a\nsingle-view RGB image to extract 2D features and design a fusion module to fuse\nthe 2D and 3D features extracted from the partial point cloud. To guide the\nshape of predicted point clouds, we project the predicted points of the object\nto the 2D plane and use the foreground pixels of its silhouette maps to\nconstrain the position of the projected points. To reduce the outliers of the\npredicted point clouds, we propose a view calibrator to move the points\nprojected to the background into the foreground by the single-view silhouette\nimage. To the best of our knowledge, our approach is the first point cloud\ncompletion method that does not require any 3D supervision. The experimental\nresults of our method are superior to those of the state-of-the-art\nunsupervised methods by a large margin. Moreover, our method even achieves\ncomparable performance to some supervised methods. We will make the source code\npublicly available at https://github.com/ltwu6/cross-pcc.\n","authors":["Lintai Wu","Qijian Zhang","Junhui Hou","Yong Xu"],"pdf_url":"https://arxiv.org/pdf/2212.00564v3.pdf","comment":"Accepted in IEEE Transactions on Multimedia"},{"id":"http://arxiv.org/abs/2312.03231v1","updated":"2023-12-06T01:59:47Z","published":"2023-12-06T01:59:47Z","title":"Deep Multimodal Fusion for Surgical Feedback Classification","summary":" Quantification of real-time informal feedback delivered by an experienced\nsurgeon to a trainee during surgery is important for skill improvements in\nsurgical training. Such feedback in the live operating room is inherently\nmultimodal, consisting of verbal conversations (e.g., questions and answers) as\nwell as non-verbal elements (e.g., through visual cues like pointing to\nanatomic elements). In this work, we leverage a clinically-validated\nfive-category classification of surgical feedback: \"Anatomic\", \"Technical\",\n\"Procedural\", \"Praise\" and \"Visual Aid\". We then develop a multi-label machine\nlearning model to classify these five categories of surgical feedback from\ninputs of text, audio, and video modalities. The ultimate goal of our work is\nto help automate the annotation of real-time contextual surgical feedback at\nscale. Our automated classification of surgical feedback achieves AUCs ranging\nfrom 71.5 to 77.6 with the fusion improving performance by 3.1%. We also show\nthat high-quality manual transcriptions of feedback audio from experts improve\nAUCs to between 76.5 and 96.2, which demonstrates a clear path toward future\nimprovements. Empirically, we find that the Staged training strategy, with\nfirst pre-training each modality separately and then training them jointly, is\nmore effective than training different modalities altogether. We also present\nintuitive findings on the importance of modalities for different feedback\ncategories. 
This work offers an important first look at the feasibility of\nautomated classification of real-world live surgical feedback based on text,\naudio, and video modalities.\n","authors":["Rafal Kocielnik","Elyssa Y. Wong","Timothy N. Chu","Lydia Lin","De-An Huang","Jiayun Wang","Anima Anandkumar","Andrew J. Hung"],"pdf_url":"https://arxiv.org/pdf/2312.03231v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.08333v3","updated":"2023-12-06T01:53:04Z","published":"2023-08-16T12:46:52Z","title":"Improving Depth Gradient Continuity in Transformers: A Comparative Study\n on Monocular Depth Estimation with CNN","summary":" Monocular depth estimation is an ongoing challenge in computer vision. Recent\nprogress with Transformer models has demonstrated notable advantages over\nconventional CNNs in this area. However, there's still a gap in understanding\nhow these models prioritize different regions in 2D images and how these\nregions affect depth estimation performance. To explore the differences between\nTransformers and CNNs, we employ a sparse pixel approach to contrastively\nanalyze the distinctions between the two. Our findings suggest that while\nTransformers excel in handling global context and intricate textures, they lag\nbehind CNNs in preserving depth gradient continuity. To further enhance the\nperformance of Transformer models in monocular depth estimation, we propose the\nDepth Gradient Refinement (DGR) module that refines depth estimation through\nhigh-order differentiation, feature fusion, and recalibration. Additionally, we\nleverage optimal transport theory, treating depth maps as spatial probability\ndistributions, and employ the optimal transport distance as a loss function to\noptimize our model. Experimental results demonstrate that models integrated\nwith the plug-and-play Depth Gradient Refinement (DGR) module and the proposed\nloss function enhance performance without increasing complexity and\ncomputational costs on both outdoor KITTI and indoor NYU-Depth-v2 datasets.\nThis research not only offers fresh insights into the distinctions between\nTransformers and CNNs in depth estimation but also paves the way for novel\ndepth estimation methodologies.\n","authors":["Jiawei Yao","Tong Wu","Xiaofeng Zhang"],"pdf_url":"https://arxiv.org/pdf/2308.08333v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03227v1","updated":"2023-12-06T01:51:54Z","published":"2023-12-06T01:51:54Z","title":"Human Body Model based ID using Shape and Pose Parameters","summary":" We present a Human Body model based IDentification system (HMID) system that\nis jointly trained for shape, pose and biometric identification. HMID is based\non the Human Mesh Recovery (HMR) network and we propose additional losses to\nimprove and stabilize shape estimation and biometric identification while\nmaintaining the pose and shape output. We show that when our HMID network is\ntrained using additional shape and pose losses, it shows a significant\nimprovement in biometric identification performance when compared to an\nidentical model that does not use such losses. The HMID model uses raw images\ninstead of silhouettes and is able to perform robust recognition on images\ncollected at range and altitude as many anthropometric properties are\nreasonably invariant to clothing, view and range. We show results on the USF\ndataset as well as the BRIAR dataset which includes probes with both clothing\nand view changes. 
Our approach (using body model losses) shows a significant\nimprovement in Rank20 accuracy and True Accuracy Rate on the BRIAR evaluation\ndataset.\n","authors":["Aravind Sundaresan","Brian Burns","Indranil Sur","Yi Yao","Xiao Lin","Sujeong Kim"],"pdf_url":"https://arxiv.org/pdf/2312.03227v1.pdf","comment":"to be published in IEEE International Joint Conference on Biometrics,\n Ljubljana, Slovenia 2023"},{"id":"http://arxiv.org/abs/2312.03226v1","updated":"2023-12-06T01:51:03Z","published":"2023-12-06T01:51:03Z","title":"Rethinking Object Saliency Ranking: A Novel Whole-flow Processing\n Paradigm","summary":" Existing salient object detection methods are capable of predicting binary\nmaps that highlight visually salient regions. However, these methods are\nlimited in their ability to differentiate the relative importance of multiple\nobjects and the relationships among them, which can lead to errors and reduced\naccuracy in downstream tasks that depend on the relative importance of multiple\nobjects. To address this, this paper proposes a new paradigm for saliency ranking,\nwhich aims to completely focus on ranking salient objects by their \"importance\norder\". While previous works have shown promising performance, they still face\nill-posed problems. First, the saliency ranking ground truth (GT) order\ngeneration methods are unreasonable since determining the correct ranking order\nis not well-defined, resulting in false alarms. Second, training a ranking\nmodel remains challenging because most saliency ranking methods follow the\nmulti-task paradigm, leading to conflicts and trade-offs among different tasks.\nThird, existing regression-based saliency ranking methods are complex for\nsaliency ranking models due to their reliance on instance mask-based saliency\nranking orders. These methods require a significant amount of data to perform\naccurately and can be challenging to implement effectively. To solve these\nproblems, this paper conducts an in-depth analysis of the causes and proposes a\nwhole-flow processing paradigm of the saliency ranking task from the perspective of\n\"GT data generation\", \"network structure design\" and \"training protocol\". The\nproposed approach outperforms existing state-of-the-art methods on the\nwidely-used SALICON set, as demonstrated by extensive experiments with fair and\nreasonable comparisons. The saliency ranking task is still in its infancy, and\nour proposed unified framework can serve as a fundamental strategy to guide\nfuture work.\n","authors":["Mengke Song","Linfeng Li","Dunquan Wu","Wenfeng Song","Chenglizhao Chen"],"pdf_url":"https://arxiv.org/pdf/2312.03226v1.pdf","comment":"16 pages, 14 figures, accepted by IEEE Transactions on Image\n Processing"},{"id":"http://arxiv.org/abs/2309.02959v3","updated":"2023-12-06T01:48:13Z","published":"2023-09-06T12:55:59Z","title":"A Non-Invasive Interpretable NAFLD Diagnostic Method Combining TCM\n Tongue Features","summary":" Non-alcoholic fatty liver disease (NAFLD) is a clinicopathological syndrome\ncharacterized by hepatic steatosis resulting from the exclusion of alcohol and\nother identifiable liver-damaging factors. It has emerged as a leading cause of\nchronic liver disease worldwide. Currently, the conventional methods for NAFLD\ndetection are expensive and not suitable for users to perform daily\ndiagnostics. 
To address this issue, this study proposes a non-invasive and\ninterpretable NAFLD diagnostic method, for which the only required user-provided indicators\nare Gender, Age, Height, Weight, Waist Circumference, Hip Circumference,\nand a tongue image. This method involves merging patients' physiological\nindicators with tongue features, which are then input into a fusion network\nnamed SelectorNet. SelectorNet combines attention mechanisms with feature\nselection mechanisms, enabling it to autonomously learn to select\nimportant features. The experimental results show that the proposed method\nachieves an accuracy of 77.22\\% using only non-invasive data, and it also\nprovides compelling interpretability matrices. This study contributes to the\nearly diagnosis of NAFLD and the intelligent advancement of TCM tongue\ndiagnosis. The project mentioned in this paper is currently publicly available.\n","authors":["Shan Cao","Qunsheng Ruan","Qingfeng Wu","Weiqiang Lin"],"pdf_url":"https://arxiv.org/pdf/2309.02959v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03222v1","updated":"2023-12-06T01:41:49Z","published":"2023-12-06T01:41:49Z","title":"Predicting Scores of Various Aesthetic Attribute Sets by Learning from\n Overall Score Labels","summary":" Many mobile phones now embed deep-learning models for evaluation or guidance\non photography. These models cannot provide detailed results like human pose\nscores or scene color scores because of the scarcity of corresponding aesthetic\nattribute data. However, the annotation of image aesthetic attribute scores\nrequires experienced artists and professional photographers, which hinders the\ncollection of large-scale fully-annotated datasets. In this paper, we propose\nto replace image attribute labels with feature extractors. First, a novel\naesthetic attribute evaluation framework based on attribute features is\nproposed to predict attribute scores and overall scores. We call it the F2S\n(attribute features to attribute scores) model. We use networks from different\ntasks to provide attribute features to our F2S models. Then, we define an\naesthetic attribute contribution to describe the role of aesthetic attributes\nthroughout an image and use it with the attribute scores and the overall scores\nto train our F2S model. Sufficient experiments on publicly available datasets\ndemonstrate that our F2S model achieves comparable performance with those\ntrained on the datasets with fully-annotated aesthetic attribute score labels.\nOur method makes it feasible to learn meaningful attribute scores for various\naesthetic attribute sets in different types of images with only overall\naesthetic scores.\n","authors":["Heng Huang","Xin Jin","Yaqi Liu","Hao Lou","Chaoen Xiao","Shuai Cui","Xinning Li","Dongqing Zou"],"pdf_url":"https://arxiv.org/pdf/2312.03222v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01650v2","updated":"2023-12-06T01:30:10Z","published":"2023-12-04T06:01:46Z","title":"Adaptive Confidence Threshold for ByteTrack in Multi-Object Tracking","summary":" We investigate the application of ByteTrack in the realm of multiple object\ntracking. ByteTrack, a simple tracking algorithm, enables the simultaneous\ntracking of multiple objects by strategically incorporating detections with a\nlow confidence threshold. Conventionally, objects are initially associated with\nhigh confidence threshold detections. 
When the association between objects and\ndetections becomes ambiguous, ByteTrack extends the association to lower\nconfidence threshold detections. One notable drawback of the existing ByteTrack\napproach is its reliance on a fixed threshold to differentiate between high and\nlow-confidence detections. In response to this limitation, we introduce a novel\nand adaptive approach. Our proposed method entails a dynamic adjustment of the\nconfidence threshold, leveraging insights derived from overall detections.\nThrough experimentation, we demonstrate the effectiveness of our adaptive\nconfidence threshold technique while maintaining running time compared to\nByteTrack.\n","authors":["Linh Van Ma","Muhammad Ishfaq Hussain","JongHyun Park","Jeongbae Kim","Moongu Jeon"],"pdf_url":"https://arxiv.org/pdf/2312.01650v2.pdf","comment":"The 12th International Conference on Control, Automation and\n Information Sciences (ICCAIS 2023), November 27th to 29th, 2023 in Hanoi"},{"id":"http://arxiv.org/abs/2304.12507v2","updated":"2023-12-06T01:10:27Z","published":"2023-04-25T01:12:47Z","title":"Learning Task-Specific Strategies for Accelerated MRI","summary":" Compressed sensing magnetic resonance imaging (CS-MRI) seeks to recover\nvisual information from subsampled measurements for diagnostic tasks.\nTraditional CS-MRI methods often separately address measurement subsampling,\nimage reconstruction, and task prediction, resulting in a suboptimal end-to-end\nperformance. In this work, we propose TACKLE as a unified co-design framework\nfor jointly optimizing subsampling, reconstruction, and prediction strategies\nfor the performance on downstream tasks. The na\\\"ive approach of simply\nappending a task prediction module and training with a task-specific loss leads\nto suboptimal downstream performance. Instead, we develop a training procedure\nwhere a backbone architecture is first trained for a generic pre-training task\n(image reconstruction in our case), and then fine-tuned for different\ndownstream tasks with a prediction head. Experimental results on multiple\npublic MRI datasets show that TACKLE achieves an improved performance on\nvarious tasks over traditional CS-MRI methods. We also demonstrate that TACKLE\nis robust to distribution shifts by showing that it generalizes to a new\ndataset we experimentally collected using different acquisition setups from the\ntraining data. Without additional fine-tuning, TACKLE leads to both numerical\nand visual improvements compared to existing baselines. We have further\nimplemented a learned 4$\\times$-accelerated sequence on a Siemens 3T MRI Skyra\nscanner. Compared to the fully-sampling scan that takes 335 seconds, our\noptimized sequence only takes 84 seconds, achieving a four-fold time reduction\nas desired, while maintaining high performance.\n","authors":["Zihui Wu","Tianwei Yin","Yu Sun","Robert Frost","Andre van der Kouwe","Adrian V. Dalca","Katherine L. Bouman"],"pdf_url":"https://arxiv.org/pdf/2304.12507v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03209v1","updated":"2023-12-06T00:51:38Z","published":"2023-12-06T00:51:38Z","title":"Cache Me if You Can: Accelerating Diffusion Models through Block Caching","summary":" Diffusion models have recently revolutionized the field of image synthesis\ndue to their ability to generate photorealistic images. However, one of the\nmajor drawbacks of diffusion models is that the image generation process is\ncostly. 
A large image-to-image network has to be applied many times to\niteratively refine an image from random noise. While many recent works propose\ntechniques to reduce the number of required steps, they generally treat the\nunderlying denoising network as a black box. In this work, we investigate the\nbehavior of the layers within the network and find that 1) the layers' output\nchanges smoothly over time, 2) the layers show distinct patterns of change, and\n3) the change from step to step is often very small. We hypothesize that many\nlayer computations in the denoising network are redundant. Leveraging this, we\nintroduce block caching, in which we reuse outputs from layer blocks of\nprevious steps to speed up inference. Furthermore, we propose a technique to\nautomatically determine caching schedules based on each block's changes over\ntimesteps. In our experiments, we show through FID, human evaluation and\nqualitative analysis that Block Caching allows to generate images with higher\nvisual quality at the same computational cost. We demonstrate this for\ndifferent state-of-the-art models (LDM and EMU) and solvers (DDIM and DPM).\n","authors":["Felix Wimbauer","Bichen Wu","Edgar Schoenfeld","Xiaoliang Dai","Ji Hou","Zijian He","Artsiom Sanakoyeu","Peizhao Zhang","Sam Tsai","Jonas Kohler","Christian Rupprecht","Daniel Cremers","Peter Vajda","Jialiang Wang"],"pdf_url":"https://arxiv.org/pdf/2312.03209v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03207v1","updated":"2023-12-06T00:48:50Z","published":"2023-12-06T00:48:50Z","title":"Satellite Imagery and AI: A New Era in Ocean Conservation, from Research\n to Deployment and Impact","summary":" Illegal, unreported, and unregulated (IUU) fishing poses a global threat to\nocean habitats. Publicly available satellite data offered by NASA and the\nEuropean Space Agency (ESA) provide an opportunity to actively monitor this\nactivity. Effectively leveraging satellite data for maritime conservation\nrequires highly reliable machine learning models operating globally with\nminimal latency. This paper introduces three specialized computer vision models\ndesigned for synthetic aperture radar (Sentinel-1), optical imagery\n(Sentinel-2), and nighttime lights (Suomi-NPP/NOAA-20). It also presents best\npractices for developing and delivering real-time computer vision services for\nconservation. These models have been deployed in Skylight, a real time maritime\nmonitoring platform, which is provided at no cost to users worldwide.\n","authors":["Patrick Beukema","Favyen Bastani","Piper Wolters","Henry Herzog","Joe Ferdinando"],"pdf_url":"https://arxiv.org/pdf/2312.03207v1.pdf","comment":"8 pages, 3 figures, submitted to NeurIPS Computational Sustainability\n 2023"},{"id":"http://arxiv.org/abs/2210.05952v2","updated":"2023-12-06T00:48:13Z","published":"2022-10-12T06:35:04Z","title":"3D Brain and Heart Volume Generative Models: A Survey","summary":" Generative models such as generative adversarial networks and autoencoders\nhave gained a great deal of attention in the medical field due to their\nexcellent data generation capability. This paper provides a comprehensive\nsurvey of generative models for three-dimensional (3D) volumes, focusing on the\nbrain and heart. A new and elaborate taxonomy of unconditional and conditional\ngenerative models is proposed to cover diverse medical tasks for the brain and\nheart: unconditional synthesis, classification, conditional synthesis,\nsegmentation, denoising, detection, and registration. 
We provide relevant\nbackground, examine each task and also suggest potential future directions. A\nlist of the latest publications will be updated on Github to keep up with the\nrapid influx of papers at\nhttps://github.com/csyanbin/3D-Medical-Generative-Survey.\n","authors":["Yanbin Liu","Girish Dwivedi","Farid Boussaid","Mohammed Bennamoun"],"pdf_url":"https://arxiv.org/pdf/2210.05952v2.pdf","comment":"Accepted at ACM Computing Surveys (CSUR) 2023"},{"id":"http://arxiv.org/abs/2312.03203v1","updated":"2023-12-06T00:46:30Z","published":"2023-12-06T00:46:30Z","title":"Feature 3DGS: Supercharging 3D Gaussian Splatting to Enable Distilled\n Feature Fields","summary":" 3D scene representations have gained immense popularity in recent years.\nMethods that use Neural Radiance fields are versatile for traditional tasks\nsuch as novel view synthesis. In recent times, some work has emerged that aims\nto extend the functionality of NeRF beyond view synthesis, for semantically\naware tasks such as editing and segmentation using 3D feature field\ndistillation from 2D foundation models. However, these methods have two major\nlimitations: (a) they are limited by the rendering speed of NeRF pipelines, and\n(b) implicitly represented feature fields suffer from continuity artifacts\nreducing feature quality. Recently, 3D Gaussian Splatting has shown\nstate-of-the-art performance on real-time radiance field rendering. In this\nwork, we go one step further: in addition to radiance field rendering, we\nenable 3D Gaussian splatting on arbitrary-dimension semantic features via 2D\nfoundation model distillation. This translation is not straightforward: naively\nincorporating feature fields in the 3DGS framework leads to warp-level\ndivergence. We propose architectural and training changes to efficiently avert\nthis problem. Our proposed method is general, and our experiments showcase\nnovel view semantic segmentation, language-guided editing and segment anything\nthrough learning feature fields from state-of-the-art 2D foundation models such\nas SAM and CLIP-LSeg. Across experiments, our distillation method is able to\nprovide comparable or better results, while being significantly faster to both\ntrain and render. Additionally, to the best of our knowledge, we are the first\nmethod to enable point and bounding-box prompting for radiance field\nmanipulation, by leveraging the SAM model. Project website at:\nhttps://feature-3dgs.github.io/\n","authors":["Shijie Zhou","Haoran Chang","Sicheng Jiang","Zhiwen Fan","Zehao Zhu","Dejia Xu","Pradyumna Chari","Suya You","Zhangyang Wang","Achuta Kadambi"],"pdf_url":"https://arxiv.org/pdf/2312.03203v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.04749v2","updated":"2023-12-06T00:45:08Z","published":"2023-07-10T17:54:57Z","title":"Divide, Evaluate, and Refine: Evaluating and Improving Text-to-Image\n Alignment with Iterative VQA Feedback","summary":" The field of text-conditioned image generation has made unparalleled progress\nwith the recent advent of latent diffusion models. While remarkable, as the\ncomplexity of given text input increases, the state-of-the-art diffusion models\nmay still fail in generating images which accurately convey the semantics of\nthe given prompt. Furthermore, it has been observed that such misalignments are\noften left undetected by pretrained multi-modal models such as CLIP. 
To address\nthese problems, in this paper we explore a simple yet effective decompositional\napproach towards both evaluation and improvement of text-to-image alignment. In\nparticular, we first introduce a Decompositional-Alignment-Score which, given a\ncomplex prompt, decomposes it into a set of disjoint assertions. The alignment\nof each assertion with generated images is then measured using a VQA model.\nFinally, alignment scores for different assertions are combined a posteriori to\ngive the final text-to-image alignment score. Experimental analysis reveals\nthat the proposed alignment metric shows significantly higher correlation with\nhuman ratings than traditional CLIP and BLIP scores. Furthermore, we also\nfind that the assertion-level alignment scores provide useful feedback which\ncan then be used in a simple iterative procedure to gradually increase the\nexpression of different assertions in the final image outputs. Human user\nstudies indicate that the proposed approach surpasses the previous state-of-the-art\nby 8.7% in overall text-to-image alignment accuracy. The project page for our paper\nis available at https://1jsingh.github.io/divide-evaluate-and-refine\n","authors":["Jaskirat Singh","Liang Zheng"],"pdf_url":"https://arxiv.org/pdf/2307.04749v2.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2103.16669v3","updated":"2023-12-06T16:29:11Z","published":"2021-03-30T20:28:02Z","title":"An In-depth Analysis of Passage-Level Label Transfer for Contextual\n Document Ranking","summary":" Pre-trained contextual language models such as BERT, GPT, and XLnet work\nquite well for document retrieval tasks. Such models are fine-tuned based on\nthe query-document/query-passage level relevance labels to capture the ranking\nsignals. However, the documents are longer than the passages and such document\nranking models suffer from the token limitation (512) of BERT. Researchers\nproposed ranking strategies that either truncate the documents beyond the token\nlimit or chunk the documents into units that can fit into BERT. In the\nlatter case, the relevance labels are either directly transferred from the\noriginal query-document pair or learned through some external model. In this\npaper, we conduct a detailed study of the design decisions about splitting and\nlabel transfer on retrieval effectiveness and efficiency. We find that direct\ntransfer of relevance labels from documents to passages introduces label noise\nthat strongly affects retrieval effectiveness for large training datasets. We\nalso find that query processing times are adversely affected by fine-grained\nsplitting schemes. 
As a remedy, we propose a careful passage level labelling\nscheme using weak supervision that delivers improved performance (3-14% in\nterms of nDCG score) over most of the recently proposed models for ad-hoc\nretrieval while maintaining manageable computational complexity on four diverse\ndocument retrieval datasets.\n","authors":["Koustav Rudra","Zeon Trevor Fernando","Avishek Anand"],"pdf_url":"https://arxiv.org/pdf/2103.16669v3.pdf","comment":"Paper is about the performance analysis of contextual ranking\n strategies in an ad-hoc document retrieval"},{"id":"http://arxiv.org/abs/2207.02160v2","updated":"2023-12-06T15:29:02Z","published":"2022-07-05T16:28:47Z","title":"A Comprehensive Review of Visual-Textual Sentiment Analysis from Social\n Media Networks","summary":" Social media networks have become a significant aspect of people's lives,\nserving as a platform for their ideas, opinions and emotions. Consequently,\nautomated sentiment analysis (SA) is critical for recognising people's feelings\nin ways that other information sources cannot. The analysis of these feelings\nrevealed various applications, including brand evaluations, YouTube film\nreviews and healthcare applications. As social media continues to develop,\npeople post a massive amount of information in different forms, including text,\nphotos, audio and video. Thus, traditional SA algorithms have become limited,\nas they do not consider the expressiveness of other modalities. By including\nsuch characteristics from various material sources, these multimodal data\nstreams provide new opportunities for optimising the expected results beyond\ntext-based SA. Our study focuses on the forefront field of multimodal SA, which\nexamines visual and textual data posted on social media networks. Many people\nare more likely to utilise this information to express themselves on these\nplatforms. To serve as a resource for academics in this rapidly growing field,\nwe introduce a comprehensive overview of textual and visual SA, including data\npre-processing, feature extraction techniques, sentiment benchmark datasets,\nand the efficacy of multiple classification methodologies suited to each field.\nWe also provide a brief introduction of the most frequently utilised data\nfusion strategies and a summary of existing research on visual-textual SA.\nFinally, we highlight the most significant challenges and investigate several\nimportant sentiment applications.\n","authors":["Israa Khalaf Salman Al-Tameemi","Mohammad-Reza Feizi-Derakhshi","Saeed Pashazadeh","Mohammad Asadpour"],"pdf_url":"https://arxiv.org/pdf/2207.02160v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03494v1","updated":"2023-12-06T13:36:20Z","published":"2023-12-06T13:36:20Z","title":"Boosting legal case retrieval by query content selection with large\n language models","summary":" Legal case retrieval, which aims to retrieve relevant cases to a given query\ncase, benefits judgment justice and attracts increasing attention. Unlike\ngeneric retrieval queries, legal case queries are typically long and the\ndefinition of relevance is closely related to legal-specific elements.\nTherefore, legal case queries may suffer from noise and sparsity of salient\ncontent, which hinders retrieval models from perceiving correct information in\na query. While previous studies have paid attention to improving retrieval\nmodels and understanding relevance judgments, we focus on enhancing legal case\nretrieval by utilizing the salient content in legal case queries. 
We first\nannotate the salient content in queries manually and investigate how sparse and\ndense retrieval models attend to those content. Then we experiment with various\nquery content selection methods utilizing large language models (LLMs) to\nextract or summarize salient content and incorporate it into the retrieval\nmodels. Experimental results show that reformulating long queries using LLMs\nimproves the performance of both sparse and dense models in legal case\nretrieval.\n","authors":["Youchao Zhou","Heyan Huang","Zhijing Wu"],"pdf_url":"https://arxiv.org/pdf/2312.03494v1.pdf","comment":"This work was accepted as 23-SIGIR-AP main conference paper"},{"id":"http://arxiv.org/abs/2312.03463v1","updated":"2023-12-06T12:37:28Z","published":"2023-12-06T12:37:28Z","title":"DBCopilot: Scaling Natural Language Querying to Massive Databases","summary":" Text-to-SQL simplifies database interactions by enabling non-experts to\nconvert their natural language (NL) questions into Structured Query Language\n(SQL) queries. While recent advances in large language models (LLMs) have\nimproved the zero-shot text-to-SQL paradigm, existing methods face scalability\nchallenges when dealing with massive, dynamically changing databases. This\npaper introduces DBCopilot, a framework that addresses these challenges by\nemploying a compact and flexible copilot model for routing across massive\ndatabases. Specifically, DBCopilot decouples the text-to-SQL process into\nschema routing and SQL generation, leveraging a lightweight\nsequence-to-sequence neural network-based router to formulate database\nconnections and navigate natural language questions through databases and\ntables. The routed schemas and questions are then fed into LLMs for efficient\nSQL generation. Furthermore, DBCopilot also introduced a reverse\nschema-to-question generation paradigm, which can learn and adapt the router\nover massive databases automatically without requiring manual intervention.\nExperimental results demonstrate that DBCopilot is a scalable and effective\nsolution for real-world text-to-SQL tasks, providing a significant advancement\nin handling large-scale schemas.\n","authors":["Tianshu Wang","Hongyu Lin","Xianpei Han","Le Sun","Xiaoyang Chen","Hao Wang","Zhenyu Zeng"],"pdf_url":"https://arxiv.org/pdf/2312.03463v1.pdf","comment":"Code and data are available at https://github.com/tshu-w/DBCopilot"},{"id":"http://arxiv.org/abs/2312.02429v2","updated":"2023-12-06T03:08:31Z","published":"2023-12-05T02:08:48Z","title":"PEFA: Parameter-Free Adapters for Large-scale Embedding-based Retrieval\n Models","summary":" Embedding-based Retrieval Models (ERMs) have emerged as a promising framework\nfor large-scale text retrieval problems due to powerful large language models.\nNevertheless, fine-tuning ERMs to reach state-of-the-art results can be\nexpensive due to the extreme scale of data as well as the complexity of\nmulti-stages pipelines (e.g., pre-training, fine-tuning, distillation). In this\nwork, we propose the PEFA framework, namely ParamEter-Free Adapters, for fast\ntuning of ERMs without any backward pass in the optimization. At index building\nstage, PEFA equips the ERM with a non-parametric k-nearest neighbor (kNN)\ncomponent. At inference stage, PEFA performs a convex combination of two\nscoring functions, one from the ERM and the other from the kNN. 
Based on the\nneighborhood definition, PEFA framework induces two realizations, namely\nPEFA-XL (i.e., extra large) using double ANN indices and PEFA-XS (i.e., extra\nsmall) using a single ANN index. Empirically, PEFA achieves significant\nimprovement on two retrieval applications. For document retrieval, regarding\nRecall@100 metric, PEFA improves not only pre-trained ERMs on Trivia-QA by an\naverage of 13.2%, but also fine-tuned ERMs on NQ-320K by an average of 5.5%,\nrespectively. For product search, PEFA improves the Recall@100 of the\nfine-tuned ERMs by an average of 5.3% and 14.5%, for PEFA-XS and PEFA-XL,\nrespectively. Our code is available at\nhttps://github.com/amzn/pecos/tree/mainline/examples/pefa-wsdm24.\n","authors":["Wei-Cheng Chang","Jyun-Yu Jiang","Jiong Zhang","Mutasem Al-Darabsah","Choon Hui Teo","Cho-Jui Hsieh","Hsiang-Fu Yu","S. V. N. Vishwanathan"],"pdf_url":"https://arxiv.org/pdf/2312.02429v2.pdf","comment":"Accept by WSDM 2024"},{"id":"http://arxiv.org/abs/2308.16761v4","updated":"2023-12-06T02:40:53Z","published":"2023-08-31T14:29:10Z","title":"Learning Category Trees for ID-Based Recommendation: Exploring the Power\n of Differentiable Vector Quantization","summary":" Category information plays a crucial role in enhancing the quality and\npersonalization of recommender systems. Nevertheless, the availability of item\ncategory information is not consistently present, particularly in the context\nof ID-based recommendations. In this work, we propose a novel approach to\nautomatically learn and generate entity (i.e., user or item) category trees for\nID-based recommendation. Specifically, we devise a differentiable vector\nquantization framework for automatic category tree generation, namely CAGE,\nwhich enables the simultaneous learning and refinement of categorical code\nrepresentations and entity embeddings in an end-to-end manner, starting from\nthe randomly initialized states. With its high adaptability, CAGE can be easily\nintegrated into both sequential and non-sequential recommender systems. We\nvalidate the effectiveness of CAGE on various recommendation tasks including\nlist completion, collaborative filtering, and click-through rate prediction,\nacross different recommendation models. We release the code and data for others\nto reproduce the reported results.\n","authors":["Qijiong Liu","Jiaren Xiao","Lu Fan","Jieming Zhu","Xiao-Ming Wu"],"pdf_url":"https://arxiv.org/pdf/2308.16761v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03217v1","updated":"2023-12-06T01:15:40Z","published":"2023-12-06T01:15:40Z","title":"Rethinking E-Commerce Search","summary":" E-commerce search and recommendation usually operate on structured data such\nas product catalogs and taxonomies. However, creating better search and\nrecommendation systems often requires a large variety of unstructured data\nincluding customer reviews and articles on the web. Traditionally, the solution\nhas always been converting unstructured data into structured data through\ninformation extraction, and conducting search over the structured data.\nHowever, this is a costly approach that often has low quality. In this paper,\nwe envision a solution that does entirely the opposite. Instead of converting\nunstructured data (web pages, customer reviews, etc) to structured data, we\ninstead convert structured data (product inventory, catalogs, taxonomies, etc)\ninto textual data, which can be easily integrated into the text corpus that\ntrains LLMs. 
Then, search and recommendation can be performed through a Q/A\nmechanism through an LLM instead of using traditional information retrieval\nmethods over structured data.\n","authors":["Haixun Wang","Taesik Na"],"pdf_url":"https://arxiv.org/pdf/2312.03217v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03785v1","updated":"2023-12-06T11:00:59Z","published":"2023-12-06T11:00:59Z","title":"Sports Recommender Systems: Overview and Research Issues","summary":" Sports recommender systems receive an increasing attention due to their\npotential of fostering healthy living, improving personal well-being, and\nincreasing performances in sport. These systems support people in sports, for\nexample, by the recommendation of healthy and performance boosting food items,\nthe recommendation of training practices, talent and team recommendation, and\nthe recommendation of specific tactics in competitions. With applications in\nthe virtual world, for example, the recommendation of maps or opponents in\ne-sports, these systems already transcend conventional sports scenarios where\nphysical presence is needed. On the basis of different working examples, we\npresent an overview of sports recommender systems applications and techniques.\nOverall, we analyze the related state-of-the-art and discuss open research\nissues.\n","authors":["Alexander Felfernig","Manfred Wundara","Thi Ngoc Trang Tran","Viet-Man Le","Sebastian Lubos","Seda Polat-Erdeniz"],"pdf_url":"https://arxiv.org/pdf/2312.03785v1.pdf","comment":"Article under review in the Journal of Intelligent Information\n Systems (Springer JIIS)"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2312.03700v1","updated":"2023-12-06T18:59:19Z","published":"2023-12-06T18:59:19Z","title":"OneLLM: One Framework to Align All Modalities with Language","summary":" Multimodal large language models (MLLMs) have gained significant attention\ndue to their strong multimodal understanding capability. However, existing\nworks rely heavily on modality-specific encoders, which usually differ in\narchitecture and are limited to common modalities. In this paper, we present\nOneLLM, an MLLM that aligns eight modalities to language using a unified\nframework. We achieve this through a unified multimodal encoder and a\nprogressive multimodal alignment pipeline. In detail, we first train an image\nprojection module to connect a vision encoder with LLM. Then, we build a\nuniversal projection module (UPM) by mixing multiple image projection modules\nand dynamic routing. Finally, we progressively align more modalities to LLM\nwith the UPM. To fully leverage the potential of OneLLM in following\ninstructions, we also curated a comprehensive multimodal instruction dataset,\nincluding 2M items from image, audio, video, point cloud, depth/normal map, IMU\nand fMRI brain activity. OneLLM is evaluated on 25 diverse benchmarks,\nencompassing tasks such as multimodal captioning, question answering and\nreasoning, where it delivers excellent performance. 
Code, data, model and\nonline demo are available at https://github.com/csuhan/OneLLM\n","authors":["Jiaming Han","Kaixiong Gong","Yiyuan Zhang","Jiaqi Wang","Kaipeng Zhang","Dahua Lin","Yu Qiao","Peng Gao","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2312.03700v1.pdf","comment":"Code: https://github.com/csuhan/OneLLM"},{"id":"http://arxiv.org/abs/2305.11042v2","updated":"2023-12-06T18:59:16Z","published":"2023-05-18T15:36:20Z","title":"A unified framework for information-theoretic generalization bounds","summary":" This paper presents a general methodology for deriving information-theoretic\ngeneralization bounds for learning algorithms. The main technical tool is a\nprobabilistic decorrelation lemma based on a change of measure and a relaxation\nof Young's inequality in $L_{\\psi_p}$ Orlicz spaces. Using the decorrelation\nlemma in combination with other techniques, such as symmetrization, couplings,\nand chaining in the space of probability measures, we obtain new upper bounds\non the generalization error, both in expectation and in high probability, and\nrecover as special cases many of the existing generalization bounds, including\nthe ones based on mutual information, conditional mutual information,\nstochastic chaining, and PAC-Bayes inequalities. In addition, the\nFernique-Talagrand upper bound on the expected supremum of a subgaussian\nprocess emerges as a special case.\n","authors":["Yifeng Chu","Maxim Raginsky"],"pdf_url":"https://arxiv.org/pdf/2305.11042v2.pdf","comment":"19 pages; final version accepted to Neural Information Processing\n Systems"},{"id":"http://arxiv.org/abs/2302.02560v3","updated":"2023-12-06T18:55:43Z","published":"2023-02-06T04:35:08Z","title":"Causal Estimation of Exposure Shifts with Neural Networks: Evaluating\n the Health Benefits of Stricter Air Quality Standards in the US","summary":" In policy research, one of the most critical analytic tasks is to estimate\nthe causal effect of a policy-relevant shift to the distribution of a\ncontinuous exposure/treatment on an outcome of interest. We call this problem\nshift-response function (SRF) estimation. Existing neural network methods\ninvolving robust causal-effect estimators lack theoretical guarantees and\npractical implementations for SRF estimation. Motivated by a key\npolicy-relevant question in public health, we develop a neural network method\nand its theoretical underpinnings to estimate SRFs with robustness and\nefficiency guarantees. We then apply our method to data consisting of 68\nmillion individuals and 27 million deaths across the U.S. to estimate the\ncausal effect from revising the US National Ambient Air Quality Standards\n(NAAQS) for PM 2.5 from 12 $\\mu g/m^3$ to 9 $\\mu g/m^3$. This change has been\nrecently proposed by the US Environmental Protection Agency (EPA). Our goal is\nto estimate, for the first time, the reduction in deaths that would result from\nthis anticipated revision using causal methods for SRFs. Our proposed method,\ncalled {T}argeted {R}egularization for {E}xposure {S}hifts with Neural\n{Net}works (TRESNET), contributes to the neural network literature for causal\ninference in two ways: first, it proposes a targeted regularization loss with\ntheoretical properties that ensure double robustness and achieves asymptotic\nefficiency specific for SRF estimation; second, it enables loss functions from\nthe exponential family of distributions to accommodate non-continuous outcome\ndistributions (such as hospitalization or mortality counts). 
We complement our\napplication with benchmark experiments that demonstrate TRESNET's broad\napplicability and competitiveness.\n","authors":["Mauricio Tec","Oladimeji Mudele","Kevin Josey","Francesca Dominici"],"pdf_url":"https://arxiv.org/pdf/2302.02560v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03692v1","updated":"2023-12-06T18:54:44Z","published":"2023-12-06T18:54:44Z","title":"Memory Triggers: Unveiling Memorization in Text-To-Image Generative\n Models through Word-Level Duplication","summary":" Diffusion-based models, such as the Stable Diffusion model, have\nrevolutionized text-to-image synthesis with their ability to produce\nhigh-quality, high-resolution images. These advancements have prompted\nsignificant progress in image generation and editing tasks. However, these\nmodels also raise concerns due to their tendency to memorize and potentially\nreplicate exact training samples, posing privacy risks and enabling adversarial\nattacks. Duplication in training datasets is recognized as a major factor\ncontributing to memorization, and various forms of memorization have been\nstudied so far. This paper focuses on two distinct and underexplored types of\nduplication that lead to replication during inference in diffusion-based\nmodels, particularly in the Stable Diffusion model. We delve into these\nlesser-studied duplication phenomena and their implications through two case\nstudies, aiming to contribute to the safer and more responsible use of\ngenerative models in various applications.\n","authors":["Ali Naseh","Jaechul Roh","Amir Houmansadr"],"pdf_url":"https://arxiv.org/pdf/2312.03692v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03691v1","updated":"2023-12-06T18:54:27Z","published":"2023-12-06T18:54:27Z","title":"On the Role of Edge Dependency in Graph Generative Models","summary":" In this work, we introduce a novel evaluation framework for generative models\nof graphs, emphasizing the importance of model-generated graph overlap\n(Chanpuriya et al., 2021) to ensure both accuracy and edge-diversity. We\ndelineate a hierarchy of graph generative models categorized into three levels\nof complexity: edge independent, node independent, and fully dependent models.\nThis hierarchy encapsulates a wide range of prevalent methods. We derive\ntheoretical bounds on the number of triangles and other short-length cycles\nproducible by each level of the hierarchy, contingent on the model overlap. We\nprovide instances demonstrating the asymptotic optimality of our bounds.\nFurthermore, we introduce new generative models for each of the three\nhierarchical levels, leveraging dense subgraph discovery (Gionis & Tsourakakis,\n2015). Our evaluation, conducted on real-world datasets, focuses on assessing\nthe output quality and overlap of our proposed models in comparison to other\npopular models. Our results indicate that our simple, interpretable models\nprovide competitive baselines to popular generative models. 
Through this\ninvestigation, we aim to propel the advancement of graph generative models by\noffering a structured framework and robust evaluation metrics, thereby\nfacilitating the development of models capable of generating accurate and\nedge-diverse graphs.\n","authors":["Sudhanshu Chanpuriya","Cameron Musco","Konstantinos Sotiropoulos","Charalampos Tsourakakis"],"pdf_url":"https://arxiv.org/pdf/2312.03691v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03690v1","updated":"2023-12-06T18:53:45Z","published":"2023-12-06T18:53:45Z","title":"Inverse Design of Vitrimeric Polymers by Molecular Dynamics and\n Generative Modeling","summary":" Vitrimer is a new class of sustainable polymers with the ability of\nself-healing through rearrangement of dynamic covalent adaptive networks.\nHowever, a limited choice of constituent molecules restricts their property\nspace, prohibiting full realization of their potential applications. Through a\ncombination of molecular dynamics (MD) simulations and machine learning (ML),\nparticularly a novel graph variational autoencoder (VAE) model, we establish a\nmethod for generating novel vitrimers and guide their inverse design based on\ndesired glass transition temperature (Tg). We build the first vitrimer dataset\nof one million and calculate Tg on 8,424 of them by high-throughput MD\nsimulations calibrated by a Gaussian process model. The proposed VAE employs\ndual graph encoders and a latent dimension overlapping scheme which allows for\nindividual representation of multi-component vitrimers. By constructing a\ncontinuous latent space containing necessary information of vitrimers, we\ndemonstrate high accuracy and efficiency of our framework in discovering novel\nvitrimers with desirable Tg beyond the training regime. The proposed vitrimers\nwith reasonable synthesizability cover a wide range of Tg and broaden the\npotential widespread usage of vitrimeric materials.\n","authors":["Yiwen Zheng","Prakash Thakolkaran","Jake A. Smith","Ziheng Lu","Shuxin Zheng","Bichlien H. Nguyen","Siddhant Kumar","Aniruddh Vashisth"],"pdf_url":"https://arxiv.org/pdf/2312.03690v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03682v1","updated":"2023-12-06T18:47:28Z","published":"2023-12-06T18:47:28Z","title":"What Planning Problems Can A Relational Neural Network Solve?","summary":" Goal-conditioned policies are generally understood to be \"feed-forward\"\ncircuits, in the form of neural networks that map from the current state and\nthe goal specification to the next action to take. However, under what\ncircumstances such a policy can be learned and how efficient the policy will be\nare not well understood. In this paper, we present a circuit complexity\nanalysis for relational neural networks (such as graph neural networks and\ntransformers) representing policies for planning problems, by drawing\nconnections with serialized goal regression search (S-GRS). We show that there\nare three general classes of planning problems, in terms of the growth of\ncircuit width and depth as a function of the number of objects and planning\nhorizon, providing constructive proofs. We also illustrate the utility of this\nanalysis for designing neural networks for policy learning.\n","authors":["Jiayuan Mao","Tomás Lozano-Pérez","Joshua B. Tenenbaum","Leslie Pack Kaelbling"],"pdf_url":"https://arxiv.org/pdf/2312.03682v1.pdf","comment":"NeurIPS 2023 (Spotlight). 
Project page:\n https://concepts-ai.com/p/goal-regression-width/"},{"id":"http://arxiv.org/abs/2312.03675v1","updated":"2023-12-06T18:39:29Z","published":"2023-12-06T18:39:29Z","title":"GeoShapley: A Game Theory Approach to Measuring Spatial Effects in\n Machine Learning Models","summary":" This paper introduces GeoShapley, a game theory approach to measuring spatial\neffects in machine learning models. GeoShapley extends the Nobel Prize-winning\nShapley value framework in game theory by conceptualizing location as a player\nin a model prediction game, which enables the quantification of the importance\nof location and the synergies between location and other features in a model.\nGeoShapley is a model-agnostic approach and can be applied to statistical or\nblack-box machine learning models in various structures. The interpretation of\nGeoShapley is directly linked with spatially varying coefficient models for\nexplaining spatial effects and additive models for explaining non-spatial\neffects. Using simulated data, GeoShapley values are validated against known\ndata-generating processes and are used for cross-comparison of seven\nstatistical and machine learning models. An empirical example of house price\nmodeling is used to illustrate GeoShapley's utility and interpretation with\nreal world data. The method is available as an open-source Python package named\ngeoshapley.\n","authors":["Ziqi Li"],"pdf_url":"https://arxiv.org/pdf/2312.03675v1.pdf","comment":"30 pages, 10 figures, 6 tables"},{"id":"http://arxiv.org/abs/2312.03673v1","updated":"2023-12-06T18:38:05Z","published":"2023-12-06T18:38:05Z","title":"On the Role of the Action Space in Robot Manipulation Learning and\n Sim-to-Real Transfer","summary":" We study the choice of action space in robot manipulation learning and\nsim-to-real transfer. We define metrics that assess the performance, and\nexamine the emerging properties in the different action spaces. We train over\n250 reinforcement learning~(RL) agents in simulated reaching and pushing tasks,\nusing 13 different control spaces. The choice of action spaces spans popular\nchoices in the literature as well as novel combinations of common design\ncharacteristics. We evaluate the training performance in simulation and the\ntransfer to a real-world environment. We identify good and bad characteristics\nof robotic action spaces and make recommendations for future designs. Our\nfindings have important implications for the design of RL algorithms for robot\nmanipulation tasks, and highlight the need for careful consideration of action\nspaces when training and transferring RL agents for real-world robotics.\n","authors":["Elie Aljalbout","Felix Frank","Maximilian Karl","Patrick van der Smagt"],"pdf_url":"https://arxiv.org/pdf/2312.03673v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03671v1","updated":"2023-12-06T18:36:03Z","published":"2023-12-06T18:36:03Z","title":"Direct Exoplanet Detection Using Deep Convolutional Image Reconstruction\n (ConStruct): A New Algorithm for Post-Processing High-Contrast Images","summary":" We present a novel machine-learning approach for detecting faint point\nsources in high-contrast adaptive optics imaging datasets. The most widely used\nalgorithms for primary subtraction aim to decouple bright stellar speckle noise\nfrom planetary signatures by subtracting an approximation of the temporally\nevolving stellar noise from each frame in an imaging sequence. 
Our approach\naims to improve the stellar noise approximation and increase the planet\ndetection sensitivity by leveraging deep learning in a novel direct imaging\npost-processing algorithm. We show that a convolutional autoencoder neural\nnetwork, trained on an extensive reference library of real imaging sequences,\naccurately reconstructs the stellar speckle noise at the location of a\npotential planet signal. This tool is used in a post-processing algorithm we\ncall Direct Exoplanet Detection with Convolutional Image Reconstruction, or\nConStruct. The reliability and sensitivity of ConStruct are assessed using real\nKeck/NIRC2 angular differential imaging datasets. Of the 30 unique point\nsources we examine, ConStruct yields a higher S/N than traditional PCA-based\nprocessing for 67$\\%$ of the cases and improves the relative contrast by up to\na factor of 2.6. This work demonstrates the value and potential of deep\nlearning to take advantage of a diverse reference library of point spread\nfunction realizations to improve direct imaging post-processing. ConStruct and\nits future improvements may be particularly useful as tools for post-processing\nhigh-contrast images from the James Webb Space Telescope and extreme adaptive\noptics instruments, both for the current generation and those being designed\nfor the upcoming 30 meter-class telescopes.\n","authors":["Trevor N. Wolf","Brandon A. Jones","Brendan P. Bowler"],"pdf_url":"https://arxiv.org/pdf/2312.03671v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03668v1","updated":"2023-12-06T18:34:42Z","published":"2023-12-06T18:34:42Z","title":"An Integration of Pre-Trained Speech and Language Models for End-to-End\n Speech Recognition","summary":" Advances in machine learning have made it possible to perform various text\nand speech processing tasks, including automatic speech recognition (ASR), in\nan end-to-end (E2E) manner. Since typical E2E approaches require large amounts\nof training data and resources, leveraging pre-trained foundation models\ninstead of training from scratch is gaining attention. Although there have been\nattempts to use pre-trained speech and language models in ASR, most of them are\nlimited to using either. This paper explores the potential of integrating a\npre-trained speech representation model with a large language model (LLM) for\nE2E ASR. The proposed model enables E2E ASR by generating text tokens in an\nautoregressive manner via speech representations as speech prompts, taking\nadvantage of the vast knowledge provided by the LLM. Furthermore, the proposed\nmodel can incorporate remarkable developments for LLM utilization, such as\ninference optimization and parameter-efficient domain adaptation. Experimental\nresults show that the proposed model achieves performance comparable to modern\nE2E ASR models.\n","authors":["Yukiya Hono","Koh Mitsuda","Tianyu Zhao","Kentaro Mitsui","Toshiaki Wakatsuki","Kei Sawada"],"pdf_url":"https://arxiv.org/pdf/2312.03668v1.pdf","comment":"6 pages, 2 figures, 3 tables, The model is available at\n https://huggingface.co/rinna/nue-asr"},{"id":"http://arxiv.org/abs/2312.03666v1","updated":"2023-12-06T18:34:01Z","published":"2023-12-06T18:34:01Z","title":"Towards small and accurate convolutional neural networks for acoustic\n biodiversity monitoring","summary":" Automated classification of animal sounds is a prerequisite for large-scale\nmonitoring of biodiversity. 
Convolutional Neural Networks (CNNs) are among the\nmost promising algorithms but they are slow, often achieve poor classification\nin the field and typically require large training data sets. Our objective was\nto design CNNs that are fast at inference time and achieve good classification\nperformance while learning from moderate-sized data. Recordings from a\nrainforest ecosystem were used. Start and end points of sounds from 20 bird\nspecies were manually annotated. Spectrograms from 10-second segments were used\nas CNN input. We designed simple CNNs with a frequency unwrapping layer\n(SIMP-FU models) such that any output unit was connected to all spectrogram\nfrequencies but only to a sub-region of time, the Receptive Field (RF). Our\nmodels allowed experimentation with different RF durations. Models either used\nthe time-indexed labels that encode the start and end points of sounds or simpler\nsegment-level labels. Models learning from time-indexed labels performed\nconsiderably better than their segment-level counterparts. The best classification\nperformance was achieved for models with an intermediate RF duration of 1.5\nseconds. The best SIMP-FU models achieved AUCs over 0.95 in 18 of 20 classes on\nthe test set. On compact low-cost hardware, the best SIMP-FU models evaluated up\nto seven times faster than real-time data acquisition. RF duration was a major\ndriver of classification performance. The optimum of 1.5 s was in the same\nrange as the duration of the sounds. Our models achieved good classification\nperformance while learning from moderate-sized training data. This is explained\nby the usage of time-indexed labels during training and an adequately sized RF.\nResults confirm the feasibility of deploying small CNNs with good\nclassification performance on compact low-cost devices.\n","authors":["Serge Zaugg","Mike van der Schaar","Florence Erbs","Antonio Sanchez","Joan V. Castell","Emiliano Ramallo","Michel André"],"pdf_url":"https://arxiv.org/pdf/2312.03666v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03656v1","updated":"2023-12-06T18:25:53Z","published":"2023-12-06T18:25:53Z","title":"Interpretability Illusions in the Generalization of Simplified Models","summary":" A common method to study deep learning systems is to use simplified model\nrepresentations -- for example, using singular value decomposition to visualize\nthe model's hidden states in a lower dimensional space. This approach assumes\nthat the results of these simplified representations are faithful to the original model. Here,\nwe illustrate an important caveat to this assumption: even if the simplified\nrepresentations can accurately approximate the full model on the training set,\nthey may fail to accurately capture the model's behavior out of distribution --\nthe understanding developed from simplified representations may be an illusion.\nWe illustrate this by training Transformer models on controlled datasets with\nsystematic generalization splits. First, we train models on the Dyck\nbalanced-parenthesis languages. We simplify these models using tools like\ndimensionality reduction and clustering, and then explicitly test how these\nsimplified proxies match the behavior of the original model on various\nout-of-distribution test sets. We find that the simplified proxies are\ngenerally less faithful out of distribution. In cases where the original model\ngeneralizes to novel structures or deeper depths, the simplified versions may\nfail, or generalize better. 
This finding holds even if the simplified\nrepresentations do not directly depend on the training distribution. Next, we\nstudy a more naturalistic task: predicting the next character in a dataset of\ncomputer code. We find similar generalization gaps between the original model\nand simplified proxies, and conduct further analysis to investigate which\naspects of the code completion task are associated with the largest gaps.\nTogether, our results raise questions about the extent to which mechanistic\ninterpretations derived using tools like SVD can reliably predict what a model\nwill do in novel situations.\n","authors":["Dan Friedman","Andrew Lampinen","Lucas Dixon","Danqi Chen","Asma Ghandeharioun"],"pdf_url":"https://arxiv.org/pdf/2312.03656v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03654v1","updated":"2023-12-06T18:20:46Z","published":"2023-12-06T18:20:46Z","title":"Efficient Inverse Design Optimization through Multi-fidelity\n Simulations, Machine Learning, and Search Space Reduction Strategies","summary":" This paper introduces a methodology designed to augment the inverse design\noptimization process in scenarios constrained by limited compute, through the\nstrategic synergy of multi-fidelity evaluations, machine learning models, and\noptimization algorithms. The proposed methodology is analyzed on two distinct\nengineering inverse design problems: airfoil inverse design and the scalar\nfield reconstruction problem. It leverages a machine learning model trained\nwith low-fidelity simulation data, in each optimization cycle, thereby\nproficiently predicting a target variable and discerning whether a\nhigh-fidelity simulation is necessitated, which notably conserves computational\nresources. Additionally, the machine learning model is strategically deployed\nprior to optimization to reduce the search space, thereby further accelerating\nconvergence toward the optimal solution. The methodology has been employed to\nenhance two optimization algorithms, namely Differential Evolution and Particle\nSwarm Optimization. Comparative analyses illustrate performance improvements\nacross both algorithms. Notably, this method is adeptly adaptable across any\ninverse design application, facilitating a harmonious synergy between a\nrepresentative low-fidelity machine learning model and high-fidelity\nsimulation, and can be seamlessly applied across any variety of\npopulation-based optimization algorithms.\n","authors":["Luka Grbcic","Juliane Müller","Wibe Albert de Jong"],"pdf_url":"https://arxiv.org/pdf/2312.03654v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03651v1","updated":"2023-12-06T18:13:21Z","published":"2023-12-06T18:13:21Z","title":"MIRACLE: Inverse Reinforcement and Curriculum Learning Model for\n Human-inspired Mobile Robot Navigation","summary":" In emergency scenarios, mobile robots must navigate like humans, interpreting\nstimuli to locate potential victims rapidly without interfering with first\nresponders. Existing socially-aware navigation algorithms face computational\nand adaptability challenges. To overcome these, we propose a solution, MIRACLE\n-- an inverse reinforcement and curriculum learning model, that employs\ngamified learning to gather stimuli-driven human navigational data. This data\nis then used to train a Deep Inverse Maximum Entropy Reinforcement Learning\nmodel, reducing reliance on demonstrator abilities. Testing reveals a low loss\nof 2.7717 within a 400-sized environment, signifying human-like response\nreplication. 
Current databases lack comprehensive stimuli-driven data,\nnecessitating our approach. By doing so, we enable robots to navigate emergency\nsituations with human-like perception, enhancing their life-saving\ncapabilities.\n","authors":["Nihal Gunukula","Kshitij Tiwari","Aniket Bera"],"pdf_url":"https://arxiv.org/pdf/2312.03651v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2209.12835v3","updated":"2023-12-06T18:09:02Z","published":"2022-09-26T16:41:16Z","title":"Targeted Separation and Convergence with Kernel Discrepancies","summary":" Maximum mean discrepancies (MMDs) like the kernel Stein discrepancy (KSD)\nhave grown central to a wide range of applications, including hypothesis\ntesting, sampler selection, distribution approximation, and variational\ninference. In each setting, these kernel-based discrepancy measures are\nrequired to (i) separate a target P from other probability measures or even\n(ii) control weak convergence to P. In this article we derive new sufficient\nand necessary conditions to ensure (i) and (ii). For MMDs on separable metric\nspaces, we characterize those kernels that separate Bochner embeddable measures\nand introduce simple conditions for separating all measures with unbounded\nkernels and for controlling convergence with bounded kernels. We use these\nresults on $\\mathbb{R}^d$ to substantially broaden the known conditions for KSD\nseparation and convergence control and to develop the first KSDs known to\nexactly metrize weak convergence to P. Along the way, we highlight the\nimplications of our results for hypothesis testing, measuring and improving\nsample quality, and sampling with Stein variational gradient descent.\n","authors":["Alessandro Barp","Carl-Johann Simon-Gabriel","Mark Girolami","Lester Mackey"],"pdf_url":"https://arxiv.org/pdf/2209.12835v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.16380v3","updated":"2023-12-06T18:06:10Z","published":"2022-10-28T19:39:14Z","title":"Incorporating Crowdsourced Annotator Distributions into Ensemble\n Modeling to Improve Classification Trustworthiness for Ancient Greek Papyri","summary":" Performing classification on noisy, crowdsourced image datasets can prove\nchallenging even for the best neural networks. Two issues which complicate the\nproblem on such datasets are class imbalance and ground-truth uncertainty in\nlabeling. The AL-ALL and AL-PUB datasets - consisting of tightly cropped,\nindividual characters from images of ancient Greek papyri - are strongly\naffected by both issues. The application of ensemble modeling to such datasets\ncan help identify images where the ground-truth is questionable and quantify\nthe trustworthiness of those samples. As such, we apply stacked generalization\nconsisting of nearly identical ResNets with different loss functions: one\nutilizing sparse cross-entropy (CXE) and the other Kullback-Leibler Divergence\n(KLD). Both networks use labels drawn from a crowd-sourced consensus. This\nconsensus is derived from a Normalized Distribution of Annotations (NDA) based\non all annotations for a given character in the dataset. For the second\nnetwork, the KLD is calculated with respect to the NDA. For our ensemble model,\nwe apply a k-nearest neighbors model to the outputs of the CXE and KLD\nnetworks. Individually, the ResNet models have approximately 93% accuracy,\nwhile the ensemble model achieves an accuracy of > 95%, increasing the\nclassification trustworthiness. 
We also perform an analysis of the Shannon\nentropy of the various models' output distributions to measure classification\nuncertainty. Our results suggest that entropy is useful for predicting model\nmisclassifications.\n","authors":["Graham West","Matthew I. Swindall","Ben Keener","Timothy Player","Alex C. Williams","James H. Brusuelas","John F. Wallin"],"pdf_url":"https://arxiv.org/pdf/2210.16380v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03647v1","updated":"2023-12-06T18:05:41Z","published":"2023-12-06T18:05:41Z","title":"Editable Stain Transformation Of Histological Images Using Unpaired GANs","summary":" Double staining in histopathology, particularly for metaplastic breast\ncancer, typically employs H&E and P63 dyes. However, P63's tissue damage and\nhigh cost necessitate alternative methods. This study introduces xAI-CycleGAN,\nan advanced architecture combining Mask CycleGAN with explainability features\nand structure-preserving capabilities for transforming H&E stained breast\ntissue images into P63-like images. The architecture allows for output editing,\nenhancing resemblance to actual images and enabling further model refinement.\nWe showcase xAI-CycleGAN's efficacy in maintaining structural integrity and\ngenerating high-quality images. Additionally, a histopathologist survey\nindicates the generated images' realism is often comparable to actual images,\nvalidating our model's high-quality output.\n","authors":["Tibor Sloboda","Lukáš Hudec","Wanda Benešová"],"pdf_url":"https://arxiv.org/pdf/2312.03647v1.pdf","comment":"13 pages, 5 figures, AIRCAD ICIAP 2023 in Udine"},{"id":"http://arxiv.org/abs/2312.03644v1","updated":"2023-12-06T17:59:34Z","published":"2023-12-06T17:59:34Z","title":"MACCA: Offline Multi-agent Reinforcement Learning with Causal Credit\n Assignment","summary":" Offline Multi-agent Reinforcement Learning (MARL) is valuable in scenarios\nwhere online interaction is impractical or risky. While independent learning in\nMARL offers flexibility and scalability, accurately assigning credit to\nindividual agents in offline settings poses challenges due to partial\nobservability and emergent behavior. Directly transferring the online credit\nassignment method to offline settings results in suboptimal outcomes due to the\nabsence of real-time feedback and intricate agent interactions. Our approach,\nMACCA, characterizing the generative process as a Dynamic Bayesian Network,\ncaptures relationships between environmental variables, states, actions, and\nrewards. Estimating this model on offline data, MACCA can learn each agent's\ncontribution by analyzing the causal relationship of their individual rewards,\nensuring accurate and interpretable credit assignment. Additionally, the\nmodularity of our approach allows it to seamlessly integrate with various\noffline MARL methods. Theoretically, we proved that under the setting of the\noffline dataset, the underlying causal structure and the function for\ngenerating the individual rewards of agents are identifiable, which laid the\nfoundation for the correctness of our modeling. Experimentally, we tested MACCA\nin two environments, including discrete and continuous action settings. 
The\nresults show that MACCA outperforms SOTA methods and improves performance upon\ntheir backbones.\n","authors":["Ziyan Wang","Yali Du","Yudi Zhang","Meng Fang","Biwei Huang"],"pdf_url":"https://arxiv.org/pdf/2312.03644v1.pdf","comment":"16 pages, 4 figures"},{"id":"http://arxiv.org/abs/2312.03642v1","updated":"2023-12-06T17:53:06Z","published":"2023-12-06T17:53:06Z","title":"Transformer-Powered Surrogates Close the ICF Simulation-Experiment Gap\n with Extremely Limited Data","summary":" Recent advances in machine learning, specifically transformer architecture,\nhave led to significant advancements in commercial domains. These powerful\nmodels have demonstrated superior capability to learn complex relationships and\noften generalize better to new data and problems. This paper presents a novel\ntransformer-powered approach for enhancing prediction accuracy in multi-modal\noutput scenarios, where sparse experimental data is supplemented with\nsimulation data. The proposed approach integrates transformer-based\narchitecture with a novel graph-based hyper-parameter optimization technique.\nThe resulting system not only effectively reduces simulation bias, but also\nachieves superior prediction accuracy compared to the prior method. We\ndemonstrate the efficacy of our approach on inertial confinement fusion\nexperiments, where only 10 shots of real-world data are available, as well as\nsynthetic versions of these experiments.\n","authors":["Matthew L. Olson","Shusen Liu","Jayaraman J. Thiagarajan","Bogdan Kustowski","Weng-Keen Wong","Rushil Anirudh"],"pdf_url":"https://arxiv.org/pdf/2312.03642v1.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2305.14164v2","updated":"2023-12-06T17:50:50Z","published":"2023-05-23T15:29:09Z","title":"Improved Convergence of Score-Based Diffusion Models via\n Prediction-Correction","summary":" Score-based generative models (SGMs) are powerful tools to sample from\ncomplex data distributions. Their underlying idea is to (i) run a forward\nprocess for time $T_1$ by adding noise to the data, (ii) estimate its score\nfunction, and (iii) use such estimate to run a reverse process. As the reverse\nprocess is initialized with the stationary distribution of the forward one, the\nexisting analysis paradigm requires $T_1\\to\\infty$. This is however\nproblematic: from a theoretical viewpoint, for a given precision of the score\napproximation, the convergence guarantee fails as $T_1$ diverges; from a\npractical viewpoint, a large $T_1$ increases computational costs and leads to\nerror propagation. This paper addresses the issue by considering a version of\nthe popular predictor-corrector scheme: after running the forward process, we\nfirst estimate the final distribution via an inexact Langevin dynamics and then\nrevert the process. Our key technical contribution is to provide convergence\nguarantees which require to run the forward process only for a fixed finite\ntime $T_1$. 
Our bounds exhibit a mild logarithmic dependence on the input\ndimension and the subgaussian norm of the target distribution, have minimal\nassumptions on the data, and require only to control the $L^2$ loss on the\nscore approximation, which is the quantity minimized in practice.\n","authors":["Francesco Pedrotti","Jan Maas","Marco Mondelli"],"pdf_url":"https://arxiv.org/pdf/2305.14164v2.pdf","comment":"33 pages; included discretization result; typos corrected"},{"id":"http://arxiv.org/abs/2312.03641v1","updated":"2023-12-06T17:49:57Z","published":"2023-12-06T17:49:57Z","title":"MotionCtrl: A Unified and Flexible Motion Controller for Video\n Generation","summary":" Motions in a video primarily consist of camera motion, induced by camera\nmovement, and object motion, resulting from object movement. Accurate control\nof both camera and object motion is essential for video generation. However,\nexisting works either mainly focus on one type of motion or do not clearly\ndistinguish between the two, limiting their control capabilities and diversity.\nTherefore, this paper presents MotionCtrl, a unified and flexible motion\ncontroller for video generation designed to effectively and independently\ncontrol camera and object motion. The architecture and training strategy of\nMotionCtrl are carefully devised, taking into account the inherent properties\nof camera motion, object motion, and imperfect training data. Compared to\nprevious methods, MotionCtrl offers three main advantages: 1) It effectively\nand independently controls camera motion and object motion, enabling more\nfine-grained motion control and facilitating flexible and diverse combinations\nof both types of motion. 2) Its motion conditions are determined by camera\nposes and trajectories, which are appearance-free and minimally impact the\nappearance or shape of objects in generated videos. 3) It is a relatively\ngeneralizable model that can adapt to a wide array of camera poses and\ntrajectories once trained. Extensive qualitative and quantitative experiments\nhave been conducted to demonstrate the superiority of MotionCtrl over existing\nmethods.\n","authors":["Zhouxia Wang","Ziyang Yuan","Xintao Wang","Tianshui Chen","Menghan Xia","Ping Luo","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2312.03641v1.pdf","comment":"Project Page: https://wzhouxiff.github.io/projects/MotionCtrl/"},{"id":"http://arxiv.org/abs/2312.03633v1","updated":"2023-12-06T17:29:45Z","published":"2023-12-06T17:29:45Z","title":"Not All Large Language Models (LLMs) Succumb to the \"Reversal Curse\": A\n Comparative Study of Deductive Logical Reasoning in BERT and GPT Models","summary":" The \"Reversal Curse\" refers to the scenario where auto-regressive decoder\nlarge language models (LLMs), such as ChatGPT, trained on \"A is B\" fail to\nlearn \"B is A\", demonstrating a basic failure of logical deduction. This raises\na red flag in the use of GPT models for certain general tasks such as\nconstructing knowledge graphs, considering their adherence to this symmetric\nprinciple. In our study, we examined a bidirectional LLM, BERT, and found that\nit is immune to the reversal curse. Driven by ongoing efforts to construct\nbiomedical knowledge graphs with LLMs, we also embarked on evaluating more\ncomplex but essential deductive reasoning capabilities. 
This process included\nfirst training encoder and decoder language models to master the intersection\n($\\cap$) and union ($\\cup$) operations on two sets and then moving on to assess\ntheir capability to infer different combinations of union ($\\cup$) and\nintersection ($\\cap$) operations on three newly created sets. The findings\nshowed that while both encoder and decoder language models, trained for tasks\ninvolving two sets (union/intersection), were proficient in such scenarios,\nthey encountered difficulties when dealing with operations that included three\nsets (various combinations of union and intersection). Our research highlights\nthe distinct characteristics of encoder and decoder models in simple and\ncomplex logical reasoning. In practice, the choice between BERT and GPT should\nbe guided by the specific requirements and nature of the task at hand,\nleveraging their respective strengths in bidirectional context comprehension\nand sequence prediction.\n","authors":["Jingye Yang","Da Wu","Kai Wang"],"pdf_url":"https://arxiv.org/pdf/2312.03633v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03632v1","updated":"2023-12-06T17:29:03Z","published":"2023-12-06T17:29:03Z","title":"Multimodal Data and Resource Efficient Device-Directed Speech Detection\n with Large Foundation Models","summary":" Interactions with virtual assistants typically start with a trigger phrase\nfollowed by a command. In this work, we explore the possibility of making these\ninteractions more natural by eliminating the need for a trigger phrase. Our\ngoal is to determine whether a user addressed the virtual assistant based on\nsignals obtained from the streaming audio recorded by the device microphone. We\naddress this task by combining 1-best hypotheses and decoder signals from an\nautomatic speech recognition system with acoustic representations from an audio\nencoder as input features to a large language model (LLM). In particular, we\nare interested in data and resource efficient systems that require only a small\namount of training data and can operate in scenarios with only a single frozen\nLLM available on a device. For this reason, our model is trained on 80k or less\nexamples of multimodal data using a combination of low-rank adaptation and\nprefix tuning. We compare the proposed system to unimodal baselines and show\nthat the multimodal approach achieves lower equal-error-rates (EERs), while\nusing only a fraction of the training data. We also show that low-dimensional\nspecialized audio representations lead to lower EERs than high-dimensional\ngeneral audio representations.\n","authors":["Dominik Wagner","Alexander Churchill","Siddharth Sigtia","Panayiotis Georgiou","Matt Mirsamadi","Aarshee Mishra","Erik Marchi"],"pdf_url":"https://arxiv.org/pdf/2312.03632v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04064v2","updated":"2023-12-06T17:17:00Z","published":"2023-11-07T15:25:52Z","title":"KPI Extraction from Maintenance Work Orders -- A Comparison of Expert\n Labeling, Text Classification and AI-Assisted Tagging for Computing Failure\n Rates of Wind Turbines","summary":" Maintenance work orders are commonly used to document information about wind\nturbine operation and maintenance. This includes details about proactive and\nreactive wind turbine downtimes, such as preventative and corrective\nmaintenance. 
However, the information contained in maintenance work orders is\noften unstructured and difficult to analyze, presenting challenges for\ndecision-makers wishing to use it for optimizing operation and maintenance. To\naddress this issue, this work compares three different approaches to calculate\nreliability key performance indicators from maintenance work orders. The first\napproach involves manual labeling of the maintenance work orders by domain\nexperts, using the schema defined in an industrial guideline to assign the\nlabel accordingly. The second approach involves the development of a model that\nautomatically labels the maintenance work orders using text classification\nmethods. Through this method, we are able to achieve macro average and weighted\naverage F1-Scores of 0.75 and 0.85 respectively. The third technique uses an\nAI-assisted tagging tool to tag and structure the raw maintenance information,\ntogether with a novel rule-based approach for extracting relevant maintenance\nwork orders for failure rate calculation. In our experiments the AI-assisted\ntool leads to an 88% drop in tagging time in comparison to the other two\napproaches, while expert labeling and text classification are more accurate in\nKPI extraction. Overall, our findings make extracting maintenance information\nfrom maintenance work orders more efficient, enable the assessment of\nreliability key performance indicators and therefore support the optimization\nof wind turbine operation and maintenance.\n","authors":["Marc-Alexander Lutz","Bastian Schäfermeier","Rachael Sexton","Michael Sharp","Alden Dima","Stefan Faulstich","Jagan Mohini Aluri"],"pdf_url":"https://arxiv.org/pdf/2311.04064v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.18260v2","updated":"2023-12-06T17:16:07Z","published":"2023-11-30T05:38:34Z","title":"Consensus, dissensus and synergy between clinicians and specialist\n foundation models in radiology report generation","summary":" Radiology reports are an instrumental part of modern medicine, informing key\nclinical decisions such as diagnosis and treatment. The worldwide shortage of\nradiologists, however, restricts access to expert care and imposes heavy\nworkloads, contributing to avoidable errors and delays in report delivery.\nWhile recent progress in automated report generation with vision-language\nmodels offers clear potential in ameliorating the situation, the path to\nreal-world adoption has been stymied by the challenge of evaluating the\nclinical quality of AI-generated reports. In this study, we build a\nstate-of-the-art report generation system for chest radiographs,\n\\textit{Flamingo-CXR}, by fine-tuning a well-known vision-language foundation\nmodel on radiology data. To evaluate the quality of the AI-generated reports, a\ngroup of 16 certified radiologists provide detailed evaluations of AI-generated\nand human written reports for chest X-rays from an intensive care setting in\nthe United States and an inpatient setting in India. At least one radiologist\n(out of two per case) preferred the AI report to the ground truth report in\nover 60$\\%$ of cases for both datasets. Amongst the subset of AI-generated\nreports that contain errors, the most frequently cited reasons were related to\nthe location and finding, whereas for human written reports, most mistakes were\nrelated to severity and finding. 
This disparity suggested potential\ncomplementarity between our AI system and human experts, prompting us to\ndevelop an assistive scenario in which \\textit{Flamingo-CXR} generates a\nfirst-draft report, which is subsequently revised by a clinician. This is the\nfirst demonstration of clinician-AI collaboration for report writing, and the\nresultant reports are assessed to be equivalent or preferred by at least one\nradiologist to reports written by experts alone in 80$\\%$ of in-patient cases\nand 60$\\%$ of intensive care cases.\n","authors":["Ryutaro Tanno","David G. T. Barrett","Andrew Sellergren","Sumedh Ghaisas","Sumanth Dathathri","Abigail See","Johannes Welbl","Karan Singhal","Shekoofeh Azizi","Tao Tu","Mike Schaekermann","Rhys May","Roy Lee","SiWai Man","Zahra Ahmed","Sara Mahdavi","Danielle Belgrave","Vivek Natarajan","Shravya Shetty","Pushmeet Kohli","Po-Sen Huang","Alan Karthikesalingam","Ira Ktena"],"pdf_url":"https://arxiv.org/pdf/2311.18260v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03619v1","updated":"2023-12-06T17:07:42Z","published":"2023-12-06T17:07:42Z","title":"Evaluation of Active Feature Acquisition Methods for Static Feature\n Settings","summary":" Active feature acquisition (AFA) agents, crucial in domains like healthcare\nwhere acquiring features is often costly or harmful, determine the optimal set\nof features for a subsequent classification task. As deploying an AFA agent\nintroduces a shift in missingness distribution, it's vital to assess its\nexpected performance at deployment using retrospective data. In a companion\npaper, we introduce a semi-offline reinforcement learning (RL) framework for\nactive feature acquisition performance evaluation (AFAPE) where features are\nassumed to be time-dependent. Here, we study and extend the AFAPE problem to\ncover static feature settings, where features are time-invariant, and hence\nprovide more flexibility to the AFA agents in deciding the order of the\nacquisitions. In this static feature setting, we derive and adapt new inverse\nprobability weighting (IPW), direct method (DM), and double reinforcement\nlearning (DRL) estimators within the semi-offline RL framework. These\nestimators can be applied when the missingness in the retrospective dataset\nfollows a missing-at-random (MAR) pattern. They also can be applied to\nmissing-not-at-random (MNAR) patterns in conjunction with appropriate existing\nmissing data techniques. We illustrate the improved data efficiency offered by\nthe semi-offline RL estimators in synthetic and real-world data experiments\nunder synthetic MAR and MNAR missingness.\n","authors":["Henrik von Kleist","Alireza Zamanian","Ilya Shpitser","Narges Ahmidi"],"pdf_url":"https://arxiv.org/pdf/2312.03619v1.pdf","comment":"38 pages, 7 figures, 5 tables. arXiv admin note: substantial text\n overlap with arXiv:2312.01530"},{"id":"http://arxiv.org/abs/2310.05736v2","updated":"2023-12-06T17:02:25Z","published":"2023-10-09T14:10:21Z","title":"LLMLingua: Compressing Prompts for Accelerated Inference of Large\n Language Models","summary":" Large language models (LLMs) have been applied in various applications due to\ntheir astonishing capabilities. With advancements in technologies such as\nchain-of-thought (CoT) prompting and in-context learning (ICL), the prompts fed\nto LLMs are becoming increasingly lengthy, even exceeding tens of thousands of\ntokens. 
To accelerate model inference and reduce cost, this paper presents\nLLMLingua, a coarse-to-fine prompt compression method that involves a budget\ncontroller to maintain semantic integrity under high compression ratios, a\ntoken-level iterative compression algorithm to better model the interdependence\nbetween compressed contents, and an instruction tuning based method for\ndistribution alignment between language models. We conduct experiments and\nanalysis over four datasets from different scenarios, i.e., GSM8K, BBH,\nShareGPT, and Arxiv-March23; showing that the proposed approach yields\nstate-of-the-art performance and allows for up to 20x compression with little\nperformance loss. Our code is available at https://aka.ms/LLMLingua.\n","authors":["Huiqiang Jiang","Qianhui Wu","Chin-Yew Lin","Yuqing Yang","Lili Qiu"],"pdf_url":"https://arxiv.org/pdf/2310.05736v2.pdf","comment":"Accepted at EMNLP 2023"},{"id":"http://arxiv.org/abs/2312.03612v1","updated":"2023-12-06T16:56:28Z","published":"2023-12-06T16:56:28Z","title":"Physical Symbolic Optimization","summary":" We present a framework for constraining the automatic sequential generation\nof equations to obey the rules of dimensional analysis by construction.\nCombining this approach with reinforcement learning, we built $\\Phi$-SO, a\nPhysical Symbolic Optimization method for recovering analytical functions from\nphysical data leveraging units constraints. Our symbolic regression algorithm\nachieves state-of-the-art results in contexts in which variables and constants\nhave known physical units, outperforming all other methods on SRBench's Feynman\nbenchmark in the presence of noise (exceeding 0.1%) and showing resilience even\nin the presence of significant (10%) levels of noise.\n","authors":["Wassim Tenachi","Rodrigo Ibata","Foivos I. Diakogiannis"],"pdf_url":"https://arxiv.org/pdf/2312.03612v1.pdf","comment":"6 pages, 2 figures, 1 table. Accepted to NeurIPS 2023, Machine\n Learning for Physical Sciences workshop"},{"id":"http://arxiv.org/abs/2312.03611v1","updated":"2023-12-06T16:55:53Z","published":"2023-12-06T16:55:53Z","title":"DreamComposer: Controllable 3D Object Generation via Multi-View\n Conditions","summary":" Utilizing pre-trained 2D large-scale generative models, recent works are\ncapable of generating high-quality novel views from a single in-the-wild image.\nHowever, due to the lack of information from multiple views, these works\nencounter difficulties in generating controllable novel views. In this paper,\nwe present DreamComposer, a flexible and scalable framework that can enhance\nexisting view-aware diffusion models by injecting multi-view conditions.\nSpecifically, DreamComposer first uses a view-aware 3D lifting module to obtain\n3D representations of an object from multiple views. Then, it renders the\nlatent features of the target view from 3D representations with the multi-view\nfeature fusion module. Finally the target view features extracted from\nmulti-view inputs are injected into a pre-trained diffusion model. 
Experiments\nshow that DreamComposer is compatible with state-of-the-art diffusion models\nfor zero-shot novel view synthesis, further enhancing them to generate\nhigh-fidelity novel view images with multi-view conditions, ready for\ncontrollable 3D object reconstruction and various other applications.\n","authors":["Yunhan Yang","Yukun Huang","Xiaoyang Wu","Yuan-Chen Guo","Song-Hai Zhang","Hengshuang Zhao","Tong He","Xihui Liu"],"pdf_url":"https://arxiv.org/pdf/2312.03611v1.pdf","comment":"Project Page: https://yhyang-myron.github.io/DreamComposer/"},{"id":"http://arxiv.org/abs/2312.03606v1","updated":"2023-12-06T16:53:17Z","published":"2023-12-06T16:53:17Z","title":"DiffusionSat: A Generative Foundation Model for Satellite Imagery","summary":" Diffusion models have achieved state-of-the-art results on many modalities\nincluding images, speech, and video. However, existing models are not tailored\nto support remote sensing data, which is widely used in important applications\nincluding environmental monitoring and crop-yield prediction. Satellite images\nare significantly different from natural images -- they can be multi-spectral,\nirregularly sampled across time -- and existing diffusion models trained on\nimages from the Web do not support them. Furthermore, remote sensing data is\ninherently spatio-temporal, requiring conditional generation tasks not\nsupported by traditional methods based on captions or images. In this paper, we\npresent DiffusionSat, to date the largest generative foundation model trained\non a collection of publicly available large, high-resolution remote sensing\ndatasets. As text-based captions are sparsely available for satellite images,\nwe incorporate the associated metadata such as geolocation as conditioning\ninformation. Our method produces realistic samples and can be used to solve\nmultiple generative tasks including temporal generation, superresolution given\nmulti-spectral inputs and in-painting. Our method outperforms previous\nstate-of-the-art methods for satellite image generation and is the first\nlarge-scale $\\textit{generative}$ foundation model for satellite imagery.\n","authors":["Samar Khanna","Patrick Liu","Linqi Zhou","Chenlin Meng","Robin Rombach","Marshall Burke","David Lobell","Stefano Ermon"],"pdf_url":"https://arxiv.org/pdf/2312.03606v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.11891v2","updated":"2023-12-06T16:51:09Z","published":"2023-10-18T11:20:59Z","title":"A Hyperparameter Study for Quantum Kernel Methods","summary":" Quantum kernel methods are a promising method in quantum machine learning\nthanks to the guarantees connected to them. Their accessibility for analytic\nconsiderations also opens up the possibility of prescreening datasets based on\ntheir potential for a quantum advantage. To do so, earlier works developed the\ngeometric difference, which can be understood as a closeness measure between\ntwo kernel-based machine learning approaches, most importantly between a\nquantum kernel and classical kernel. This metric links the quantum and\nclassical model complexities. 
Therefore, it raises the question of whether the\ngeometric difference, based on its relation to model complexity, can be a\nuseful tool in evaluations other than for the potential for quantum advantage.\nIn this work, we investigate the effects of hyperparameter choice on the model\nperformance and the generalization gap between classical and quantum kernels.\nThe importance of hyperparameter optimization is well known also for classical\nmachine learning. Especially for the quantum Hamiltonian evolution feature map,\nthe scaling of the input data has been shown to be crucial. However, there are\nadditional parameters left to be optimized, like the best number of qubits to\ntrace out before computing a projected quantum kernel. We investigate the\ninfluence of these hyperparameters and compare the classically reliable method\nof cross validation with the method of choosing based on the geometric\ndifference. Based on the thorough investigation of the hyperparameters across\n11 datasets we identified commonalities that can be exploited when examining a\nnew dataset. In addition, our findings contribute to a better understanding of\nthe applicability of the geometric difference.\n","authors":["Sebastian Egginger","Alona Sakhnenko","Jeanette Miriam Lorenz"],"pdf_url":"https://arxiv.org/pdf/2310.11891v2.pdf","comment":"Updated experimental results, adapted text"},{"id":"http://arxiv.org/abs/2006.03487v2","updated":"2023-12-06T16:46:51Z","published":"2020-06-05T14:42:49Z","title":"Dimensionless Anomaly Detection on Multivariate Streams with Variance\n Norm and Path Signature","summary":" In this paper, we propose a dimensionless anomaly detection method for\nmultivariate streams. Our method is independent of the unit of measurement for\nthe different stream channels, therefore dimensionless. We first propose the\nvariance norm, a generalisation of Mahalanobis distance to handle\ninfinite-dimensional feature space and singular empirical covariance matrix\nrigorously. We then combine the variance norm with the path signature, an\ninfinite collection of iterated integrals that provide global features of\nstreams, to propose SigMahaKNN, a method for anomaly detection on\n(multivariate) streams. We show that SigMahaKNN is invariant to stream\nreparametrisation, stream concatenation and has a graded discrimination power\ndepending on the truncation level of the path signature. We implement\nSigMahaKNN as an open-source software, and perform extensive numerical\nexperiments, showing significantly improved anomaly detection on streams\ncompared to isolation forest and local outlier factors in applications ranging\nfrom language analysis, hand-writing analysis, ship movement paths analysis and\nunivariate time-series analysis.\n","authors":["Zhen Shao","Ryan Sze-Yin Chan","Thomas Cochrane","Peter Foster","Terry Lyons"],"pdf_url":"https://arxiv.org/pdf/2006.03487v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03596v1","updated":"2023-12-06T16:35:59Z","published":"2023-12-06T16:35:59Z","title":"MMM: Generative Masked Motion Model","summary":" Recent advances in text-to-motion generation using diffusion and\nautoregressive models have shown promising results. However, these models often\nsuffer from a trade-off between real-time performance, high fidelity, and\nmotion editability. To address this gap, we introduce MMM, a novel yet simple\nmotion generation paradigm based on Masked Motion Model. 
MMM consists of two\nkey components: (1) a motion tokenizer that transforms 3D human motion into a\nsequence of discrete tokens in latent space, and (2) a conditional masked\nmotion transformer that learns to predict randomly masked motion tokens,\nconditioned on the pre-computed text tokens. By attending to motion and text\ntokens in all directions, MMM explicitly captures inherent dependency among\nmotion tokens and semantic mapping between motion and text tokens. During\ninference, this allows parallel and iterative decoding of multiple motion\ntokens that are highly consistent with fine-grained text descriptions,\ntherefore simultaneously achieving high-fidelity and high-speed motion\ngeneration. In addition, MMM has innate motion editability. By simply placing\nmask tokens in the place that needs editing, MMM automatically fills the gaps\nwhile guaranteeing smooth transitions between editing and non-editing parts.\nExtensive experiments on the HumanML3D and KIT-ML datasets demonstrate that MMM\nsurpasses current leading methods in generating high-quality motion (evidenced\nby superior FID scores of 0.08 and 0.429), while offering advanced editing\nfeatures such as body-part modification, motion in-betweening, and the\nsynthesis of long motion sequences. In addition, MMM is two orders of magnitude\nfaster on a single mid-range GPU than editable motion diffusion models. Our\nproject page is available at \\url{https://exitudio.github.io/MMM-page}.\n","authors":["Ekkasit Pinyoanuntapong","Pu Wang","Minwoo Lee","Chen Chen"],"pdf_url":"https://arxiv.org/pdf/2312.03596v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16173v2","updated":"2023-12-06T16:31:50Z","published":"2023-11-22T03:36:18Z","title":"Conditions for Length Generalization in Learning Reasoning Skills","summary":" Reasoning is a fundamental capability of AI agents. Recently, large language\nmodels (LLMs) have shown remarkable abilities to perform reasoning tasks.\nHowever, numerous evaluations of the reasoning capabilities of LLMs have also\nshowed some limitations. An outstanding limitation is length generalization,\nmeaning that when trained on reasoning problems of smaller lengths or sizes,\nthe resulting models struggle with problems of larger sizes or lengths. This\npotentially indicates some theoretical limitations of generalization in\nlearning reasoning skills. These evaluations and their observations motivated\nus to perform a theoretical study of the length generalization problem. This\nwork focuses on reasoning tasks that can be formulated as Markov dynamic\nprocesses (MDPs) and/or directed acyclic graphs (DAGs). It identifies and\nproves conditions that decide whether the length generalization problem can be\nsolved or not for a reasoning task in a particular representation. Experiments\nare also conducted to verify the theoretical results.\n","authors":["Changnan Xiao","Bing Liu"],"pdf_url":"https://arxiv.org/pdf/2311.16173v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.01090v3","updated":"2023-12-06T16:23:35Z","published":"2023-05-01T21:14:47Z","title":"Autoencoders for discovering manifold dimension and coordinates in data\n from complex dynamical systems","summary":" While many phenomena in physics and engineering are formally\nhigh-dimensional, their long-time dynamics often live on a lower-dimensional\nmanifold. 
The present work introduces an autoencoder framework that combines\nimplicit regularization with internal linear layers and $L_2$ regularization\n(weight decay) to automatically estimate the underlying dimensionality of a\ndata set, produce an orthogonal manifold coordinate system, and provide the\nmapping functions between the ambient space and manifold space, allowing for\nout-of-sample projections. We validate our framework's ability to estimate the\nmanifold dimension for a series of datasets from dynamical systems of varying\ncomplexities and compare to other state-of-the-art estimators. We analyze the\ntraining dynamics of the network to glean insight into the mechanism of\nlow-rank learning and find that collectively each of the implicit regularizing\nlayers compound the low-rank representation and even self-correct during\ntraining. Analysis of gradient descent dynamics for this architecture in the\nlinear case reveals the role of the internal linear layers in leading to faster\ndecay of a \"collective weight variable\" incorporating all layers, and the role\nof weight decay in breaking degeneracies and thus driving convergence along\ndirections in which no decay would occur in its absence. We show that this\nframework can be naturally extended for applications of state-space modeling\nand forecasting by generating a data-driven dynamic model of a spatiotemporally\nchaotic partial differential equation using only the manifold coordinates.\nFinally, we demonstrate that our framework is robust to hyperparameter choices.\n","authors":["Kevin Zeng","Carlos E. Pérez De Jesús","Andrew J. Fox","Michael D. Graham"],"pdf_url":"https://arxiv.org/pdf/2305.01090v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03059v4","updated":"2023-12-06T16:16:38Z","published":"2023-10-04T16:49:36Z","title":"Point-PEFT: Parameter-Efficient Fine-Tuning for 3D Pre-trained Models","summary":" The popularity of pre-trained large models has revolutionized downstream\ntasks across diverse fields, such as language, vision, and multi-modality. To\nminimize the adaption cost for downstream tasks, many Parameter-Efficient\nFine-Tuning (PEFT) techniques are proposed for language and 2D image\npre-trained models. However, the specialized PEFT method for 3D pre-trained\nmodels is still under-explored. To this end, we introduce Point-PEFT, a novel\nframework for adapting point cloud pre-trained models with minimal learnable\nparameters. Specifically, for a pre-trained 3D model, we freeze most of its\nparameters, and only tune the newly added PEFT modules on downstream tasks,\nwhich consist of a Point-prior Prompt and a Geometry-aware Adapter. The\nPoint-prior Prompt adopts a set of learnable prompt tokens, for which we\npropose to construct a memory bank with domain-specific knowledge, and utilize\na parameter-free attention to enhance the prompt tokens. The Geometry-aware\nAdapter aims to aggregate point cloud features within spatial neighborhoods to\ncapture fine-grained geometric information through local interactions.\nExtensive experiments indicate that our Point-PEFT can achieve better\nperformance than the full fine-tuning on various downstream tasks, while using\nonly 5% of the trainable parameters, demonstrating the efficiency and\neffectiveness of our approach. 
Code will be released at\nhttps://github.com/Even-JK/PEFT-3D.\n","authors":["Ivan Tang","Ray Zhang","Zoey Guo","Xianzheng Ma","Dong Wang","Zhigang Wang","Bin Zhao","Xuelong Li"],"pdf_url":"https://arxiv.org/pdf/2310.03059v4.pdf","comment":"10 pages. The specialized PEFT framework for 3D pre-trained models,\n which achieves competitive performance to full fine-tuning, and significantly\n reduces the computational resources. Project page:\n https://github.com/Even-JK/PEFT-3D"},{"id":"http://arxiv.org/abs/2312.03580v1","updated":"2023-12-06T16:16:31Z","published":"2023-12-06T16:16:31Z","title":"Invariance & Causal Representation Learning: Prospects and Limitations","summary":" In causal models, a given mechanism is assumed to be invariant to changes of\nother mechanisms. While this principle has been utilized for inference in\nsettings where the causal variables are observed, theoretical insights when the\nvariables of interest are latent are largely missing. We assay the connection\nbetween invariance and causal representation learning by establishing\nimpossibility results which show that invariance alone is insufficient to\nidentify latent causal variables. Together with practical considerations, we\nuse these theoretical findings to highlight the need for additional constraints\nin order to identify representations by exploiting invariance.\n","authors":["Simon Bing","Jonas Wahl","Urmi Ninad","Jakob Runge"],"pdf_url":"https://arxiv.org/pdf/2312.03580v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.00775v3","updated":"2023-12-06T16:08:51Z","published":"2023-11-01T18:38:31Z","title":"Harnessing machine learning for accurate treatment of overlapping\n opacity species in general circulation models","summary":" To understand high precision observations of exoplanets and brown dwarfs, we\nneed detailed and complex general circulation models (GCMs) that incorporate\nhydrodynamics, chemistry, and radiation. For this study, we specifically\nexamined the coupling between chemistry and radiation in GCMs and compared\ndifferent methods for the mixing of opacities of different chemical species in\nthe correlated-k assumption, when equilibrium chemistry cannot be assumed. We\npropose a fast machine learning method based on DeepSets (DS), which\neffectively combines individual correlated-k opacities (k-tables). We evaluated\nthe DS method alongside other published methods such as adaptive equivalent\nextinction (AEE) and random overlap with rebinning and resorting (RORR). We\nintegrated these mixing methods into our GCM (expeRT/MITgcm) and assessed their\naccuracy and performance for the example of the hot Jupiter HD~209458 b. Our\nfindings indicate that the DS method is both accurate and efficient for GCM\nusage, whereas RORR is too slow. Additionally, we observed that the accuracy of\nAEE depends on its specific implementation and may introduce numerical issues\nin achieving radiative transfer solution convergence. We then applied the DS\nmixing method in a simplified chemical disequilibrium situation, where we\nmodeled the rainout of TiO and VO, and confirmed that the rainout of TiO and VO\nwould hinder the formation of a stratosphere. To further expedite the\ndevelopment of consistent disequilibrium chemistry calculations in GCMs, we\nprovide documentation and code for coupling the DS mixing method with\ncorrelated-k radiative transfer solvers. 
The DS method has been extensively\ntested to be accurate enough for GCMs; however, other methods might be needed\nfor accelerating atmospheric retrievals.\n","authors":["Aaron David Schneider","Paul Mollière","Gilles Louppe","Ludmila Carone","Uffe Gråe Jørgensen","Leen Decin","Christiane Helling"],"pdf_url":"https://arxiv.org/pdf/2311.00775v3.pdf","comment":"Accepted for publication in A&A, language edited version"},{"id":"http://arxiv.org/abs/2312.02858v2","updated":"2023-12-06T15:52:07Z","published":"2023-12-05T16:13:34Z","title":"Towards Causal Representations of Climate Model Data","summary":" Climate models, such as Earth system models (ESMs), are crucial for\nsimulating future climate change based on projected Shared Socioeconomic\nPathways (SSP) greenhouse gas emissions scenarios. While ESMs are sophisticated\nand invaluable, machine learning-based emulators trained on existing simulation\ndata can project additional climate scenarios much faster and are\ncomputationally efficient. However, they often lack generalizability and\ninterpretability. This work delves into the potential of causal representation\nlearning, specifically the \\emph{Causal Discovery with Single-parent Decoding}\n(CDSD) method, which could render climate model emulation efficient\n\\textit{and} interpretable. We evaluate CDSD on multiple climate datasets,\nfocusing on emissions, temperature, and precipitation. Our findings shed light\non the challenges, limitations, and promise of using CDSD as a stepping stone\ntowards more interpretable and robust climate model emulation.\n","authors":["Julien Boussard","Chandni Nagda","Julia Kaltenborn","Charlotte Emilie Elektra Lange","Philippe Brouillard","Yaniv Gurwicz","Peer Nowack","David Rolnick"],"pdf_url":"https://arxiv.org/pdf/2312.02858v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03561v1","updated":"2023-12-06T15:51:49Z","published":"2023-12-06T15:51:49Z","title":"Blueprinting the Future: Automatic Item Categorization using\n Hierarchical Zero-Shot and Few-Shot Classifiers","summary":" In testing industry, precise item categorization is pivotal to align exam\nquestions with the designated content domains outlined in the assessment\nblueprint. Traditional methods either entail manual classification, which is\nlaborious and error-prone, or utilize machine learning requiring extensive\ntraining data, often leading to model underfit or overfit issues. This study\nunveils a novel approach employing the zero-shot and few-shot Generative\nPretrained Transformer (GPT) classifier for hierarchical item categorization,\nminimizing the necessity for training data, and instead, leveraging human-like\nlanguage descriptions to define categories. Through a structured python\ndictionary, the hierarchical nature of examination blueprints is navigated\nseamlessly, allowing for a tiered classification of items across multiple\nlevels. An initial simulation with artificial data demonstrates the efficacy of\nthis method, achieving an average accuracy of 92.91% measured by the F1 score.\nThis method was further applied to real exam items from the 2022 In-Training\nExamination (ITE) conducted by the American Board of Family Medicine (ABFM),\nreclassifying 200 items according to a newly formulated blueprint swiftly in 15\nminutes, a task that traditionally could span several days among editors and\nphysicians. 
This innovative approach not only drastically cuts down\nclassification time but also ensures a consistent, principle-driven\ncategorization, minimizing human biases and discrepancies. The ability to\nrefine classifications by adjusting definitions adds to its robustness and\nsustainability.\n","authors":["Ting Wang","Keith Stelter","Jenn Floyd","Thomas O'Neill","Nathaniel Hendrix","Andrew Bazemore","Kevin Rode","Warren Newton"],"pdf_url":"https://arxiv.org/pdf/2312.03561v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03556v1","updated":"2023-12-06T15:39:03Z","published":"2023-12-06T15:39:03Z","title":"Personalized Face Inpainting with Diffusion Models by Parallel Visual\n Attention","summary":" Face inpainting is important in various applications, such as photo\nrestoration, image editing, and virtual reality. Despite the significant\nadvances in face generative models, ensuring that a person's unique facial\nidentity is maintained during the inpainting process is still an elusive goal.\nCurrent state-of-the-art techniques, exemplified by MyStyle, necessitate\nresource-intensive fine-tuning and a substantial number of images for each new\nidentity. Furthermore, existing methods often fall short in accommodating\nuser-specified semantic attributes, such as beard or expression. To improve\ninpainting results, and reduce the computational complexity during inference,\nthis paper proposes the use of Parallel Visual Attention (PVA) in conjunction\nwith diffusion models. Specifically, we insert parallel attention matrices to\neach cross-attention module in the denoising network, which attends to features\nextracted from reference images by an identity encoder. We train the added\nattention modules and identity encoder on CelebAHQ-IDI, a dataset proposed for\nidentity-preserving face inpainting. Experiments demonstrate that PVA attains\nunparalleled identity resemblance in both face inpainting and face inpainting\nwith language guidance tasks, in comparison to various benchmarks, including\nMyStyle, Paint by Example, and Custom Diffusion. Our findings reveal that PVA\nensures good identity preservation while offering effective\nlanguage-controllability. Additionally, in contrast to Custom Diffusion, PVA\nrequires just 40 fine-tuning steps for each new identity, which translates to a\nsignificant speed increase of over 20 times.\n","authors":["Jianjin Xu","Saman Motamed","Praneetha Vaddamanu","Chen Henry Wu","Christian Haene","Jean-Charles Bazin","Fernando de la Torre"],"pdf_url":"https://arxiv.org/pdf/2312.03556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.09477v2","updated":"2023-12-06T15:09:17Z","published":"2023-07-13T14:50:04Z","title":"Towards Ordinal Data Science","summary":" Order is one of the main instruments to measure the relationship between\nobjects in (empirical) data. However, compared to methods that use numerical\nproperties of objects, the amount of ordinal methods developed is rather small.\nOne reason for this is the limited availability of computational resources in\nthe last century that would have been required for ordinal computations.\nAnother reason -- particularly important for this line of research -- is that\norder-based methods are often seen as too mathematically rigorous for applying\nthem to real-world data. In this paper, we will therefore discuss different\nmeans for measuring and 'calculating' with ordinal structures -- a specific\nclass of directed graphs -- and show how to infer knowledge from them. 
Our aim\nis to establish Ordinal Data Science as a fundamentally new research agenda.\nBesides cross-fertilization with other cornerstone machine learning and\nknowledge representation methods, a broad range of disciplines will benefit\nfrom this endeavor, including psychology, sociology, economics, web science,\nknowledge engineering, scientometrics.\n","authors":["Gerd Stumme","Dominik Dürrschnabel","Tom Hanika"],"pdf_url":"https://arxiv.org/pdf/2307.09477v2.pdf","comment":"40 pages, 7 figures, Transactions on Graph Data and Knowledge (TGDK)"},{"id":"http://arxiv.org/abs/2312.03526v1","updated":"2023-12-06T14:40:05Z","published":"2023-12-06T14:40:05Z","title":"On the Diversity and Realism of Distilled Dataset: An Efficient Dataset\n Distillation Paradigm","summary":" Contemporary machine learning requires training large neural networks on\nmassive datasets and thus faces the challenges of high computational demands.\nDataset distillation, as a recent emerging strategy, aims to compress\nreal-world datasets for efficient training. However, this line of research\ncurrently struggles with large-scale and high-resolution datasets, hindering its\npracticality and feasibility. To this end, we re-examine the existing dataset\ndistillation methods and identify three properties required for large-scale\nreal-world applications, namely, realism, diversity, and efficiency. As a\nremedy, we propose RDED, a novel computationally-efficient yet effective data\ndistillation paradigm, to enable both diversity and realism of the distilled\ndata. Extensive empirical results over various neural architectures and\ndatasets demonstrate the advancement of RDED: we can distill the full\nImageNet-1K to a small dataset comprising 10 images per class within 7 minutes,\nachieving a notable 42% top-1 accuracy with ResNet-18 on a single RTX-4090 GPU\n(while the SOTA only achieves 21% but requires 6 hours).\n","authors":["Peng Sun","Bei Shi","Daiwei Yu","Tao Lin"],"pdf_url":"https://arxiv.org/pdf/2312.03526v1.pdf","comment":"17 pages, 20 figures"},{"id":"http://arxiv.org/abs/2311.17030v2","updated":"2023-12-06T14:28:46Z","published":"2023-11-28T18:32:19Z","title":"Is This the Subspace You Are Looking for? An Interpretability Illusion\n for Subspace Activation Patching","summary":" Mechanistic interpretability aims to understand model behaviors in terms of\nspecific, interpretable features, often hypothesized to manifest as\nlow-dimensional subspaces of activations. Specifically, recent studies have\nexplored subspace interventions (such as activation patching) as a way to\nsimultaneously manipulate model behavior and attribute the features behind it\nto given subspaces.\n In this work, we demonstrate that these two aims diverge, potentially leading\nto an illusory sense of interpretability. Counterintuitively, even if a\nsubspace intervention makes the model's output behave as if the value of a\nfeature was changed, this effect may be achieved by activating a dormant\nparallel pathway leveraging another subspace that is causally disconnected from\nmodel outputs. We demonstrate this phenomenon in a distilled mathematical\nexample, in two real-world domains (the indirect object identification task and\nfactual recall), and present evidence for its prevalence in practice. 
In the\ncontext of factual recall, we further show a link to rank-1 fact editing,\nproviding a mechanistic explanation for previous work observing an\ninconsistency between fact editing performance and fact localization.\n However, this does not imply that activation patching of subspaces is\nintrinsically unfit for interpretability. To contextualize our findings, we\nalso show what a success case looks like in a task (indirect object\nidentification) where prior manual circuit analysis informs an understanding of\nthe location of a feature. We explore the additional evidence needed to argue\nthat a patched subspace is faithful.\n","authors":["Aleksandar Makelov","Georg Lange","Neel Nanda"],"pdf_url":"https://arxiv.org/pdf/2311.17030v2.pdf","comment":"NeurIPS 2023 Workshop on Attributing Model Behavior at Scale"},{"id":"http://arxiv.org/abs/2312.03516v1","updated":"2023-12-06T14:21:17Z","published":"2023-12-06T14:21:17Z","title":"Clustering by Contour coreset and variational quantum eigensolver","summary":" Recent work has proposed solving the k-means clustering problem on quantum\ncomputers via the Quantum Approximate Optimization Algorithm (QAOA) and coreset\ntechniques. Although the current method demonstrates the possibility of quantum\nk-means clustering, it does not ensure high accuracy and consistency across a\nwide range of datasets. The existing coreset techniques are designed for\nclassical algorithms and there has been no quantum-tailored coreset technique\nwhich is designed to boost the accuracy of quantum algorithms. In this work, we\npropose solving the k-means clustering problem with the variational quantum\neigensolver (VQE) and a customised coreset method, the Contour coreset, which\nhas been formulated with specific focus on quantum algorithms. Extensive\nsimulations with synthetic and real-life data demonstrated that our VQE+Contour\nCoreset approach outperforms existing QAOA+Coreset k-means clustering\napproaches with higher accuracy and lower standard deviation. Our work has\nshown that quantum-tailored coreset techniques have the potential to\nsignificantly boost the performance of quantum algorithms when compared to\nusing generic off-the-shelf coreset techniques.\n","authors":["Canaan Yung","Muhammad Usman"],"pdf_url":"https://arxiv.org/pdf/2312.03516v1.pdf","comment":"33 pages"},{"id":"http://arxiv.org/abs/2311.09058v2","updated":"2023-12-06T14:20:53Z","published":"2023-11-15T15:50:34Z","title":"Constrained Parameter Regularization","summary":" Regularization is a critical component in deep learning training, with weight\ndecay being a commonly used approach. It applies a constant penalty coefficient\nuniformly across all parameters. This may be unnecessarily restrictive for some\nparameters, while insufficiently restricting others. To dynamically adjust\npenalty coefficients for different parameter groups, we present constrained\nparameter regularization (CPR) as an alternative to traditional weight decay.\nInstead of applying a single constant penalty to all parameters, we enforce an\nupper bound on a statistical measure (e.g., the L$_2$-norm) of parameter\ngroups. Consequently, learning becomes a constrained optimization problem, which\nwe address by an adaptation of the augmented Lagrangian method. CPR only\nrequires two hyperparameters and incurs no measurable runtime overhead.\nAdditionally, we propose a simple but efficient mechanism to adapt the upper\nbounds during the optimization. 
We provide empirical evidence of CPR's efficacy\nin experiments on the \"grokking\" phenomenon, computer vision, and language\nmodeling tasks. Our results demonstrate that CPR counteracts the effects of\ngrokking and consistently matches or outperforms traditional weight decay.\n","authors":["Jörg K. H. Franke","Michael Hefenbrock","Gregor Koehler","Frank Hutter"],"pdf_url":"https://arxiv.org/pdf/2311.09058v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03511v1","updated":"2023-12-06T14:13:38Z","published":"2023-12-06T14:13:38Z","title":"Kandinsky 3.0 Technical Report","summary":" We present Kandinsky 3.0, a large-scale text-to-image generation model based\non latent diffusion, continuing the series of text-to-image Kandinsky models\nand reflecting our progress to achieve higher quality and realism of image\ngeneration. Compared to previous versions of Kandinsky 2.x, Kandinsky 3.0\nleverages a two times larger U-Net backbone, a ten times larger text encoder\nand removes diffusion mapping. We describe the architecture of the model, the\ndata collection procedure, the training technique, and the production system of\nuser interaction. We focus on the key components that, as we have identified as\na result of a large number of experiments, had the most significant impact on\nimproving the quality of our model compared to the others. By our side-by-side\ncomparisons, Kandinsky becomes better in text understanding and works better on\nspecific domains. Project page: https://ai-forever.github.io/Kandinsky-3\n","authors":["Vladimir Arkhipkin","Andrei Filatov","Viacheslav Vasilev","Anastasia Maltseva","Said Azizov","Igor Pavlov","Julia Agafonova","Andrey Kuznetsov","Denis Dimitrov"],"pdf_url":"https://arxiv.org/pdf/2312.03511v1.pdf","comment":"Project page: https://ai-forever.github.io/Kandinsky-3"},{"id":"http://arxiv.org/abs/2312.03510v1","updated":"2023-12-06T14:13:30Z","published":"2023-12-06T14:13:30Z","title":"Towards Sobolev Training","summary":" The increasing use of stochastic models for describing complex phenomena\nwarrants surrogate models that capture the reference model characteristics at a\nfraction of the computational cost, foregoing potentially expensive Monte Carlo\nsimulation. The predominant approach of fitting a large neural network and then\npruning it to a reduced size has commonly neglected shortcomings. The produced\nsurrogate models often will not capture the sensitivities and uncertainties\ninherent in the original model. In particular, (higher-order) derivative\ninformation of such surrogates could differ drastically. Given a large enough\nnetwork, we expect this derivative information to match. However, the pruned\nmodel will almost certainly not share this behavior.\n In this paper, we propose to find surrogate models by using sensitivity\ninformation throughout the learning and pruning process. We build on work using\nInterval Adjoint Significance Analysis for pruning and combine it with the\nrecent advancements in Sobolev Training to accurately model the original\nsensitivity information in the pruned neural network based surrogate model. We\nexperimentally underpin the method on an example of pricing a multidimensional\nBasket option modelled through a stochastic differential equation with Brownian\nmotion. The proposed method is, however, not limited to the domain of\nquantitative finance, which was chosen as a case study for intuitive\ninterpretations of the sensitivities. 
It serves as a foundation for building\nfurther surrogate modelling techniques considering sensitivity information.\n","authors":["Neil Kichler","Sher Afghan","Uwe Naumann"],"pdf_url":"https://arxiv.org/pdf/2312.03510v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00886v3","updated":"2023-12-06T14:07:10Z","published":"2023-12-01T19:26:23Z","title":"Nash Learning from Human Feedback","summary":" Reinforcement learning from human feedback (RLHF) has emerged as the main\nparadigm for aligning large language models (LLMs) with human preferences.\nTypically, RLHF involves the initial step of learning a reward model from human\nfeedback, often expressed as preferences between pairs of text generations\nproduced by a pre-trained LLM. Subsequently, the LLM's policy is fine-tuned by\noptimizing it to maximize the reward model through a reinforcement learning\nalgorithm. However, an inherent limitation of current reward models is their\ninability to fully represent the richness of human preferences and their\ndependency on the sampling distribution.\n In this study, we introduce an alternative pipeline for the fine-tuning of\nLLMs using pairwise human feedback. Our approach entails the initial learning\nof a preference model, which is conditioned on two inputs given a prompt,\nfollowed by the pursuit of a policy that consistently generates responses\npreferred over those generated by any competing policy, thus defining the Nash\nequilibrium of this preference model. We term this approach Nash learning from\nhuman feedback (NLHF).\n In the context of a tabular policy representation, we present a novel\nalgorithmic solution, Nash-MD, founded on the principles of mirror descent.\nThis algorithm produces a sequence of policies, with the last iteration\nconverging to the regularized Nash equilibrium. Additionally, we explore\nparametric representations of policies and introduce gradient descent\nalgorithms for deep-learning architectures. To demonstrate the effectiveness of\nour approach, we present experimental results involving the fine-tuning of a\nLLM for a text summarization task. We believe NLHF offers a compelling avenue\nfor preference learning and policy optimization with the potential of advancing\nthe field of aligning LLMs with human preferences.\n","authors":["Rémi Munos","Michal Valko","Daniele Calandriello","Mohammad Gheshlaghi Azar","Mark Rowland","Zhaohan Daniel Guo","Yunhao Tang","Matthieu Geist","Thomas Mesnard","Andrea Michi","Marco Selvi","Sertan Girgin","Nikola Momchev","Olivier Bachem","Daniel J. Mankowitz","Doina Precup","Bilal Piot"],"pdf_url":"https://arxiv.org/pdf/2312.00886v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2207.03932v2","updated":"2023-12-06T13:59:17Z","published":"2022-07-08T14:33:16Z","title":"Memory-free Online Change-point Detection: A Novel Neural Network\n Approach","summary":" Change-point detection (CPD), which detects abrupt changes in the data\ndistribution, is recognized as one of the most significant tasks in time series\nanalysis. Despite the extensive literature on offline CPD, unsupervised online\nCPD still suffers from major challenges, including scalability, hyperparameter\ntuning, and learning constraints. To mitigate some of these challenges, in this\npaper, we propose a novel deep learning approach for unsupervised online CPD\nfrom multi-dimensional time series, named Adaptive LSTM-Autoencoder\nChange-Point Detection (ALACPD). ALACPD exploits an LSTM-autoencoder-based\nneural network to perform unsupervised online CPD. 
It continuously adapts to\nthe incoming samples without keeping the previously received input, thus being\nmemory-free. We perform an extensive evaluation on several real-world time\nseries CPD benchmarks. We show that ALACPD, on average, ranks first among\nstate-of-the-art CPD algorithms in terms of quality of the time series\nsegmentation, and it is on par with the best performer in terms of the accuracy\nof the estimated change-points. The implementation of ALACPD is available\nonline on Github\\footnote{\\url{https://github.com/zahraatashgahi/ALACPD}}.\n","authors":["Zahra Atashgahi","Decebal Constantin Mocanu","Raymond Veldhuis","Mykola Pechenizkiy"],"pdf_url":"https://arxiv.org/pdf/2207.03932v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03497v1","updated":"2023-12-06T13:46:30Z","published":"2023-12-06T13:46:30Z","title":"Speculative Exploration on the Concept of Artificial Agents Conducting\n Autonomous Research","summary":" This paper engages in a speculative exploration of the concept of an\nartificial agent capable of conducting research. Initially, it examines how the\nact of research can be conceptually characterized, aiming to provide a starting\npoint for discussions about what it means to create such agents. The focus then\nshifts to the core components of research: question formulation, hypothesis\ngeneration, and hypothesis verification. This discussion includes a\nconsideration of the potential and challenges associated with enabling machines\nto autonomously perform these tasks. Subsequently, this paper briefly considers\nthe overlapping themes and interconnections that underlie them. Finally, the\npaper presents preliminary thoughts on prototyping as an initial step towards\nuncovering the challenges involved in developing these research-capable agents.\n","authors":["Shiro Takagi"],"pdf_url":"https://arxiv.org/pdf/2312.03497v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03492v1","updated":"2023-12-06T13:32:17Z","published":"2023-12-06T13:32:17Z","title":"Learning From Scenarios for Stochastic Repairable Scheduling","summary":" When optimizing problems with uncertain parameter values in a linear\nobjective, decision-focused learning enables end-to-end learning of these\nvalues. We are interested in a stochastic scheduling problem, in which\nprocessing times are uncertain, which brings uncertain values in the\nconstraints, and thus repair of an initial schedule may be needed. Historical\nrealizations of the stochastic processing times are available. We show how\nexisting decision-focused learning techniques based on stochastic smoothing can\nbe adapted to this scheduling problem. We include an extensive experimental\nevaluation to investigate in which situations decision-focused learning\noutperforms the state of the art for such situations: scenario-based stochastic\noptimization.\n","authors":["Kim van den Houten","David M. J. Tax","Esteban Freydell","Mathijs de Weerdt"],"pdf_url":"https://arxiv.org/pdf/2312.03492v1.pdf","comment":"8 pages"},{"id":"http://arxiv.org/abs/2312.03491v1","updated":"2023-12-06T13:31:55Z","published":"2023-12-06T13:31:55Z","title":"Schrodinger Bridges Beat Diffusion Models on Text-to-Speech Synthesis","summary":" In text-to-speech (TTS) synthesis, diffusion models have achieved promising\ngeneration quality. However, because of the pre-defined data-to-noise diffusion\nprocess, their prior distribution is restricted to a noisy representation,\nwhich provides little information of the generation target. 
In this work, we\npresent a novel TTS system, Bridge-TTS, making the first attempt to substitute\nthe noisy Gaussian prior in established diffusion-based TTS methods with a\nclean and deterministic one, which provides strong structural information of\nthe target. Specifically, we leverage the latent representation obtained from\ntext input as our prior, and build a fully tractable Schrodinger bridge between\nit and the ground-truth mel-spectrogram, leading to a data-to-data process.\nMoreover, the tractability and flexibility of our formulation allow us to\nempirically study the design spaces such as noise schedules, as well as to\ndevelop stochastic and deterministic samplers. Experimental results on the\nLJ-Speech dataset illustrate the effectiveness of our method in terms of both\nsynthesis quality and sampling efficiency, significantly outperforming our\ndiffusion counterpart Grad-TTS in 50-step/1000-step synthesis and strong fast\nTTS models in few-step scenarios. Project page: https://bridge-tts.github.io/\n","authors":["Zehua Chen","Guande He","Kaiwen Zheng","Xu Tan","Jun Zhu"],"pdf_url":"https://arxiv.org/pdf/2312.03491v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03485v1","updated":"2023-12-06T13:29:23Z","published":"2023-12-06T13:29:23Z","title":"Precision of Individual Shapley Value Explanations","summary":" Shapley values are extensively used in explainable artificial intelligence\n(XAI) as a framework to explain predictions made by complex machine learning\n(ML) models. In this work, we focus on conditional Shapley values for\npredictive models fitted to tabular data and explain the prediction\n$f(\\boldsymbol{x}^{*})$ for a single observation $\\boldsymbol{x}^{*}$ at the\ntime. Numerous Shapley value estimation methods have been proposed and\nempirically compared on an average basis in the XAI literature. However, less\nfocus has been devoted to analyzing the precision of the Shapley value\nexplanations on an individual basis. We extend our work in Olsen et al. (2023)\nby demonstrating and discussing that the explanations are systematically less\nprecise for observations on the outer region of the training data distribution\nfor all used estimation methods. This is expected from a statistical point of\nview, but to the best of our knowledge, it has not been systematically\naddressed in the Shapley value literature. This is crucial knowledge for\nShapley values practitioners, who should be more careful in applying these\nobservations' corresponding Shapley value explanations.\n","authors":["Lars Henry Berge Olsen"],"pdf_url":"https://arxiv.org/pdf/2312.03485v1.pdf","comment":"Submitted to the 23rd European Young Statisticians Meeting (EYSM)"},{"id":"http://arxiv.org/abs/2312.03483v1","updated":"2023-12-06T13:26:16Z","published":"2023-12-06T13:26:16Z","title":"Exploring Answer Information Methods for Question Generation with\n Transformers","summary":" There has been a lot of work in question generation where different methods\nto provide target answers as input, have been employed. This experimentation\nhas been mostly carried out for RNN based models. We use three different\nmethods and their combinations for incorporating answer information and explore\ntheir effect on several automatic evaluation metrics. 
The methods that are used\nare answer prompting, using a custom product method using answer embeddings and\nencoder outputs, choosing sentences from the input paragraph that have answer\nrelated information, and using a separate cross-attention attention block in\nthe decoder which attends to the answer. We observe that answer prompting\nwithout any additional modes obtains the best scores across rouge, meteor\nscores. Additionally, we use a custom metric to calculate how many of the\ngenerated questions have the same answer, as the answer which is used to\ngenerate them.\n","authors":["Talha Chafekar","Aafiya Hussain","Grishma Sharma","Deepak Sharma"],"pdf_url":"https://arxiv.org/pdf/2312.03483v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03477v1","updated":"2023-12-06T13:10:02Z","published":"2023-12-06T13:10:02Z","title":"From Detection to Action Recognition: An Edge-Based Pipeline for Robot\n Human Perception","summary":" Mobile service robots are proving to be increasingly effective in a range of\napplications, such as healthcare, monitoring Activities of Daily Living (ADL),\nand facilitating Ambient Assisted Living (AAL). These robots heavily rely on\nHuman Action Recognition (HAR) to interpret human actions and intentions.\nHowever, for HAR to function effectively on service robots, it requires prior\nknowledge of human presence (human detection) and identification of individuals\nto monitor (human tracking). In this work, we propose an end-to-end pipeline\nthat encompasses the entire process, starting from human detection and\ntracking, leading to action recognition. The pipeline is designed to operate in\nnear real-time while ensuring all stages of processing are performed on the\nedge, reducing the need for centralised computation. To identify the most\nsuitable models for our mobile robot, we conducted a series of experiments\ncomparing state-of-the-art solutions based on both their detection performance\nand efficiency. To evaluate the effectiveness of our proposed pipeline, we\nproposed a dataset comprising daily household activities. By presenting our\nfindings and analysing the results, we demonstrate the efficacy of our approach\nin enabling mobile robots to understand and respond to human behaviour in\nreal-world scenarios relying mainly on the data from their RGB cameras.\n","authors":["Petros Toupas","Georgios Tsamis","Dimitrios Giakoumis","Konstantinos Votis","Dimitrios Tzovaras"],"pdf_url":"https://arxiv.org/pdf/2312.03477v1.pdf","comment":"7 pages, 10 figures, 2 tables"},{"id":"http://arxiv.org/abs/2312.03475v1","updated":"2023-12-06T12:58:37Z","published":"2023-12-06T12:58:37Z","title":"Molecule Joint Auto-Encoding: Trajectory Pretraining with 2D and 3D\n Diffusion","summary":" Recently, artificial intelligence for drug discovery has raised increasing\ninterest in both machine learning and chemistry domains. The fundamental\nbuilding block for drug discovery is molecule geometry and thus, the molecule's\ngeometrical representation is the main bottleneck to better utilize machine\nlearning techniques for drug discovery. In this work, we propose a pretraining\nmethod for molecule joint auto-encoding (MoleculeJAE). MoleculeJAE can learn\nboth the 2D bond (topology) and 3D conformation (geometry) information, and a\ndiffusion process model is applied to mimic the augmented trajectories of such\ntwo modalities, based on which, MoleculeJAE will learn the inherent chemical\nstructure in a self-supervised manner. 
Thus, the pretrained geometrical\nrepresentation in MoleculeJAE is expected to benefit downstream\ngeometry-related tasks. Empirically, MoleculeJAE proves its effectiveness by\nreaching state-of-the-art performance on 15 out of 20 tasks by comparing it\nwith 12 competitive baselines.\n","authors":["Weitao Du","Jiujiu Chen","Xuecang Zhang","Zhiming Ma","Shengchao Liu"],"pdf_url":"https://arxiv.org/pdf/2312.03475v1.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2107.12065v2","updated":"2023-12-06T12:52:40Z","published":"2021-07-26T09:42:33Z","title":"Provably Accelerated Decentralized Gradient Method Over Unbalanced\n Directed Graphs","summary":" We consider the decentralized optimization problem, where a network of $n$\nagents aims to collaboratively minimize the average of their individual smooth\nand convex objective functions through peer-to-peer communication in a directed\ngraph. To tackle this problem, we propose two accelerated gradient tracking\nmethods, namely APD and APD-SC, for non-strongly convex and strongly convex\nobjective functions, respectively. We show that APD and APD-SC converge at the\nrates $O\\left(\\frac{1}{k^2}\\right)$ and $O\\left(\\left(1 -\nC\\sqrt{\\frac{\\mu}{L}}\\right)^k\\right)$, respectively, up to constant factors\ndepending only on the mixing matrix. APD and APD-SC are the first decentralized\nmethods over unbalanced directed graphs that achieve the same provable\nacceleration as centralized methods. Numerical experiments demonstrate the\neffectiveness of both methods.\n","authors":["Zhuoqing Song","Lei Shi","Shi Pu","Ming Yan"],"pdf_url":"https://arxiv.org/pdf/2107.12065v2.pdf","comment":"SIAM Journal on Optimization, in press"},{"id":"http://arxiv.org/abs/2312.03466v1","updated":"2023-12-06T12:41:53Z","published":"2023-12-06T12:41:53Z","title":"Search Strategies for Self-driving Laboratories with Pending Experiments","summary":" Self-driving laboratories (SDLs) consist of multiple stations that perform\nmaterial synthesis and characterisation tasks. To minimize station downtime and\nmaximize experimental throughput, it is practical to run experiments in\nasynchronous parallel, in which multiple experiments are being performed at\nonce in different stages. Asynchronous parallelization of experiments, however,\nintroduces delayed feedback (i.e. \"pending experiments\"), which is known to\nreduce Bayesian optimiser performance. Here, we build a simulator for a\nmulti-stage SDL and compare optimisation strategies for dealing with delayed\nfeedback and asynchronous parallelized operation. Using data from a real SDL,\nwe build a ground truth Bayesian optimisation simulator from 177 previously run\nexperiments for maximizing the conductivity of functional coatings. We then\ncompare search strategies such as expected improvement, noisy expected\nimprovement, 4-mode exploration and random sampling. We evaluate their\nperformance in terms of amount of delay and problem dimensionality. 
Our\nsimulation results showcase the trade-off between the asynchronous parallel\noperation and delayed feedback.\n","authors":["Hao Wen","Jakob Zeitler","Connor Rupnow"],"pdf_url":"https://arxiv.org/pdf/2312.03466v1.pdf","comment":"Accepted at NeurIPS 2023, AI4Mat"},{"id":"http://arxiv.org/abs/2312.03464v1","updated":"2023-12-06T12:40:06Z","published":"2023-12-06T12:40:06Z","title":"Subnetwork-to-go: Elastic Neural Network with Dynamic Training and\n Customizable Inference","summary":" Deploying neural networks to different devices or platforms is in general\nchallenging, especially when the model size is large or model complexity is\nhigh. Although there exist ways for model pruning or distillation, it is\ntypically required to perform a full round of model training or finetuning\nprocedure in order to obtain a smaller model that satisfies the model size or\ncomplexity constraints. Motivated by recent works on dynamic neural networks,\nwe propose a simple way to train a large network and flexibly extract a\nsubnetwork from it given a model size or complexity constraint during\ninference. We introduce a new way to allow a large model to be trained with\ndynamic depth and width during the training phase, and after the large model is\ntrained we can select a subnetwork from it with arbitrary depth and width\nduring the inference phase with a relatively better performance compared to\ntraining the subnetwork independently from scratch. Experiment results on a\nmusic source separation model show that our proposed method can effectively\nimprove the separation performance across different subnetwork sizes and\ncomplexities with a single large model, and training the large model takes\nsignificantly shorter time than training all the different subnetworks.\n","authors":["Kai Li","Yi Luo"],"pdf_url":"https://arxiv.org/pdf/2312.03464v1.pdf","comment":"5 pages, 3 figures"},{"id":"http://arxiv.org/abs/2310.08577v3","updated":"2023-12-06T12:34:16Z","published":"2023-10-12T17:59:30Z","title":"Visual Data-Type Understanding does not emerge from Scaling\n Vision-Language Models","summary":" Recent advances in the development of vision-language models (VLMs) are\nyielding remarkable success in recognizing visual semantic content, including\nimpressive instances of compositional image understanding. Here, we introduce\nthe novel task of Visual Data-Type Identification, a basic perceptual skill\nwith implications for data curation (e.g., noisy data-removal from large\ndatasets, domain-specific retrieval) and autonomous vision (e.g.,\ndistinguishing changing weather conditions from camera lens staining). We\ndevelop two datasets consisting of animal images altered across a diverse set\nof 27 visual data-types, spanning four broad categories. An extensive zero-shot\nevaluation of 39 VLMs, ranging from 100M to 80B parameters, shows a nuanced\nperformance landscape. While VLMs are reasonably good at identifying certain\nstylistic \\textit{data-types}, such as cartoons and sketches, they struggle\nwith simpler data-types arising from basic manipulations like image rotations\nor additive noise. Our findings reveal that (i) model scaling alone yields\nmarginal gains for contrastively-trained models like CLIP, and (ii) there is a\npronounced drop in performance for the largest auto-regressively trained VLMs\nlike OpenFlamingo. This finding points to a blind spot in current frontier\nVLMs: they excel in recognizing semantic content but fail to acquire an\nunderstanding of visual data-types through scaling. 
By analyzing the\npre-training distributions of these models and incorporating data-type\ninformation into the captions during fine-tuning, we achieve a significant\nenhancement in performance. By exploring this previously uncharted task, we aim\nto set the stage for further advancing VLMs to equip them with visual data-type\nunderstanding. Code and datasets are released at\nhttps://github.com/bethgelab/DataTypeIdentification.\n","authors":["Vishaal Udandarao","Max F. Burg","Samuel Albanie","Matthias Bethge"],"pdf_url":"https://arxiv.org/pdf/2310.08577v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.03696v2","updated":"2023-12-06T12:30:45Z","published":"2023-10-05T17:13:16Z","title":"Function-Space Optimality of Neural Architectures With Multivariate\n Nonlinearities","summary":" We investigate the function-space optimality (specifically, the Banach-space\noptimality) of a large class of shallow neural architectures with multivariate\nnonlinearities/activation functions. To that end, we construct a new family of\nBanach spaces defined via a regularization operator, the $k$-plane transform,\nand a sparsity-promoting norm. We prove a representer theorem that states that\nthe solution sets to learning problems posed over these Banach spaces are\ncompletely characterized by neural architectures with multivariate\nnonlinearities. These optimal architectures have skip connections and are\ntightly connected to orthogonal weight normalization and multi-index models,\nboth of which have received recent interest in the neural network community.\nOur framework is compatible with a number of classical nonlinearities including\nthe rectified linear unit (ReLU) activation function, the norm activation\nfunction, and the radial basis functions found in the theory of\nthin-plate/polyharmonic splines. We also show that the underlying spaces are\nspecial instances of reproducing kernel Banach spaces and variation spaces. Our\nresults shed light on the regularity of functions learned by neural networks\ntrained on data, particularly with multivariate nonlinearities, and provide new\ntheoretical motivation for several architectural choices found in practice.\n","authors":["Rahul Parhi","Michael Unser"],"pdf_url":"https://arxiv.org/pdf/2310.03696v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03455v1","updated":"2023-12-06T12:27:25Z","published":"2023-12-06T12:27:25Z","title":"Data is Overrated: Perceptual Metrics Can Lead Learning in the Absence\n of Training Data","summary":" Perceptual metrics are traditionally used to evaluate the quality of natural\nsignals, such as images and audio. They are designed to mimic the perceptual\nbehaviour of human observers and usually reflect structures found in natural\nsignals. This motivates their use as loss functions for training generative\nmodels such that models will learn to capture the structure held in the metric.\nWe take this idea to the extreme in the audio domain by training a compressive\nautoencoder to reconstruct uniform noise, in lieu of natural data. We show that\ntraining with perceptual losses improves the reconstruction of spectrograms and\nre-synthesized audio at test time over models trained with a standard Euclidean\nloss. 
This demonstrates better generalisation to unseen natural signals when\nusing perceptual metrics.\n","authors":["Tashi Namgyal","Alexander Hepburn","Raul Santos-Rodriguez","Valero Laparra","Jesus Malo"],"pdf_url":"https://arxiv.org/pdf/2312.03455v1.pdf","comment":"Machine Learning for Audio Workshop, NeurIPS 2023"},{"id":"http://arxiv.org/abs/2307.03761v2","updated":"2023-12-06T11:59:17Z","published":"2023-07-07T12:22:16Z","title":"DyEdgeGAT: Dynamic Edge via Graph Attention for Early Fault Detection in\n IIoT Systems","summary":" In the industrial Internet of Things, condition monitoring sensor signals\nfrom complex systems often exhibit strong nonlinear and stochastic\nspatial-temporal dynamics under varying operating conditions. Such complex\ndynamics make fault detection particularly challenging. Although previously\nproposed methods effectively model these dynamics, they often neglect the\ndynamic evolution of relationships between sensor signals. Undetected shifts in\nthese relationships can potentially result in significant system failures.\nAnother limitation is their inability to effectively distinguish between novel\noperating conditions and actual faults. To address this gap, we propose\nDyEdgeGAT (Dynamic Edge via Graph Attention), a novel approach capable of\ndetecting various faults, especially those characterized by relationship\nchanges at early stages, while distinguishing faults from novel operating\nconditions. DyEdgeGAT is a graph-based framework that provides a novel graph\ninference scheme for multivariate time series that dynamically constructs edges\nto represent and track the evolution of relationships between time series.\nAdditionally, it addresses a commonly overlooked aspect: the cause-and-effect\nrelationships within the system, such as between control inputs and\nmeasurements. By incorporating system-independent variables as contexts of\noperating conditions into node dynamics extraction, DyEdgeGAT enhances its\nrobustness against novel operating conditions. We rigorously evaluate\nDyEdgeGAT's performance using both a synthetic dataset, designed to simulate\nvarying levels of fault severity and a real-world industrial-scale benchmark\ncontaining a variety of fault types with different detection complexities. Our\nfindings demonstrate that DyEdgeGAT is highly effective in fault detection,\nshowing particular strength in early fault detection while maintaining\nrobustness under novel operating conditions.\n","authors":["Mengjie Zhao","Olga Fink"],"pdf_url":"https://arxiv.org/pdf/2307.03761v2.pdf","comment":"16 pages, 8 figures"},{"id":"http://arxiv.org/abs/2310.11230v2","updated":"2023-12-06T11:58:34Z","published":"2023-10-17T13:01:10Z","title":"Zipformer: A faster and better encoder for automatic speech recognition","summary":" The Conformer has become the most popular encoder model for automatic speech\nrecognition (ASR). It adds convolution modules to a transformer to learn both\nlocal and global dependencies. In this work we describe a faster, more\nmemory-efficient, and better-performing transformer, called Zipformer. Modeling\nchanges include: 1) a U-Net-like encoder structure where middle stacks operate\nat lower frame rates; 2) reorganized block structure with more modules, within\nwhich we re-use attention weights for efficiency; 3) a modified form of\nLayerNorm called BiasNorm allows us to retain some length information; 4) new\nactivation functions SwooshR and SwooshL work better than Swish. 
We also\npropose a new optimizer, called ScaledAdam, which scales the update by each\ntensor's current scale to keep the relative change about the same, and also\nexplicitly learns the parameter scale. It achieves faster convergence and better\nperformance than Adam. Extensive experiments on LibriSpeech, Aishell-1, and\nWenetSpeech datasets demonstrate the effectiveness of our proposed Zipformer\nover other state-of-the-art ASR models. Our code is publicly available at\nhttps://github.com/k2-fsa/icefall.\n","authors":["Zengwei Yao","Liyong Guo","Xiaoyu Yang","Wei Kang","Fangjun Kuang","Yifan Yang","Zengrui Jin","Long Lin","Daniel Povey"],"pdf_url":"https://arxiv.org/pdf/2310.11230v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2309.14209v2","updated":"2023-12-06T11:21:28Z","published":"2023-09-25T15:14:54Z","title":"Continual Driving Policy Optimization with Closed-Loop Individualized\n Curricula","summary":" The safety of autonomous vehicles (AV) has been a long-standing top concern,\nstemming from the absence of rare and safety-critical scenarios in the\nlong-tail naturalistic driving distribution. To tackle this challenge, a surge\nof research in scenario-based autonomous driving has emerged, with a focus on\ngenerating high-risk driving scenarios and applying them to conduct\nsafety-critical testing of AV models. However, limited work has been explored\non the reuse of these extensive scenarios to iteratively improve AV models.\nMoreover, it remains intractable and challenging to filter through gigantic\nscenario libraries collected from other AV models with distinct behaviors,\nattempting to extract transferable information for current AV improvement.\nTherefore, we develop a continual driving policy optimization framework\nfeaturing Closed-Loop Individualized Curricula (CLIC), which we factorize into\na set of standardized sub-modules for flexible implementation choices: AV\nEvaluation, Scenario Selection, and AV Training. CLIC frames AV Evaluation as a\ncollision prediction task, where it estimates the chance of AV failures in\nthese scenarios at each iteration. Subsequently, by re-sampling from historical\nscenarios based on these failure probabilities, CLIC tailors individualized\ncurricula for downstream training, aligning them with the evaluated capability\nof AV. Accordingly, CLIC not only maximizes the utilization of the vast\npre-collected scenario library for closed-loop driving policy optimization but\nalso facilitates AV improvement by individualizing its training with more\nchallenging cases out of those poorly organized scenarios. Experimental results\nclearly indicate that CLIC surpasses other curriculum-based training\nstrategies, showing substantial improvement in managing risky scenarios, while\nstill maintaining proficiency in handling simpler cases.\n","authors":["Haoyi Niu","Yizhou Xu","Xingjian Jiang","Jianming Hu"],"pdf_url":"https://arxiv.org/pdf/2309.14209v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2201.04093v2","updated":"2023-12-06T11:14:38Z","published":"2022-01-11T17:36:34Z","title":"Systematic Literature Review: Quantum Machine Learning and its\n applications","summary":" Quantum computing is the process of performing calculations using quantum\nmechanics. This field studies the quantum behavior of certain subatomic\nparticles for subsequent use in performing calculations, as well as for\nlarge-scale information processing. 
These capabilities can give quantum\ncomputers an advantage in terms of computational time and cost over classical\ncomputers. Nowadays, there are scientific challenges that are impossible to\nperform by classical computation due to computational complexity or the time\nthe calculation would take, and quantum computation is one of the possible\nanswers. However, current quantum devices have not yet the necessary qubits and\nare not fault-tolerant enough to achieve these goals. Nonetheless, there are\nother fields like machine learning or chemistry where quantum computation could\nbe useful with current quantum devices. This manuscript aims to present a\nSystematic Literature Review of the papers published between 2017 and 2023 to\nidentify, analyze and classify the different algorithms used in quantum machine\nlearning and their applications. Consequently, this study identified 94\narticles that used quantum machine learning techniques and algorithms. The main\ntypes of found algorithms are quantum implementations of classical machine\nlearning algorithms, such as support vector machines or the k-nearest neighbor\nmodel, and classical deep learning algorithms, like quantum neural networks.\nMany articles try to solve problems currently answered by classical machine\nlearning but using quantum devices and algorithms. Even though results are\npromising, quantum machine learning is far from achieving its full potential.\nAn improvement in the quantum hardware is required since the existing quantum\ncomputers lack enough quality, speed, and scale to allow quantum computing to\nachieve its full potential.\n","authors":["David Peral García","Juan Cruz-Benito","Francisco José García-Peñalvo"],"pdf_url":"https://arxiv.org/pdf/2201.04093v2.pdf","comment":"28 pages, 25 figures"},{"id":"http://arxiv.org/abs/2312.03415v1","updated":"2023-12-06T10:54:34Z","published":"2023-12-06T10:54:34Z","title":"Run LoRA Run: Faster and Lighter LoRA Implementations","summary":" LoRA is a technique that reduces the number of trainable parameters in a\nneural network by introducing low-rank adapters to linear layers. This\ntechnique is used both for fine-tuning (LoRA, QLoRA) and full train (ReLoRA).\nThis paper presents the RunLoRA framework for efficient implementations of LoRA\nthat significantly improves the speed of neural network training and\nfine-tuning using low-rank adapters. The proposed implementation optimizes the\ncomputation of LoRA operations based on dimensions of corresponding linear\nlayer, layer input dimensions and lora rank by choosing best forward and\nbackward computation graph based on FLOPs and time estimations, resulting in\nfaster training without sacrificing accuracy. The experimental results show up\nto 17% speedup on Llama family of models.\n","authors":["Daria Cherniuk","Aleksandr Mikhalev","Ivan Oseledets"],"pdf_url":"https://arxiv.org/pdf/2312.03415v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2210.13179v5","updated":"2023-12-06T10:54:01Z","published":"2022-10-24T13:00:15Z","title":"A simple probabilistic neural network for machine understanding","summary":" We discuss probabilistic neural networks with a fixed internal representation\nas models for machine understanding. Here understanding is intended as mapping\ndata to an already existing representation which encodes an {\\em a priori}\norganisation of the feature space. 
We derive the internal representation by\nrequiring that it satisfies the principles of maximal relevance and of maximal\nignorance about how different features are combined. We show that, when hidden\nunits are binary variables, these two principles identify a unique model -- the\nHierarchical Feature Model (HFM) -- which is fully solvable and provides a\nnatural interpretation in terms of features. We argue that learning machines\nwith this architecture enjoy a number of interesting properties, like the\ncontinuity of the representation with respect to changes in parameters and\ndata, the possibility to control the level of compression and the ability to\nsupport functions that go beyond generalisation. We explore the behaviour of\nthe model with extensive numerical experiments and argue that models where the\ninternal representation is fixed reproduce a learning modality which is\nqualitatively different from that of traditional models such as Restricted\nBoltzmann Machines.\n","authors":["Rongrong Xie","Matteo Marsili"],"pdf_url":"https://arxiv.org/pdf/2210.13179v5.pdf","comment":"34 pages, 9 figures. Accepted in JSTAT"},{"id":"http://arxiv.org/abs/2306.16111v2","updated":"2023-12-06T10:52:15Z","published":"2023-06-28T11:27:48Z","title":"Time Regularization in Optimal Time Variable Learning","summary":" Recently, optimal time variable learning in deep neural networks (DNNs) was\nintroduced in arXiv:2204.08528. In this manuscript we extend the concept by\nintroducing a regularization term that directly relates to the time horizon in\ndiscrete dynamical systems. Furthermore, we propose an adaptive pruning\napproach for Residual Neural Networks (ResNets), which reduces network\ncomplexity without compromising expressiveness, while simultaneously decreasing\ntraining time. The results are illustrated by applying the proposed concepts to\nclassification tasks on the well known MNIST and Fashion MNIST data sets. Our\nPyTorch code is available on\nhttps://github.com/frederikkoehne/time_variable_learning.\n","authors":["Evelyn Herberg","Roland Herzog","Frederik Köhne"],"pdf_url":"https://arxiv.org/pdf/2306.16111v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03414v1","updated":"2023-12-06T10:50:43Z","published":"2023-12-06T10:50:43Z","title":"Compressed Context Memory For Online Language Model Interaction","summary":" This paper presents a novel context compression method for Transformer\nlanguage models in online scenarios such as ChatGPT, where the context\ncontinually expands. As the context lengthens, the attention process requires\nmore memory and computational resources, which in turn reduces the throughput\nof the language model. To this end, we propose a compressed context memory\nsystem that continually compresses the growing context into a compact memory\nspace. The compression process simply involves integrating a lightweight\nconditional LoRA into the language model's forward pass during inference. Based\non the compressed context memory, the language model can perform inference with\nreduced memory and attention operations. Through evaluations on conversation,\npersonalization, and multi-task learning, we demonstrate that our approach\nachieves the performance level of a full context model with $5\\times$ smaller\ncontext memory space. 
Codes are available at\nhttps://github.com/snu-mllab/context-memory.\n","authors":["Jang-Hyun Kim","Junyoung Yeom","Sangdoo Yun","Hyun Oh Song"],"pdf_url":"https://arxiv.org/pdf/2312.03414v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03413v1","updated":"2023-12-06T10:50:27Z","published":"2023-12-06T10:50:27Z","title":"Approximating Solutions to the Knapsack Problem using the Lagrangian\n Dual Framework","summary":" The Knapsack Problem is a classic problem in combinatorial optimisation.\nSolving these problems may be computationally expensive. Recent years have seen\na growing interest in the use of deep learning methods to approximate the\nsolutions to such problems. A core problem is how to enforce or encourage\nconstraint satisfaction in predicted solutions. A promising approach for\npredicting solutions to constrained optimisation problems is the Lagrangian\nDual Framework which builds on the method of Lagrangian Relaxation. In this\npaper we develop neural network models to approximate Knapsack Problem\nsolutions using the Lagrangian Dual Framework while improving constraint\nsatisfaction. We explore the problems of output interpretation and model\nselection within this context. Experimental results show strong constraint\nsatisfaction with a minor reduction of optimality as compared to a baseline\nneural network which does not explicitly model the constraints.\n","authors":["Mitchell Keegan","Mahdi Abolghasemi"],"pdf_url":"https://arxiv.org/pdf/2312.03413v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03406v1","updated":"2023-12-06T10:42:40Z","published":"2023-12-06T10:42:40Z","title":"SVQ: Sparse Vector Quantization for Spatiotemporal Forecasting","summary":" Spatiotemporal forecasting tasks, such as weather forecasting and traffic\nprediction, offer significant societal benefits. These tasks can be effectively\napproached as image forecasting problems using computer vision models. Vector\nquantization (VQ) is a well-known method for discrete representation that\nimproves the latent space, leading to enhanced generalization and transfer\nlearning capabilities. One of the main challenges in using VQ for\nspatiotemporal forecasting is how to balance between keeping enough details and\nremoving noises from the original patterns for better generalization. We\naddress this challenge by developing sparse vector quantization, or {\\bf SVQ}\nfor short, that leverages sparse regression to make better trade-off between\nthe two objectives. The main innovation of this work is to approximate sparse\nregression by a two-layer MLP and a randomly fixed or learnable matrix,\ndramatically improving its computational efficiency. Through experiments\nconducted on diverse datasets in multiple fields including weather forecasting,\ntraffic flow prediction, and video forecasting, we unequivocally demonstrate\nthat our proposed method consistently enhances the performance of base models\nand achieves state-of-the-art results across all benchmarks.\n","authors":["Chao Chen","Tian Zhou","Yanjun Zhao","Hui Liu","Liang Sun","Rong Jin"],"pdf_url":"https://arxiv.org/pdf/2312.03406v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03404v1","updated":"2023-12-06T10:40:33Z","published":"2023-12-06T10:40:33Z","title":"An AI for Scientific Discovery Route between Amorphous Networks and\n Mechanical Behavior","summary":" \"AI for science\" is widely recognized as a future trend in the development of\nscientific research. 
Currently, although machine learning algorithms have\nplayed a crucial role in scientific research with numerous successful cases,\nrelatively few instances exist where AI assists researchers in uncovering the\nunderlying physical mechanisms behind a certain phenomenon and subsequently\nusing that mechanism to improve machine learning algorithms' efficiency. This\narticle uses the investigation into the relationship between extreme Poisson's\nratio values and the structure of amorphous networks as a case study to\nillustrate how machine learning methods can assist in revealing underlying\nphysical mechanisms. Upon recognizing that the Poisson's ratio relies on the\nlow-frequency vibrational modes of dynamical matrix, we can then employ a\nconvolutional neural network, trained on the dynamical matrix instead of\ntraditional image recognition, to predict the Poisson's ratio of amorphous\nnetworks with a much higher efficiency. Through this example, we aim to\nshowcase the role that artificial intelligence can play in revealing\nfundamental physical mechanisms, which subsequently improves the machine\nlearning algorithms significantly.\n","authors":["Changliang Zhu","Chenchao Fang","Zhipeng Jin","Baowen Li","Xiangying Shen","Lei Xu"],"pdf_url":"https://arxiv.org/pdf/2312.03404v1.pdf","comment":"15 pages, 5 figures"},{"id":"http://arxiv.org/abs/2305.16498v3","updated":"2023-12-06T10:39:32Z","published":"2023-05-25T21:54:22Z","title":"Coherent Soft Imitation Learning","summary":" Imitation learning methods seek to learn from an expert either through\nbehavioral cloning (BC) of the policy or inverse reinforcement learning (IRL)\nof the reward. Such methods enable agents to learn complex tasks from humans\nthat are difficult to capture with hand-designed reward functions. Choosing BC\nor IRL for imitation depends on the quality and state-action coverage of the\ndemonstrations, as well as additional access to the Markov decision process.\nHybrid strategies that combine BC and IRL are not common, as initial policy\noptimization against inaccurate rewards diminishes the benefit of pretraining\nthe policy with BC. This work derives an imitation method that captures the\nstrengths of both BC and IRL. In the entropy-regularized ('soft') reinforcement\nlearning setting, we show that the behaviour-cloned policy can be used as both\na shaped reward and a critic hypothesis space by inverting the regularized\npolicy update. This coherency facilitates fine-tuning cloned policies using the\nreward estimate and additional interactions with the environment. This approach\nconveniently achieves imitation learning through initial behaviour cloning,\nfollowed by refinement via RL with online or offline data sources. The\nsimplicity of the approach enables graceful scaling to high-dimensional and\nvision-based tasks, with stable learning and minimal hyperparameter tuning, in\ncontrast to adversarial approaches. For the open-source implementation and\nsimulation results, see https://joemwatson.github.io/csil/.\n","authors":["Joe Watson","Sandy H. Huang","Nicolas Heess"],"pdf_url":"https://arxiv.org/pdf/2305.16498v3.pdf","comment":"51 pages, 49 figures. DeepMind internship report. 
Accepted as a\n spotlight paper at Advances in Neural Information Processing Systems 2023"},{"id":"http://arxiv.org/abs/2312.03397v1","updated":"2023-12-06T10:10:21Z","published":"2023-12-06T10:10:21Z","title":"Generalized Contrastive Divergence: Joint Training of Energy-Based Model\n and Diffusion Model through Inverse Reinforcement Learning","summary":" We present Generalized Contrastive Divergence (GCD), a novel objective\nfunction for training an energy-based model (EBM) and a sampler simultaneously.\nGCD generalizes Contrastive Divergence (Hinton, 2002), a celebrated algorithm\nfor training EBM, by replacing Markov Chain Monte Carlo (MCMC) distribution\nwith a trainable sampler, such as a diffusion model. In GCD, the joint training\nof EBM and a diffusion model is formulated as a minimax problem, which reaches\nan equilibrium when both models converge to the data distribution. The minimax\nlearning with GCD bears interesting equivalence to inverse reinforcement\nlearning, where the energy corresponds to a negative reward, the diffusion\nmodel is a policy, and the real data is expert demonstrations. We present\npreliminary yet promising results showing that joint training is beneficial for\nboth EBM and a diffusion model. GCD enables EBM training without MCMC while\nimproving the sample quality of a diffusion model.\n","authors":["Sangwoong Yoon","Dohyun Kwon","Himchan Hwang","Yung-Kyun Noh","Frank C. Park"],"pdf_url":"https://arxiv.org/pdf/2312.03397v1.pdf","comment":"NeurIPS 2023 Workshop on Diffusion Models"},{"id":"http://arxiv.org/abs/2312.03395v1","updated":"2023-12-06T10:09:22Z","published":"2023-12-06T10:09:22Z","title":"Diffused Task-Agnostic Milestone Planner","summary":" Addressing decision-making problems using sequence modeling to predict future\ntrajectories shows promising results in recent years. In this paper, we take a\nstep further to leverage the sequence predictive method in wider areas such as\nlong-term planning, vision-based control, and multi-task decision-making. To\nthis end, we propose a method to utilize a diffusion-based generative sequence\nmodel to plan a series of milestones in a latent space and to have an agent to\nfollow the milestones to accomplish a given task. The proposed method can learn\ncontrol-relevant, low-dimensional latent representations of milestones, which\nmakes it possible to efficiently perform long-term planning and vision-based\ncontrol. Furthermore, our approach exploits generation flexibility of the\ndiffusion model, which makes it possible to plan diverse trajectories for\nmulti-task decision-making. We demonstrate the proposed method across offline\nreinforcement learning (RL) benchmarks and a visual manipulation environment.\nThe results show that our approach outperforms offline RL methods in solving\nlong-horizon, sparse-reward tasks and multi-task problems, while also achieving\nthe state-of-the-art performance on the most challenging vision-based\nmanipulation benchmark.\n","authors":["Mineui Hong","Minjae Kang","Songhwai Oh"],"pdf_url":"https://arxiv.org/pdf/2312.03395v1.pdf","comment":"37th Conference on Neural Information Processing Systems"},{"id":"http://arxiv.org/abs/2312.01210v2","updated":"2023-12-06T10:08:47Z","published":"2023-12-02T19:39:50Z","title":"When accurate prediction models yield harmful self-fulfilling prophecies","summary":" Prediction models are popular in medical research and practice. 
By predicting\nan outcome of interest for specific patients, these models may help inform\ndifficult treatment decisions, and are often hailed as the poster children for\npersonalized, data-driven healthcare.\n We show however, that using prediction models for decision making can lead to\nharmful decisions, even when the predictions exhibit good discrimination after\ndeployment. These models are harmful self-fulfilling prophecies: their\ndeployment harms a group of patients but the worse outcome of these patients\ndoes not invalidate the predictive power of the model. Our main result is a\nformal characterization of a set of such prediction models. Next we show that\nmodels that are well calibrated before and after deployment are useless for\ndecision making as they made no change in the data distribution. These results\npoint to the need to revise standard practices for validation, deployment and\nevaluation of prediction models that are used in medical decisions.\n","authors":["Wouter A. C. van Amsterdam","Nan van Geloven","Jesse H. Krijthe","Rajesh Ranganath","Giovanni Ciná"],"pdf_url":"https://arxiv.org/pdf/2312.01210v2.pdf","comment":"ML4H 2023 Findings Track"},{"id":"http://arxiv.org/abs/2305.17390v2","updated":"2023-12-06T10:07:01Z","published":"2023-05-27T07:04:15Z","title":"SwiftSage: A Generative Agent with Fast and Slow Thinking for Complex\n Interactive Tasks","summary":" We introduce SwiftSage, a novel agent framework inspired by the dual-process\ntheory of human cognition, designed to excel in action planning for complex\ninteractive reasoning tasks. SwiftSage integrates the strengths of behavior\ncloning and prompting large language models (LLMs) to enhance task completion\nperformance. The framework comprises two primary modules: the Swift module,\nrepresenting fast and intuitive thinking, and the Sage module, emulating\ndeliberate thought processes. The Swift module is a small encoder-decoder LM\nfine-tuned on the oracle agent's action trajectories, while the Sage module\nemploys LLMs such as GPT-4 for subgoal planning and grounding. We develop a\nheuristic method to harmoniously integrate the two modules, resulting in a more\nefficient and robust problem-solving process. In 30 tasks from the ScienceWorld\nbenchmark, SwiftSage significantly outperforms other methods such as SayCan,\nReAct, and Reflexion, demonstrating its effectiveness in solving complex\ninteractive tasks.\n","authors":["Bill Yuchen Lin","Yicheng Fu","Karina Yang","Faeze Brahman","Shiyu Huang","Chandra Bhagavatula","Prithviraj Ammanabrolu","Yejin Choi","Xiang Ren"],"pdf_url":"https://arxiv.org/pdf/2305.17390v2.pdf","comment":"Accepted to NeurIPS 2023 (spotlight). Project website:\n https://swiftsage.github.io"},{"id":"http://arxiv.org/abs/2312.02248v2","updated":"2023-12-06T10:05:42Z","published":"2023-12-04T16:05:45Z","title":"Towards early diagnosis of Alzheimer's disease: Advances in\n immune-related blood biomarkers and computational modeling approaches","summary":" Alzheimer's disease has an increasing prevalence in the population\nworld-wide, yet current diagnostic methods based on recommended biomarkers are\nonly available in specialized clinics. Due to these circumstances, Alzheimer's\ndisease is usually diagnosed late, which contrasts with the currently available\ntreatment options that are only effective for patients at an early stage.\nBlood-based biomarkers could fill in the gap of easily accessible and low-cost\nmethods for early diagnosis of the disease. 
In particular, immune-based\nblood-biomarkers might be a promising option, given the recently discovered\ncross-talk of immune cells of the central nervous system with those in the\nperipheral immune system. With the help of machine learning algorithms and\nmechanistic modeling approaches, such as agent-based modeling, an in-depth\nanalysis of the simulation of cell dynamics is possible as well as of\nhigh-dimensional omics resources indicative of pathway signaling changes. Here,\nwe give a background on advances in research on brain-immune system cross-talk\nin Alzheimer's disease and review recent machine learning and mechanistic\nmodeling approaches which leverage modern omics technologies for blood-based\nimmune system-related biomarker discovery.\n","authors":["Sophia Krix","Ella Wilczynski","Neus Falgàs","Raquel Sánchez-Valle","Eti Yoles","Uri Nevo","Kuti Baruch","Holger Fröhlich"],"pdf_url":"https://arxiv.org/pdf/2312.02248v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03386v1","updated":"2023-12-06T09:52:18Z","published":"2023-12-06T09:52:18Z","title":"An Infinite-Width Analysis on the Jacobian-Regularised Training of a\n Neural Network","summary":" The recent theoretical analysis of deep neural networks in their\ninfinite-width limits has deepened our understanding of initialisation, feature\nlearning, and training of those networks, and brought new practical techniques\nfor finding appropriate hyperparameters, learning network weights, and\nperforming inference. In this paper, we broaden this line of research by\nshowing that this infinite-width analysis can be extended to the Jacobian of a\ndeep neural network. We show that a multilayer perceptron (MLP) and its\nJacobian at initialisation jointly converge to a Gaussian process (GP) as the\nwidths of the MLP's hidden layers go to infinity and characterise this GP. We\nalso prove that in the infinite-width limit, the evolution of the MLP under the\nso-called robust training (i.e., training with a regulariser on the Jacobian)\nis described by a linear first-order ordinary differential equation that is\ndetermined by a variant of the Neural Tangent Kernel. We experimentally show\nthe relevance of our theoretical claims to wide finite networks, and\nempirically analyse the properties of kernel regression solution to obtain an\ninsight into Jacobian regularisation.\n","authors":["Taeyoung Kim","Hongseok Yang"],"pdf_url":"https://arxiv.org/pdf/2312.03386v1.pdf","comment":"72 pages, 21 figures"},{"id":"http://arxiv.org/abs/2306.01546v2","updated":"2023-12-06T09:43:41Z","published":"2023-06-02T13:50:30Z","title":"Publicly available datasets of breast histopathology H&E whole-slide\n images: A scoping review","summary":" Advancements in digital pathology and computing resources have made a\nsignificant impact in the field of computational pathology for breast cancer\ndiagnosis and treatment. However, access to high-quality labeled\nhistopathological images of breast cancer is a big challenge that limits the\ndevelopment of accurate and robust deep learning models. In this scoping\nreview, we identified the publicly available datasets of breast H&E stained\nwhole-slide images (WSI) that can be used to develop deep learning algorithms.\nWe systematically searched nine scientific literature databases and nine\nresearch data repositories and found 17 publicly available datasets containing\n10385 H&E WSIs of breast cancer. 
Moreover, we reported image metadata and\ncharacteristics for each dataset to assist researchers in selecting proper\ndatasets for specific tasks in breast cancer computational pathology. In\naddition, we compiled two lists of breast H&E patches and private datasets as\nsupplementary resources for researchers. Notably, only 28% of the included\narticles utilized multiple datasets, and only 14% used an external validation\nset, suggesting that the performance of other developed models may be\nsusceptible to overestimation. The TCGA-BRCA was used in 52% of the selected\nstudies. This dataset has a considerable selection bias that can impact the\nrobustness and generalizability of the trained algorithms. There is also a lack\nof consistent metadata reporting of breast WSI datasets that can be an issue in\ndeveloping accurate deep learning models, indicating the necessity of\nestablishing explicit guidelines for documenting breast WSI dataset\ncharacteristics and metadata.\n","authors":["Masoud Tafavvoghi","Lars Ailo Bongo","Nikita Shvetsov","Lill-Tove Rasmussen Busund","Kajsa Møllersen"],"pdf_url":"https://arxiv.org/pdf/2306.01546v2.pdf","comment":"27 pages (including references), 8 figures, 3 tables, 5 supporting\n information materials"},{"id":"http://arxiv.org/abs/2006.16144v3","updated":"2023-12-06T09:20:09Z","published":"2020-06-29T16:05:48Z","title":"Estimates on the generalization error of Physics Informed Neural\n Networks (PINNs) for approximating PDEs","summary":" Physics informed neural networks (PINNs) have recently been widely used for\nrobust and accurate approximation of PDEs. We provide rigorous upper bounds on\nthe generalization error of PINNs approximating solutions of the forward\nproblem for PDEs. An abstract formalism is introduced and stability properties\nof the underlying PDE are leveraged to derive an estimate for the\ngeneralization error in terms of the training error and number of training\nsamples. This abstract framework is illustrated with several examples of\nnonlinear PDEs. Numerical experiments, validating the proposed theory, are also\npresented.\n","authors":["Siddhartha Mishra","Roberto Molinaro"],"pdf_url":"https://arxiv.org/pdf/2006.16144v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2009.13291v3","updated":"2023-12-06T09:16:26Z","published":"2020-09-25T16:07:02Z","title":"Physics Informed Neural Networks for Simulating Radiative Transfer","summary":" We propose a novel machine learning algorithm for simulating radiative\ntransfer. Our algorithm is based on physics informed neural networks (PINNs),\nwhich are trained by minimizing the residual of the underlying radiative\ntranfer equations. We present extensive experiments and theoretical error\nestimates to demonstrate that PINNs provide a very easy to implement, fast,\nrobust and accurate method for simulating radiative transfer. We also present a\nPINN based algorithm for simulating inverse problems for radiative transfer\nefficiently.\n","authors":["Siddhartha Mishra","Roberto Molinaro"],"pdf_url":"https://arxiv.org/pdf/2009.13291v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2007.01138v3","updated":"2023-12-06T09:06:19Z","published":"2020-06-29T16:23:58Z","title":"Estimates on the generalization error of Physics Informed Neural\n Networks (PINNs) for approximating a class of inverse problems for PDEs","summary":" Physics informed neural networks (PINNs) have recently been very successfully\napplied for efficiently approximating inverse problems for PDEs. 
We focus on a\nparticular class of inverse problems, the so-called data assimilation or unique\ncontinuation problems, and prove rigorous estimates on the generalization error\nof PINNs approximating them. An abstract framework is presented and conditional\nstability estimates for the underlying inverse problem are employed to derive\nthe estimate on the PINN generalization error, providing rigorous justification\nfor the use of PINNs in this context. The abstract framework is illustrated\nwith examples of four prototypical linear PDEs. Numerical experiments,\nvalidating the proposed theory, are also presented.\n","authors":["Siddhartha Mishra","Roberto Molinaro"],"pdf_url":"https://arxiv.org/pdf/2007.01138v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.14468v3","updated":"2023-12-06T08:59:24Z","published":"2023-06-26T07:20:25Z","title":"A General Framework for Sequential Decision-Making under Adaptivity\n Constraints","summary":" We take the first step in studying general sequential decision-making under\ntwo adaptivity constraints: rare policy switch and batch learning. First, we\nprovide a general class called the Eluder Condition class, which includes a\nwide range of reinforcement learning classes. Then, for the rare policy switch\nconstraint, we provide a generic algorithm to achieve a\n$\\widetilde{\\mathcal{O}}(\\log K) $ switching cost with a\n$\\widetilde{\\mathcal{O}}(\\sqrt{K})$ regret on the EC class. For the batch\nlearning constraint, we provide an algorithm that provides a\n$\\widetilde{\\mathcal{O}}(\\sqrt{K}+K/B)$ regret with the number of batches $B.$\nThis paper is the first work considering rare policy switch and batch learning\nunder general function classes, which covers nearly all the models studied in\nthe previous works such as tabular MDP (Bai et al. 2019; Zhang et al. 2020),\nlinear MDP (Wang et al. 2021; Gao et al. 2021), low eluder dimension MDP (Kong\net al. 2021; Gao et al. 2021), generalized linear function approximation (Qiao\net al. 2023), and also some new classes such as the low $D_\\Delta$-type Bellman\neluder dimension problem, linear mixture MDP, kernelized nonlinear regulator\nand undercomplete partially observed Markov decision process (POMDP).\n","authors":["Nuoya Xiong","Zhaoran Wang","Zhuoran Yang"],"pdf_url":"https://arxiv.org/pdf/2306.14468v3.pdf","comment":"48 pages"},{"id":"http://arxiv.org/abs/2206.08225v3","updated":"2023-12-06T08:57:52Z","published":"2022-06-16T14:51:28Z","title":"All the World's a (Hyper)Graph: A Data Drama","summary":" We introduce Hyperbard, a dataset of diverse relational data representations\nderived from Shakespeare's plays. Our representations range from simple graphs\ncapturing character co-occurrence in single scenes to hypergraphs encoding\ncomplex communication settings and character contributions as hyperedges with\nedge-specific node weights. By making multiple intuitive representations\nreadily available for experimentation, we facilitate rigorous representation\nrobustness checks in graph learning, graph mining, and network analysis,\nhighlighting the advantages and drawbacks of specific representations.\nLeveraging the data released in Hyperbard, we demonstrate that many solutions\nto popular graph mining problems are highly dependent on the representation\nchoice, thus calling current graph curation practices into question. 
As an\nhomage to our data source, and asserting that science can also be art, we\npresent all our points in the form of a play.\n","authors":["Corinna Coupette","Jilles Vreeken","Bastian Rieck"],"pdf_url":"https://arxiv.org/pdf/2206.08225v3.pdf","comment":"This is the full version of our paper; an abridged version appears in\n Digital Scholarship in the Humanities. Landing page for code and data:\n https://hyperbard.net/"},{"id":"http://arxiv.org/abs/2312.03360v1","updated":"2023-12-06T08:55:55Z","published":"2023-12-06T08:55:55Z","title":"Teaching Specific Scientific Knowledge into Large Language Models\n through Additional Training","summary":" Through additional training, we explore embedding specialized scientific\nknowledge into the Llama 2 Large Language Model (LLM). Key findings reveal that\neffective knowledge integration requires reading texts from multiple\nperspectives, especially in instructional formats. We utilize text augmentation\nto tackle the scarcity of specialized texts, including style conversions and\ntranslations. Hyperparameter optimization proves crucial, with different size\nmodels (7b, 13b, and 70b) reasonably undergoing additional training. Validating\nour methods, we construct a dataset of 65,000 scientific papers. Although we\nhave succeeded in partially embedding knowledge, the study highlights the\ncomplexities and limitations of incorporating specialized information into\nLLMs, suggesting areas for further improvement.\n","authors":["Kan Hatakeyama-Sato","Yasuhiko Igarashi","Shun Katakami","Yuta Nabae","Teruaki Hayakawa"],"pdf_url":"https://arxiv.org/pdf/2312.03360v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2304.14176v2","updated":"2023-12-06T08:55:05Z","published":"2023-04-27T13:25:34Z","title":"Exploring the flavor structure of quarks and leptons with reinforcement\n learning","summary":" We propose a method to explore the flavor structure of quarks and leptons\nwith reinforcement learning. As a concrete model, we utilize a basic\nvalue-based algorithm for models with $U(1)$ flavor symmetry. By training\nneural networks on the $U(1)$ charges of quarks and leptons, the agent finds 21\nmodels to be consistent with experimentally measured masses and mixing angles\nof quarks and leptons. In particular, an intrinsic value of normal ordering\ntends to be larger than that of inverted ordering, and the normal ordering is\nwell fitted with the current experimental data in contrast to the inverted\nordering. A specific value of effective mass for the neutrinoless double beta\ndecay and a sizable leptonic CP violation induced by an angular component of\nflavon field are predicted by autonomous behavior of the agent. Our finding\nresults indicate that the reinforcement learning can be a new method for\nunderstanding the flavor structure.\n","authors":["Satsuki Nishimura","Coh Miyao","Hajime Otsuka"],"pdf_url":"https://arxiv.org/pdf/2304.14176v2.pdf","comment":"45 pages, 15 figures, v2: published version"},{"id":"http://arxiv.org/abs/2312.03351v1","updated":"2023-12-06T08:50:01Z","published":"2023-12-06T08:50:01Z","title":"On the variants of SVM methods applied to GPR data to classify tack coat\n characteristics in French pavements: two experimental case studies","summary":" Among the commonly used non-destructive techniques, the Ground Penetrating\nRadar (GPR) is one of the most widely adopted today for assessing pavement\nconditions in France. 
However, conventional radar systems and their forward\nprocessing methods have shown their limitations for the physical and\ngeometrical characterization of very thin layers such as tack coats. However,\nthe use of Machine Learning methods applied to GPR with an inverse approach\nshowed that it was numerically possible to identify the tack coat\ncharacteristics despite masking effects due to low timefrequency resolution\nnoted in the raw B-scans. Thus, we propose in this paper to apply the inverse\napproach based on Machine Learning, already validated in previous works on\nnumerical data, on two experimental cases with different pavement structures.\nThe first case corresponds to a validation on known pavement structures on the\nGustave Eiffel University (Nantes, France) with its pavement fatigue carousel\nand the second case focuses on a new real road in Vend{\\'e}e department\n(France). In both case studies, the performances of SVM/SVR methods showed the\nefficiency of supervised learning methods to classify and estimate the emulsion\nproportioning in the tack coats.\n","authors":["Grégory Andreoli","Amine Ihamouten","Mai Lan Nguyen","Yannick Fargier","Cyrille Fauchard","Jean-Michel Simonin","Viktoriia Buliuk","David Souriou","Xavier Dérobert"],"pdf_url":"https://arxiv.org/pdf/2312.03351v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03344v1","updated":"2023-12-06T08:36:23Z","published":"2023-12-06T08:36:23Z","title":"Interpretable Mechanistic Representations for Meal-level Glycemic\n Control in the Wild","summary":" Diabetes encompasses a complex landscape of glycemic control that varies\nwidely among individuals. However, current methods do not faithfully capture\nthis variability at the meal level. On the one hand, expert-crafted features\nlack the flexibility of data-driven methods; on the other hand, learned\nrepresentations tend to be uninterpretable which hampers clinical adoption. In\nthis paper, we propose a hybrid variational autoencoder to learn interpretable\nrepresentations of CGM and meal data. Our method grounds the latent space to\nthe inputs of a mechanistic differential equation, producing embeddings that\nreflect physiological quantities, such as insulin sensitivity, glucose\neffectiveness, and basal glucose levels. Moreover, we introduce a novel method\nto infer the glucose appearance rate, making the mechanistic model robust to\nunreliable meal logs. On a dataset of CGM and self-reported meals from\nindividuals with type-2 diabetes and pre-diabetes, our unsupervised\nrepresentation discovers a separation between individuals proportional to their\ndisease severity. Our embeddings produce clusters that are up to 4x better than\nnaive, expert, black-box, and pure mechanistic features. Our method provides a\nnuanced, yet interpretable, embedding space to compare glycemic control within\nand across individuals, directly learnable from in-the-wild data.\n","authors":["Ke Alexander Wang","Emily B. Fox"],"pdf_url":"https://arxiv.org/pdf/2312.03344v1.pdf","comment":"Proceedings of Machine Learning for Health (ML4H) 2023. Code\n available at: https://github.com/KeAWang/interpretable-cgm-representations"},{"id":"http://arxiv.org/abs/2212.07624v3","updated":"2023-12-06T08:22:39Z","published":"2022-12-15T05:54:16Z","title":"Neuroevolution of Physics-Informed Neural Nets: Benchmark Problems and\n Comparative Results","summary":" The potential of learned models for fundamental scientific research and\ndiscovery is drawing increasing attention worldwide. 
Physics-informed neural\nnetworks (PINNs), where the loss function directly embeds governing equations\nof scientific phenomena, are among the key techniques at the forefront of\nrecent advances. PINNs are typically trained using stochastic gradient descent\nmethods, akin to their deep learning counterparts. However, analysis in this\npaper shows that PINNs' unique loss formulations lead to a high degree of\ncomplexity and ruggedness that may not be conducive to gradient descent.\nUnlike in standard deep learning, PINN training requires globally optimum\nparameter values that satisfy physical laws as closely as possible. Spurious\nlocal optima, indicative of erroneous physics, must be avoided. Hence,\nneuroevolution algorithms, with their superior global search capacity, may be a\nbetter choice for PINNs relative to gradient descent methods. Here, we propose\na set of five benchmark problems, with open-source codes, spanning diverse\nphysical phenomena for novel neuroevolution algorithm development. Using this,\nwe compare two neuroevolution algorithms against the commonly used stochastic\ngradient descent, and our baseline results support the claim that\nneuroevolution can surpass gradient descent, ensuring better physics compliance\nin the predicted outputs. Furthermore, implementing neuroevolution with JAX\nleads to orders of magnitude speedup relative to standard implementations.\n","authors":["Nicholas Sung Wei Yong","Jian Cheng Wong","Pao-Hsiung Chiu","Abhishek Gupta","Chinchun Ooi","Yew-Soon Ong"],"pdf_url":"https://arxiv.org/pdf/2212.07624v3.pdf","comment":"11 pages, 6 figures, 4 tables"},{"id":"http://arxiv.org/abs/2306.08289v2","updated":"2023-12-06T07:48:15Z","published":"2023-06-14T06:52:07Z","title":"$\textbf{A}^2\textbf{CiD}^2$: Accelerating Asynchronous Communication in\n Decentralized Deep Learning","summary":" Distributed training of Deep Learning models has been critical to many recent\nsuccesses in the field. Current standard methods primarily rely on synchronous\ncentralized algorithms which induce major communication bottlenecks and\nsynchronization locks at scale. Decentralized asynchronous algorithms are\nemerging as a potential alternative but their practical applicability still\nlags. In order to mitigate the increase in communication cost that naturally\ncomes with scaling the number of workers, we introduce a principled\nasynchronous, randomized, gossip-based optimization algorithm which works\nthanks to a continuous local momentum named $\textbf{A}^2\textbf{CiD}^2$. Our\nmethod allows each worker to continuously process mini-batches without\nstopping, and run a peer-to-peer averaging routine in parallel, reducing idle\ntime. In addition to inducing a significant communication acceleration at no\ncost other than adding a local momentum variable, minimal adaptation is\nrequired to incorporate $\textbf{A}^2\textbf{CiD}^2$ into standard asynchronous\napproaches. Our theoretical analysis proves accelerated rates compared to\nprevious asynchronous decentralized baselines and we empirically show that\nusing our $\textbf{A}^2\textbf{CiD}^2$ momentum significantly decreases\ncommunication costs in poorly connected networks. 
In particular, we show\nconsistent improvement on the ImageNet dataset using up to 64 asynchronous\nworkers (A100 GPUs) and various communication network topologies.\n","authors":["Adel Nabli","Eugene Belilovsky","Edouard Oyallon"],"pdf_url":"https://arxiv.org/pdf/2306.08289v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03330v1","updated":"2023-12-06T07:38:46Z","published":"2023-12-06T07:38:46Z","title":"Measuring Misogyny in Natural Language Generation: Preliminary Results\n from a Case Study on two Reddit Communities","summary":" Generic `toxicity' classifiers continue to be used for evaluating the\npotential for harm in natural language generation, despite mounting evidence of\ntheir shortcomings. We consider the challenge of measuring misogyny in natural\nlanguage generation, and argue that generic `toxicity' classifiers are\ninadequate for this task. We use data from two well-characterised `Incel'\ncommunities on Reddit that differ primarily in their degrees of misogyny to\nconstruct a pair of training corpora which we use to fine-tune two language\nmodels. We show that an open source `toxicity' classifier is unable to\ndistinguish meaningfully between generations from these models. We contrast\nthis with a misogyny-specific lexicon recently proposed by feminist\nsubject-matter experts, demonstrating that, despite the limitations of simple\nlexicon-based approaches, this shows promise as a benchmark to evaluate\nlanguage models for misogyny, and that it is sensitive enough to reveal the\nknown differences in these Reddit communities. Our preliminary findings\nhighlight the limitations of a generic approach to evaluating harms, and\nfurther emphasise the need for careful benchmark design and selection in\nnatural language evaluation.\n","authors":["Aaron J. Snoswell","Lucinda Nelson","Hao Xue","Flora D. Salim","Nicolas Suzor","Jean Burgess"],"pdf_url":"https://arxiv.org/pdf/2312.03330v1.pdf","comment":"This extended abstract was presented at the Generation, Evaluation\n and Metrics workshop at Empirical Methods in Natural Language Processing in\n 2023 (GEM@EMNLP 2023) in Singapore"},{"id":"http://arxiv.org/abs/2310.19786v3","updated":"2023-12-06T07:34:24Z","published":"2023-10-30T17:50:29Z","title":"From External to Swap Regret 2.0: An Efficient Reduction and Oblivious\n Adversary for Large Action Spaces","summary":" We provide a novel reduction from swap-regret minimization to external-regret\nminimization, which improves upon the classical reductions of Blum-Mansour\n[BM07] and Stolz-Lugosi [SL05] in that it does not require finiteness of the\nspace of actions. We show that, whenever there exists a no-external-regret\nalgorithm for some hypothesis class, there must also exist a no-swap-regret\nalgorithm for that same class. For the problem of learning with expert advice,\nour result implies that it is possible to guarantee that the swap regret is\nbounded by {\\epsilon} after $\\log(N)^{O(1/\\epsilon)}$ rounds and with $O(N)$\nper iteration complexity, where $N$ is the number of experts, while the\nclassical reductions of Blum-Mansour and Stolz-Lugosi require $O(N/\\epsilon^2)$\nrounds and at least $\\Omega(N^2)$ per iteration complexity. 
Our result comes\nwith an associated lower bound, which -- in contrast to that in [BM07] -- holds\nfor oblivious and $\\ell_1$-constrained adversaries and learners that can employ\ndistributions over experts, showing that the number of rounds must be\n$\\tilde\\Omega(N/\\epsilon^2)$ or exponential in $1/\\epsilon$.\n Our reduction implies that, if no-regret learning is possible in some game,\nthen this game must have approximate correlated equilibria, of arbitrarily good\napproximation. This strengthens the folklore implication of no-regret learning\nthat approximate coarse correlated equilibria exist. Importantly, it provides a\nsufficient condition for the existence of correlated equilibrium which vastly\nextends the requirement that the action set is finite, thus answering a\nquestion left open by [DG22; Ass+23]. Moreover, it answers several outstanding\nquestions about equilibrium computation and learning in games.\n","authors":["Yuval Dagan","Constantinos Daskalakis","Maxwell Fishelson","Noah Golowich"],"pdf_url":"https://arxiv.org/pdf/2310.19786v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03328v1","updated":"2023-12-06T07:33:22Z","published":"2023-12-06T07:33:22Z","title":"Deep Learning for Koopman-based Dynamic Movement Primitives","summary":" The challenge of teaching robots to perform dexterous manipulation, dynamic\nlocomotion, or whole--body manipulation from a small number of demonstrations\nis an important research field that has attracted interest from across the\nrobotics community. In this work, we propose a novel approach by joining the\ntheories of Koopman Operators and Dynamic Movement Primitives to Learning from\nDemonstration. Our approach, named \\gls{admd}, projects nonlinear dynamical\nsystems into linear latent spaces such that a solution reproduces the desired\ncomplex motion. Use of an autoencoder in our approach enables generalizability\nand scalability, while the constraint to a linear system attains\ninterpretability. Our results are comparable to the Extended Dynamic Mode\nDecomposition on the LASA Handwriting dataset but with training on only a small\nfractions of the letters.\n","authors":["Tyler Han","Carl Glen Henshaw"],"pdf_url":"https://arxiv.org/pdf/2312.03328v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03325v1","updated":"2023-12-06T07:26:02Z","published":"2023-12-06T07:26:02Z","title":"GCFA:Geodesic Curve Feature Augmentation via Shape Space Theory","summary":" Deep learning has yielded remarkable outcomes in various domains. However,\nthe challenge of requiring large-scale labeled samples still persists in deep\nlearning. Thus, data augmentation has been introduced as a critical strategy to\ntrain deep learning models. However, data augmentation suffers from information\nloss and poor performance in small sample environments. To overcome these\ndrawbacks, we propose a feature augmentation method based on shape space\ntheory, i.e., Geodesic curve feature augmentation, called GCFA in brevity.\nFirst, we extract features from the image with the neural network model. Then,\nthe multiple image features are projected into a pre-shape space as features.\nIn the pre-shape space, a Geodesic curve is built to fit the features. Finally,\nthe many generated features on the Geodesic curve are used to train the various\nmachine learning models. The GCFA module can be seamlessly integrated with most\nmachine learning methods. And the proposed method is simple, effective and\ninsensitive for the small sample datasets. 
Several examples demonstrate that\nthe GCFA method can greatly improve the performance of the data preprocessing\nmodel in a small sample environment.\n","authors":["Yuexing Han","Guanxin Wan","Bing Wang"],"pdf_url":"https://arxiv.org/pdf/2312.03325v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03318v1","updated":"2023-12-06T07:02:22Z","published":"2023-12-06T07:02:22Z","title":"Complementary Benefits of Contrastive Learning and Self-Training Under\n Distribution Shift","summary":" Self-training and contrastive learning have emerged as leading techniques for\nincorporating unlabeled data, both under distribution shift (unsupervised\ndomain adaptation) and when it is absent (semi-supervised learning). However,\ndespite the popularity and compatibility of these techniques, their efficacy in\ncombination remains unexplored. In this paper, we undertake a systematic\nempirical investigation of this combination, finding that (i) in domain\nadaptation settings, self-training and contrastive learning offer significant\ncomplementary gains; and (ii) in semi-supervised learning settings,\nsurprisingly, the benefits are not synergistic. Across eight distribution shift\ndatasets (e.g., BREEDs, WILDS), we demonstrate that the combined method obtains\n3--8% higher accuracy than either approach independently. We then theoretically\nanalyze these techniques in a simplified model of distribution shift,\ndemonstrating scenarios under which the features produced by contrastive\nlearning can yield a good initialization for self-training to further amplify\ngains and achieve optimal performance, even when either method alone would\nfail.\n","authors":["Saurabh Garg","Amrith Setlur","Zachary Chase Lipton","Sivaraman Balakrishnan","Virginia Smith","Aditi Raghunathan"],"pdf_url":"https://arxiv.org/pdf/2312.03318v1.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2112.12909v3","updated":"2023-12-06T06:52:09Z","published":"2021-12-24T02:13:04Z","title":"Optimal Variable Clustering for High-Dimensional Matrix Valued Data","summary":" Matrix valued data has become increasingly prevalent in many applications.\nMost of the existing clustering methods for this type of data are tailored to\nthe mean model and do not account for the dependence structure of the features,\nwhich can be very informative, especially in high-dimensional settings or when\nmean information is not available. To extract the information from the\ndependence structure for clustering, we propose a new latent variable model for\nthe features arranged in matrix form, with some unknown membership matrices\nrepresenting the clusters for the rows and columns. Under this model, we\nfurther propose a class of hierarchical clustering algorithms using the\ndifference of a weighted covariance matrix as the dissimilarity measure.\nTheoretically, we show that under mild conditions, our algorithm attains\nclustering consistency in the high-dimensional setting. While this consistency\nresult holds for our algorithm with a broad class of weighted covariance\nmatrices, the conditions for this result depend on the choice of the weight. To\ninvestigate how the weight affects the theoretical performance of our\nalgorithm, we establish the minimax lower bound for clustering under our latent\nvariable model in terms of some cluster separation metric. Given these results,\nwe identify the optimal weight in the sense that using this weight guarantees\nour algorithm to be minimax rate-optimal. 
The practical implementation of our\nalgorithm with the optimal weight is also discussed. Simulation studies show\nthat our algorithm performs better than existing methods in terms of the\nadjusted Rand index (ARI). The method is applied to a genomic dataset and\nyields meaningful interpretations.\n","authors":["Inbeom Lee","Siyi Deng","Yang Ning"],"pdf_url":"https://arxiv.org/pdf/2112.12909v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.08274v2","updated":"2023-12-06T06:38:01Z","published":"2023-06-14T06:24:58Z","title":"A Simple and Scalable Graph Neural Network for Large Directed Graphs","summary":" Node classification is one of the hottest tasks in graph analysis. Though\nexisting studies have explored various node representations in directed and\nundirected graphs, they have overlooked the distinctions of their capabilities\nto capture the information of graphs. To tackle the limitation, we investigate\nvarious combinations of node representations (aggregated features vs. adjacency\nlists) and edge direction awareness within an input graph (directed vs.\nundirected). We address the first empirical study to benchmark the performance\nof various GNNs that use either combination of node representations and edge\ndirection awareness. Our experiments demonstrate that no single combination\nstably achieves state-of-the-art results across datasets, which indicates that\nwe need to select appropriate combinations depending on the dataset\ncharacteristics. In response, we propose a simple yet holistic classification\nmethod A2DUG which leverages all combinations of node representations in\ndirected and undirected graphs. We demonstrate that A2DUG stably performs well\non various datasets and improves the accuracy up to 11.29 compared with the\nstate-of-the-art methods. To spur the development of new methods, we publicly\nrelease our complete codebase under the MIT license.\n","authors":["Seiji Maekawa","Yuya Sasaki","Makoto Onizuka"],"pdf_url":"https://arxiv.org/pdf/2306.08274v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03311v1","updated":"2023-12-06T06:33:25Z","published":"2023-12-06T06:33:25Z","title":"On the Nystrom Approximation for Preconditioning in Kernel Machines","summary":" Kernel methods are a popular class of nonlinear predictive models in machine\nlearning. Scalable algorithms for learning kernel models need to be iterative\nin nature, but convergence can be slow due to poor conditioning. Spectral\npreconditioning is an important tool to speed-up the convergence of such\niterative algorithms for training kernel models. However computing and storing\na spectral preconditioner can be expensive which can lead to large\ncomputational and storage overheads, precluding the application of kernel\nmethods to problems with large datasets. A Nystrom approximation of the\nspectral preconditioner is often cheaper to compute and store, and has\ndemonstrated success in practical applications. In this paper we analyze the\ntrade-offs of using such an approximated preconditioner. 
Specifically, we show\nthat a sample of logarithmic size (as a function of the size of the dataset)\nenables the Nystrom-based approximated preconditioner to accelerate gradient\ndescent nearly as well as the exact preconditioner, while also reducing the\ncomputational and storage overheads.\n","authors":["Amirhesam Abedsoltan","Mikhail Belkin","Parthe Pandit","Luis Rademacher"],"pdf_url":"https://arxiv.org/pdf/2312.03311v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03309v1","updated":"2023-12-06T06:27:27Z","published":"2023-12-06T06:27:27Z","title":"Benchmarking Continual Learning from Cognitive Perspectives","summary":" Continual learning addresses the problem of continuously acquiring and\ntransferring knowledge without catastrophic forgetting of old concepts. While\nhumans achieve continual learning via diverse neurocognitive mechanisms, there\nis a mismatch between cognitive properties and evaluation methods of continual\nlearning models. First, the measurement of continual learning models mostly\nrelies on evaluation metrics at a micro-level, which cannot characterize\ncognitive capacities of the model. Second, the measurement is method-specific,\nemphasizing model strengths in one aspect while obscuring potential weaknesses\nin other respects. To address these issues, we propose to integrate model\ncognitive capacities and evaluation metrics into a unified evaluation paradigm.\nWe first characterize model capacities via desiderata derived from cognitive\nproperties supporting human continual learning. The desiderata concern (1)\nadaptability in varying lengths of task sequence; (2) sensitivity to dynamic\ntask variations; and (3) efficiency in memory usage and training time\nconsumption. Then we design evaluation protocols for each desideratum to assess\ncognitive capacities of recent continual learning models. Experimental results\nshow that no method we consider has satisfied all the desiderata and is still\nfar away from realizing truly continual learning. Although some methods exhibit\nsome degree of adaptability and efficiency, no method is able to identify task\nrelationships when encountering dynamic task variations, or achieve a trade-off\nin learning similarities and differences between tasks. Inspired by these\nresults, we discuss possible factors that influence model performance in these\ndesiderata and provide guidance for the improvement of continual learning\nmodels.\n","authors":["Xiaoqian Liu","Junge Zhang","Mingyi Zhang","Peipei Yang"],"pdf_url":"https://arxiv.org/pdf/2312.03309v1.pdf","comment":"12 pages, 11 figures"},{"id":"http://arxiv.org/abs/2312.03307v1","updated":"2023-12-06T06:15:48Z","published":"2023-12-06T06:15:48Z","title":"Balanced Marginal and Joint Distributional Learning via Mixture\n Cramer-Wold Distance","summary":" In the process of training a generative model, it becomes essential to\nmeasure the discrepancy between two high-dimensional probability distributions:\nthe generative distribution and the ground-truth distribution of the observed\ndataset. Recently, there has been growing interest in an approach that involves\nslicing high-dimensional distributions, with the Cramer-Wold distance emerging\nas a promising method. However, we have identified that the Cramer-Wold\ndistance primarily focuses on joint distributional learning, whereas\nunderstanding marginal distributional patterns is crucial for effective\nsynthetic data generation. In this paper, we introduce a novel measure of\ndissimilarity, the mixture Cramer-Wold distance. 
This measure enables us to\ncapture both marginal and joint distributional information simultaneously, as\nit incorporates a mixture measure with point masses on standard basis vectors.\nBuilding upon the mixture Cramer-Wold distance, we propose a new generative\nmodel called CWDAE (Cramer-Wold Distributional AutoEncoder), which shows\nremarkable performance in generating synthetic data when applied to real\ntabular datasets. Furthermore, our model offers the flexibility to adjust the\nlevel of data privacy with ease.\n","authors":["Seunghwan An","Sungchul Hong","Jong-June Jeon"],"pdf_url":"https://arxiv.org/pdf/2312.03307v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03303v1","updated":"2023-12-06T06:07:50Z","published":"2023-12-06T06:07:50Z","title":"Dyport: Dynamic Importance-based Hypothesis Generation Benchmarking\n Technique","summary":" This paper presents a novel benchmarking framework Dyport for evaluating\nbiomedical hypothesis generation systems. Utilizing curated datasets, our\napproach tests these systems under realistic conditions, enhancing the\nrelevance of our evaluations. We integrate knowledge from the curated databases\ninto a dynamic graph, accompanied by a method to quantify discovery importance.\nThis not only assesses hypothesis accuracy but also their potential impact in\nbiomedical research which significantly extends traditional link prediction\nbenchmarks. Applicability of our benchmarking process is demonstrated on\nseveral link prediction systems applied on biomedical semantic knowledge\ngraphs. Being flexible, our benchmarking system is designed for broad\napplication in hypothesis generation quality verification, aiming to expand the\nscope of scientific discovery within the biomedical research community.\nAvailability and implementation: Dyport framework is fully open-source. All\ncode and datasets are available at: https://github.com/IlyaTyagin/Dyport\n","authors":["Ilya Tyagin","Ilya Safro"],"pdf_url":"https://arxiv.org/pdf/2312.03303v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.00845v4","updated":"2023-12-06T05:49:55Z","published":"2023-02-02T03:15:29Z","title":"CD-GraB: Coordinating Distributed Example Orders for Provably\n Accelerated Training","summary":" Recent research on online Gradient Balancing (GraB) has revealed that there\nexist permutation-based example orderings for SGD that are guaranteed to\noutperform random reshuffling (RR). Whereas RR arbitrarily permutes training\nexamples, GraB leverages stale gradients from prior epochs to order examples --\nachieving a provably faster convergence rate than RR. However, GraB is limited\nby design: while it demonstrates an impressive ability to scale-up training on\ncentralized data, it does not naturally extend to modern distributed ML\nworkloads. We therefore propose Coordinated Distributed GraB (CD-GraB), which\nuses insights from prior work on kernel thinning to translate the benefits of\nprovably faster permutation-based example ordering to distributed settings.\nWith negligible overhead, CD-GraB exhibits a linear speedup in convergence rate\nover centralized GraB and outperforms distributed RR on a variety of benchmark\ntasks.\n","authors":["A. Feder Cooper","Wentao Guo","Khiem Pham","Tiancheng Yuan","Charlie F. 
Ruan","Yucheng Lu","Christopher De Sa"],"pdf_url":"https://arxiv.org/pdf/2302.00845v4.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.03296v1","updated":"2023-12-06T05:36:52Z","published":"2023-12-06T05:36:52Z","title":"Cooperative Probabilistic Trajectory Forecasting under Occlusion","summary":" Perception and planning under occlusion is essential for safety-critical\ntasks. Occlusion-aware planning often requires communicating the information of\nthe occluded object to the ego agent for safe navigation. However,\ncommunicating rich sensor information under adverse conditions during\ncommunication loss and limited bandwidth may not be always feasible. Further,\nin GPS denied environments and indoor navigation, localizing and sharing of\noccluded objects can be challenging. To overcome this, relative pose estimation\nbetween connected agents sharing a common field of view can be a\ncomputationally effective way of communicating information about surrounding\nobjects. In this paper, we design an end-to-end network that cooperatively\nestimates the current states of occluded pedestrian in the reference frame of\nego agent and then predicts the trajectory with safety guarantees.\nExperimentally, we show that the uncertainty-aware trajectory prediction of\noccluded pedestrian by the ego agent is almost similar to the ground truth\ntrajectory assuming no occlusion. The current research holds promise for\nuncertainty-aware navigation among multiple connected agents under occlusion.\n","authors":["Anshul Nayak","Azim Eskandarian"],"pdf_url":"https://arxiv.org/pdf/2312.03296v1.pdf","comment":"10 pages, 13 figures, 1 table"},{"id":"http://arxiv.org/abs/2311.13750v2","updated":"2023-12-06T05:20:16Z","published":"2023-11-23T00:53:11Z","title":"Towards Transferable Multi-modal Perception Representation Learning for\n Autonomy: NeRF-Supervised Masked AutoEncoder","summary":" This work proposes a unified self-supervised pre-training framework for\ntransferable multi-modal perception representation learning via masked\nmulti-modal reconstruction in Neural Radiance Field (NeRF), namely\nNeRF-Supervised Masked AutoEncoder (NS-MAE). Specifically, conditioned on\ncertain view directions and locations, multi-modal embeddings extracted from\ncorrupted multi-modal input signals, i.e., Lidar point clouds and images, are\nrendered into projected multi-modal feature maps via neural rendering. Then,\noriginal multi-modal signals serve as reconstruction targets for the rendered\nmulti-modal feature maps to enable self-supervised representation learning.\nExtensive experiments show that the representation learned via NS-MAE shows\npromising transferability for diverse multi-modal and single-modal (camera-only\nand Lidar-only) perception models on diverse 3D perception downstream tasks (3D\nobject detection and BEV map segmentation) with diverse amounts of fine-tuning\nlabeled data. Moreover, we empirically find that NS-MAE enjoys the synergy of\nboth the mechanism of masked autoencoder and neural radiance field. 
We hope\nthis study can inspire exploration of more general multi-modal representation\nlearning for autonomous agents.\n","authors":["Xiaohao Xu"],"pdf_url":"https://arxiv.org/pdf/2311.13750v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02125v2","updated":"2023-12-06T05:19:11Z","published":"2023-12-04T18:52:26Z","title":"TPPoet: Transformer-Based Persian Poem Generation using Minimal Data and\n Advanced Decoding Techniques","summary":" Recent advances in language models (LMs), have demonstrated significant\nefficacy in tasks related to the arts and humanities. While LMs have exhibited\nexceptional performance across a wide range of natural language processing\ntasks, there are notable challenges associated with their utilization on small\ndatasets and their ability to replicate more creative human capacities. In this\nstudy, we aim to address these challenges by training a Persian classical\npoetry generation model using a transformer architecture on a specialized\ndataset with no pretraining. Additionally, we propose a novel decoding method\nto enhance coherence and meaningfulness in the generated poetry, effectively\nmanaging the tradeoff between diversity and quality. Furthermore, the results\nof our training approach and the proposed decoding method are evaluated through\ncomprehensive set of automatic and human evaluations and showed its superior\ncapability to generate coherent and meaningful poetry in compare to other\ndecoding methods and an existing Persian large language model (LLM).\n","authors":["Amir Panahandeh","Hanie Asemi","Esmaeil Nourani"],"pdf_url":"https://arxiv.org/pdf/2312.02125v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02168v2","updated":"2023-12-06T05:16:37Z","published":"2023-10-30T15:38:31Z","title":"The SVHN Dataset Is Deceptive for Probabilistic Generative Models Due to\n a Distribution Mismatch","summary":" The Street View House Numbers (SVHN) dataset is a popular benchmark dataset\nin deep learning. Originally designed for digit classification tasks, the SVHN\ndataset has been widely used as a benchmark for various other tasks including\ngenerative modeling. However, with this work, we aim to warn the community\nabout an issue of the SVHN dataset as a benchmark for generative modeling\ntasks: we discover that the official split into training set and test set of\nthe SVHN dataset are not drawn from the same distribution. We empirically show\nthat this distribution mismatch has little impact on the classification task\n(which may explain why this issue has not been detected before), but it\nseverely affects the evaluation of probabilistic generative models, such as\nVariational Autoencoders and diffusion models. As a workaround, we propose to\nmix and re-split the official training and test set when SVHN is used for tasks\nother than classification. We publish a new split and the indices we used to\ncreate it at https://jzenn.github.io/svhn-remix/ .\n","authors":["Tim Z. 
Xiao","Johannes Zenn","Robert Bamler"],"pdf_url":"https://arxiv.org/pdf/2312.02168v2.pdf","comment":"Accepted at NeurIPS 2023 Workshop on Distribution Shifts; 4 pages +\n appendix; proposed data set at https://jzenn.github.io/svhn-remix/"},{"id":"http://arxiv.org/abs/2312.03292v1","updated":"2023-12-06T05:02:10Z","published":"2023-12-06T05:02:10Z","title":"Enhancing Molecular Property Prediction via Mixture of Collaborative\n Experts","summary":" Molecular Property Prediction (MPP) task involves predicting biochemical\nproperties based on molecular features, such as molecular graph structures,\ncontributing to the discovery of lead compounds in drug development. To address\ndata scarcity and imbalance in MPP, some studies have adopted Graph Neural\nNetworks (GNN) as an encoder to extract commonalities from molecular graphs.\nHowever, these approaches often use a separate predictor for each task,\nneglecting the shared characteristics among predictors corresponding to\ndifferent tasks. In response to this limitation, we introduce the GNN-MoCE\narchitecture. It employs the Mixture of Collaborative Experts (MoCE) as\npredictors, exploiting task commonalities while confronting the homogeneity\nissue in the expert pool and the decision dominance dilemma within the expert\ngroup. To enhance expert diversity for collaboration among all experts, the\nExpert-Specific Projection method is proposed to assign a unique projection\nperspective to each expert. To balance decision-making influence for\ncollaboration within the expert group, the Expert-Specific Loss is presented to\nintegrate individual expert loss into the weighted decision loss of the group\nfor more equitable training. Benefiting from the enhancements of MoCE in expert\ncreation, dynamic expert group formation, and experts' collaboration, our model\ndemonstrates superior performance over traditional methods on 24 MPP datasets,\nespecially in tasks with limited data or high imbalance.\n","authors":["Xu Yao","Shuang Liang","Songqiao Han","Hailiang Huang"],"pdf_url":"https://arxiv.org/pdf/2312.03292v1.pdf","comment":"11 pages, 8 figures"},{"id":"http://arxiv.org/abs/2312.03291v1","updated":"2023-12-06T04:53:12Z","published":"2023-12-06T04:53:12Z","title":"OMNIINPUT: A Model-centric Evaluation Framework through Output\n Distribution","summary":" We propose a novel model-centric evaluation framework, OmniInput, to evaluate\nthe quality of an AI/ML model's predictions on all possible inputs (including\nhuman-unrecognizable ones), which is crucial for AI safety and reliability.\nUnlike traditional data-centric evaluation based on pre-defined test sets, the\ntest set in OmniInput is self-constructed by the model itself and the model\nquality is evaluated by investigating its output distribution. We employ an\nefficient sampler to obtain representative inputs and the output distribution\nof the trained model, which, after selective annotation, can be used to\nestimate the model's precision and recall at different output values and a\ncomprehensive precision-recall curve. 
Our experiments demonstrate that\nOmniInput enables a more fine-grained comparison between models, especially\nwhen their performance is almost the same on pre-defined datasets, leading to\nnew findings and insights for how to train more robust, generalizable models.\n","authors":["Weitang Liu","Ying Wai Li","Tianle Wang","Yi-Zhuang You","Jingbo Shang"],"pdf_url":"https://arxiv.org/pdf/2312.03291v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02850v2","updated":"2023-12-06T04:42:58Z","published":"2023-12-05T16:06:23Z","title":"A Kernel-Based Neural Network Test for High-dimensional Sequencing Data\n Analysis","summary":" The recent development of artificial intelligence (AI) technology, especially\nthe advance of deep neural network (DNN) technology, has revolutionized many\nfields. While DNN plays a central role in modern AI technology, it has been\nrarely used in sequencing data analysis due to challenges brought by\nhigh-dimensional sequencing data (e.g., overfitting). Moreover, due to the\ncomplexity of neural networks and their unknown limiting distributions,\nbuilding association tests on neural networks for genetic association analysis\nremains a great challenge. To address these challenges and fill the important\ngap of using AI in high-dimensional sequencing data analysis, we introduce a\nnew kernel-based neural network (KNN) test for complex association analysis of\nsequencing data. The test is built on our previously developed KNN framework,\nwhich uses random effects to model the overall effects of high-dimensional\ngenetic data and adopts kernel-based neural network structures to model complex\ngenotype-phenotype relationships. Based on KNN, a Wald-type test is then\nintroduced to evaluate the joint association of high-dimensional genetic data\nwith a disease phenotype of interest, considering non-linear and non-additive\neffects (e.g., interaction effects). Through simulations, we demonstrated that\nour proposed method attained higher power compared to the sequence kernel\nassociation test (SKAT), especially in the presence of non-linear and\ninteraction effects. Finally, we apply the methods to the whole genome\nsequencing (WGS) dataset from the Alzheimer's Disease Neuroimaging Initiative\n(ADNI) study, investigating new genes associated with the hippocampal volume\nchange over time.\n","authors":["Tingting Hou","Chang Jiang","Qing Lu"],"pdf_url":"https://arxiv.org/pdf/2312.02850v2.pdf","comment":"31 pages, 5 figures and 3 tables"},{"id":"http://arxiv.org/abs/2312.03288v1","updated":"2023-12-06T04:36:58Z","published":"2023-12-06T04:36:58Z","title":"STEP CATFormer: Spatial-Temporal Effective Body-Part Cross Attention\n Transformer for Skeleton-based Action Recognition","summary":" Graph convolutional networks (GCNs) have been widely used and achieved\nremarkable results in skeleton-based action recognition. We think the key to\nskeleton-based action recognition is a skeleton hanging in frames, so we focus\non how the Graph Convolutional networks learn different topologies\nand effectively aggregate joint features in the global temporal and local\ntemporal. In this work, we propose three Channel-wise Topology Graph\nConvolutions based on Channel-wise Topology Refinement Graph Convolution\n(CTR-GCN). Combining CTR-GCN with two joint cross-attention modules can capture\nthe upper-lower body part and hand-foot relationship skeleton features. 
After\nthat, to capture features of human skeletons changing in frames we design the\nTemporal Attention Transformers to extract skeletons effectively. The Temporal\nAttention Transformers can learn the temporal features of human skeleton\nsequences. Finally, we fuse the temporal features output scale with MLP and\nclassification. We develop a powerful graph convolutional network named Spatial\nTemporal Effective Body-part Cross Attention Transformer which notably\nhigh-performance on the NTU RGB+D, NTU RGB+D 120 datasets. Our code and models\nare available at https://github.com/maclong01/STEP-CATFormer\n","authors":["Nguyen Huu Bao Long"],"pdf_url":"https://arxiv.org/pdf/2312.03288v1.pdf","comment":"Accepted to BMVC 2023: Computer Vision for Games and Games for\n Computer Vision (CVG). 9 pages"},{"id":"http://arxiv.org/abs/2305.14387v3","updated":"2023-12-06T04:34:44Z","published":"2023-05-22T17:55:50Z","title":"AlpacaFarm: A Simulation Framework for Methods that Learn from Human\n Feedback","summary":" Large language models (LLMs) such as ChatGPT have seen widespread adoption\ndue to their ability to follow user instructions well. Developing these LLMs\ninvolves a complex yet poorly understood workflow requiring training with human\nfeedback. Replicating and understanding this instruction-following process\nfaces three major challenges: the high cost of data collection, the lack of\ntrustworthy evaluation, and the absence of reference method implementations. We\naddress these challenges with AlpacaFarm, a simulator that enables research and\ndevelopment for learning from feedback at a low cost. First, we design LLM\nprompts to simulate human feedback that are 45x cheaper than crowdworkers and\ndisplay high agreement with humans. Second, we propose an automatic evaluation\nand validate it against human instructions obtained on real-world interactions.\nThird, we contribute reference implementations for several methods (PPO, DPO,\nbest-of-n, expert iteration, and more) that learn from pairwise feedback.\nFinally, as an end-to-end validation of AlpacaFarm, we train and evaluate\neleven models on 10k pairs of real human feedback and show that rankings of\nmodels trained in AlpacaFarm match rankings of models trained on human data. As\na demonstration of the research possible in AlpacaFarm, we find that methods\nthat use a reward model can substantially improve over supervised fine-tuning\nand that our reference PPO implementation leads to a +10% improvement in\nwin-rate against Davinci003. We release all components of AlpacaFarm at\nhttps://github.com/tatsu-lab/alpaca_farm.\n","authors":["Yann Dubois","Xuechen Li","Rohan Taori","Tianyi Zhang","Ishaan Gulrajani","Jimmy Ba","Carlos Guestrin","Percy Liang","Tatsunori B. Hashimoto"],"pdf_url":"https://arxiv.org/pdf/2305.14387v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03277v1","updated":"2023-12-06T04:05:17Z","published":"2023-12-06T04:05:17Z","title":"Anomaly Detection for Scalable Task Grouping in Reinforcement\n Learning-based RAN Optimization","summary":" The use of learning-based methods for optimizing cellular radio access\nnetworks (RAN) has received increasing attention in recent years. This\ncoincides with a rapid increase in the number of cell sites worldwide, driven\nlargely by dramatic growth in cellular network traffic. Training and\nmaintaining learned models that work well across a large number of cell sites\nhas thus become a pertinent problem. 
This paper proposes a scalable framework\nfor constructing a reinforcement learning policy bank that can perform RAN\noptimization across a large number of cell sites with varying traffic patterns.\nCentral to our framework is a novel application of anomaly detection techniques\nto assess the compatibility between sites (tasks) and the policy bank. This\nallows our framework to intelligently identify when a policy can be reused for\na task, and when a new policy needs to be trained and added to the policy bank.\nOur results show that our approach to compatibility assessment leads to an\nefficient use of computational resources, by allowing us to construct a\nperformant policy bank without exhaustively training on all tasks, which makes\nit applicable under real-world constraints.\n","authors":["Jimmy Li","Igor Kozlov","Di Wu","Xue Liu","Gregory Dudek"],"pdf_url":"https://arxiv.org/pdf/2312.03277v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04256v2","updated":"2023-12-06T03:42:19Z","published":"2023-11-07T14:03:28Z","title":"Foundational propositions of hesitant fuzzy sets and parameter\n reductions of hesitant fuzzy information systems","summary":" Hesitant fuzzy sets are widely used in the instances of uncertainty and\nhesitation. The inclusion relationship is an important and foundational\ndefinition for sets. Hesitant fuzzy set, as a kind of set, needs explicit\ndefinition of inclusion relationship. Base on the hesitant fuzzy membership\ndegree of discrete form, several kinds of inclusion relationships for hesitant\nfuzzy sets are proposed. And then some foundational propositions of hesitant\nfuzzy sets and the families of hesitant fuzzy sets are presented. Finally, some\nfoundational propositions of hesitant fuzzy information systems with respect to\nparameter reductions are put forward, and an example and an algorithm are given\nto illustrate the processes of parameter reductions.\n","authors":["Shizhan Lu"],"pdf_url":"https://arxiv.org/pdf/2311.04256v2.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2309.10954v2","updated":"2023-12-06T03:34:00Z","published":"2023-09-19T22:41:44Z","title":"In-Context Learning for Text Classification with Many Labels","summary":" In-context learning (ICL) using large language models for tasks with many\nlabels is challenging due to the limited context window, which makes it\ndifficult to fit a sufficient number of examples in the prompt. In this paper,\nwe use a pre-trained dense retrieval model to bypass this limitation, giving\nthe model only a partial view of the full label space for each inference call.\nTesting with recent open-source LLMs (OPT, LLaMA), we set new state of the art\nperformance in few-shot settings for three common intent classification\ndatasets, with no finetuning. We also surpass fine-tuned performance on\nfine-grained sentiment classification in certain cases. We analyze the\nperformance across number of in-context examples and different model scales,\nshowing that larger models are necessary to effectively and consistently make\nuse of larger context lengths for ICL. By running several ablations, we analyze\nthe model's use of: a) the similarity of the in-context examples to the current\ninput, b) the semantic content of the class names, and c) the correct\ncorrespondence between examples and labels. 
We demonstrate that all three are\nneeded to varying degrees depending on the domain, contrary to certain recent\nworks.\n","authors":["Aristides Milios","Siva Reddy","Dzmitry Bahdanau"],"pdf_url":"https://arxiv.org/pdf/2309.10954v2.pdf","comment":"12 pages, 4 figures"},{"id":"http://arxiv.org/abs/2312.03262v1","updated":"2023-12-06T03:18:49Z","published":"2023-12-06T03:18:49Z","title":"Low-Cost High-Power Membership Inference by Boosting Relativity","summary":" We present a robust membership inference attack (RMIA) that amplifies the\ndistinction between population data and the training data on any target model,\nby effectively leveraging both reference models and reference data in our\nlikelihood ratio test. Our algorithm exhibits superior test power\n(true-positive rate) when compared to prior methods, even at extremely low\nfalse-positive error rates (as low as 0). Also, under computation constraints,\nwhere only a limited number of reference models (as few as 1) are available,\nour method performs exceptionally well, unlike some prior attacks that approach\nrandom guessing in such scenarios. Our method lays the groundwork for\ncost-effective and practical yet powerful and robust privacy risk analysis of\nmachine learning algorithms.\n","authors":["Sajjad Zarifzadeh","Philippe Liu","Reza Shokri"],"pdf_url":"https://arxiv.org/pdf/2312.03262v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03259v1","updated":"2023-12-06T03:14:16Z","published":"2023-12-06T03:14:16Z","title":"f-FERM: A Scalable Framework for Robust Fair Empirical Risk Minimization","summary":" Training and deploying machine learning models that meet fairness criteria\nfor protected groups are fundamental in modern artificial intelligence. While\nnumerous constraints and regularization terms have been proposed in the\nliterature to promote fairness in machine learning tasks, most of these methods\nare not amenable to stochastic optimization due to the complex and nonlinear\nstructure of constraints and regularizers. Here, the term \"stochastic\" refers\nto the ability of the algorithm to work with small mini-batches of data.\nMotivated by the limitation of existing literature, this paper presents a\nunified stochastic optimization framework for fair empirical risk minimization\nbased on f-divergence measures (f-FERM). The proposed stochastic algorithm\nenjoys theoretical convergence guarantees. In addition, our experiments\ndemonstrate the superiority of fairness-accuracy tradeoffs offered by f-FERM\nfor almost all batch sizes (ranging from full-batch to batch size of one).\nMoreover, we show that our framework can be extended to the case where there is\na distribution shift from training to the test data. Our extension is based on\na distributionally robust optimization reformulation of f-FERM objective under\n$L_p$ norms as uncertainty sets. Again, in this distributionally robust\nsetting, f-FERM not only enjoys theoretical convergence guarantees but also\noutperforms other baselines in the literature in the tasks involving\ndistribution shifts. 
An efficient stochastic implementation of $f$-FERM is\npublicly available.\n","authors":["Sina Baharlouei","Shivam Patel","Meisam Razaviyayn"],"pdf_url":"https://arxiv.org/pdf/2312.03259v1.pdf","comment":"23 Pages,5 figures"},{"id":"http://arxiv.org/abs/2312.03256v1","updated":"2023-12-06T03:09:19Z","published":"2023-12-06T03:09:19Z","title":"CAFE: Towards Compact, Adaptive, and Fast Embedding for Large-scale\n Recommendation Models","summary":" Recently, the growing memory demands of embedding tables in Deep Learning\nRecommendation Models (DLRMs) pose great challenges for model training and\ndeployment. Existing embedding compression solutions cannot simultaneously meet\nthree key design requirements: memory efficiency, low latency, and adaptability\nto dynamic data distribution. This paper presents CAFE, a Compact, Adaptive,\nand Fast Embedding compression framework that addresses the above requirements.\nThe design philosophy of CAFE is to dynamically allocate more memory resources\nto important features (called hot features), and allocate less memory to\nunimportant ones. In CAFE, we propose a fast and lightweight sketch data\nstructure, named HotSketch, to capture feature importance and report hot\nfeatures in real time. For each reported hot feature, we assign it a unique\nembedding. For the non-hot features, we allow multiple features to share one\nembedding by using hash embedding technique. Guided by our design philosophy,\nwe further propose a multi-level hash embedding framework to optimize the\nembedding tables of non-hot features. We theoretically analyze the accuracy of\nHotSketch, and analyze the model convergence against deviation. Extensive\nexperiments show that CAFE significantly outperforms existing embedding\ncompression methods, yielding 3.92% and 3.68% superior testing AUC on Criteo\nKaggle dataset and CriteoTB dataset at a compression ratio of 10000x. The\nsource codes of CAFE are available at GitHub.\n","authors":["Hailin Zhang","Zirui Liu","Boxuan Chen","Yikai Zhao","Tong Zhao","Tong Yang","Bin Cui"],"pdf_url":"https://arxiv.org/pdf/2312.03256v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02429v2","updated":"2023-12-06T03:08:31Z","published":"2023-12-05T02:08:48Z","title":"PEFA: Parameter-Free Adapters for Large-scale Embedding-based Retrieval\n Models","summary":" Embedding-based Retrieval Models (ERMs) have emerged as a promising framework\nfor large-scale text retrieval problems due to powerful large language models.\nNevertheless, fine-tuning ERMs to reach state-of-the-art results can be\nexpensive due to the extreme scale of data as well as the complexity of\nmulti-stages pipelines (e.g., pre-training, fine-tuning, distillation). In this\nwork, we propose the PEFA framework, namely ParamEter-Free Adapters, for fast\ntuning of ERMs without any backward pass in the optimization. At index building\nstage, PEFA equips the ERM with a non-parametric k-nearest neighbor (kNN)\ncomponent. At inference stage, PEFA performs a convex combination of two\nscoring functions, one from the ERM and the other from the kNN. Based on the\nneighborhood definition, PEFA framework induces two realizations, namely\nPEFA-XL (i.e., extra large) using double ANN indices and PEFA-XS (i.e., extra\nsmall) using a single ANN index. Empirically, PEFA achieves significant\nimprovement on two retrieval applications. 
For document retrieval, regarding\nRecall@100 metric, PEFA improves not only pre-trained ERMs on Trivia-QA by an\naverage of 13.2%, but also fine-tuned ERMs on NQ-320K by an average of 5.5%,\nrespectively. For product search, PEFA improves the Recall@100 of the\nfine-tuned ERMs by an average of 5.3% and 14.5%, for PEFA-XS and PEFA-XL,\nrespectively. Our code is available at\nhttps://github.com/amzn/pecos/tree/mainline/examples/pefa-wsdm24.\n","authors":["Wei-Cheng Chang","Jyun-Yu Jiang","Jiong Zhang","Mutasem Al-Darabsah","Choon Hui Teo","Cho-Jui Hsieh","Hsiang-Fu Yu","S. V. N. Vishwanathan"],"pdf_url":"https://arxiv.org/pdf/2312.02429v2.pdf","comment":"Accept by WSDM 2024"},{"id":"http://arxiv.org/abs/2312.03253v1","updated":"2023-12-06T02:58:49Z","published":"2023-12-06T02:58:49Z","title":"Seller-side Outcome Fairness in Online Marketplaces","summary":" This paper aims to investigate and achieve seller-side fairness within online\nmarketplaces, where many sellers and their items are not sufficiently exposed\nto customers in an e-commerce platform. This phenomenon raises concerns\nregarding the potential loss of revenue associated with less exposed items as\nwell as less marketplace diversity. We introduce the notion of seller-side\noutcome fairness and build an optimization model to balance collected\nrecommendation rewards and the fairness metric. We then propose a\ngradient-based data-driven algorithm based on the duality and bandit theory.\nOur numerical experiments on real e-commerce data sets show that our algorithm\ncan lift seller fairness measures while not hurting metrics like collected\nGross Merchandise Value (GMV) and total purchases.\n","authors":["Zikun Ye","Reza Yousefi Maragheh","Lalitesh Morishetti","Shanu Vashishtha","Jason Cho","Kaushiki Nag","Sushant Kumar","Kannan Achan"],"pdf_url":"https://arxiv.org/pdf/2312.03253v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01454v2","updated":"2023-12-06T02:53:11Z","published":"2023-12-03T16:58:10Z","title":"D-Bot: Database Diagnosis System using Large Language Models","summary":" Database administrators (DBAs) play an important role in managing,\nmaintaining and optimizing database systems. However, it is hard and tedious\nfor DBAs to manage a large number of databases and give timely response\n(waiting for hours is intolerable in many online cases). In addition, existing\nempirical methods only support limited diagnosis scenarios, which are also\nlabor-intensive to update the diagnosis rules for database version updates.\nRecently large language models (LLMs) have shown great potential in various\nfields. Thus, we propose D-Bot, an LLM-based database diagnosis system that can\nautomatically acquire knowledge from diagnosis documents, and generate\nreasonable and well-founded diagnosis report (i.e., identifying the root causes\nand solutions) within acceptable time (e.g., under 10 minutes compared to hours\nby a DBA). 
The techniques in D-Bot include (i) offline knowledge extraction\nfrom documents, (ii) automatic prompt generation (e.g., knowledge matching,\ntool retrieval), (iii) root cause analysis using a tree search algorithm, and\n(iv) a collaborative mechanism for complex anomalies with multiple root causes.\nWe verify D-Bot on real benchmarks (including 539 anomalies of six typical\napplications), and the results show that D-Bot can effectively analyze the root\ncauses of unseen anomalies and significantly outperforms traditional methods\nand vanilla models like GPT-4.\n","authors":["Xuanhe Zhou","Guoliang Li","Zhaoyan Sun","Zhiyuan Liu","Weize Chen","Jianming Wu","Jiesi Liu","Ruohang Feng","Guoyang Zeng"],"pdf_url":"https://arxiv.org/pdf/2312.01454v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2206.10706v2","updated":"2023-12-06T02:49:49Z","published":"2022-06-21T19:55:07Z","title":"TraSE: Towards Tackling Authorial Style from a Cognitive Science\n Perspective","summary":" Stylistic analysis of text is a key task in research areas ranging from\nauthorship attribution to forensic analysis and personality profiling. The\nexisting approaches for stylistic analysis are plagued by issues like topic\ninfluence, lack of discriminability for a large number of authors and the\nrequirement for large amounts of diverse data. In this paper, the sources of\nthese issues are identified along with the necessity for a cognitive\nperspective on authorial style in addressing them. A novel feature\nrepresentation, called Trajectory-based Style Estimation (TraSE), is introduced\nto support this purpose. Authorship attribution experiments with over 27,000\nauthors and 1.4 million samples in a cross-domain scenario resulted in 90%\nattribution accuracy, suggesting that the feature representation is immune to\nsuch negative influences and an excellent candidate for stylistic analysis.\nFinally, a qualitative analysis is performed on TraSE using physical human\ncharacteristics, like age, to validate its claim of capturing cognitive traits.\n","authors":["Ronald Wilson","Avanti Bhandarkar","Damon Woodard"],"pdf_url":"https://arxiv.org/pdf/2206.10706v2.pdf","comment":"Experimental results in the paper are incorrectly reported due to an\n unforeseen glitch in the software prototype. The paper and its findings are\n withdrawn"},{"id":"http://arxiv.org/abs/2312.03248v1","updated":"2023-12-06T02:47:56Z","published":"2023-12-06T02:47:56Z","title":"Customizable Combination of Parameter-Efficient Modules for Multi-Task\n Learning","summary":" Modular and composable transfer learning is an emerging direction in the\nfield of Parameter Efficient Fine-Tuning, as it enables neural networks to\nbetter organize various aspects of knowledge, leading to improved cross-task\ngeneralization. In this paper, we introduce a novel approach, Customized\nPolytropon (C-Poly), that combines task-common skills and task-specific skills,\nwith the skill parameters being highly parameterized using low-rank\ntechniques. Each task is associated with a customizable number of exclusive\nspecialized skills and also benefits from skills shared with peer tasks. A\nskill assignment matrix is jointly learned. To evaluate our approach, we\nconducted extensive experiments on the Super-NaturalInstructions and the\nSuperGLUE benchmarks. 
Our findings demonstrate that C-Poly outperforms\nfully-shared, task-specific, and skill-indistinguishable baselines,\nsignificantly enhancing the sample efficiency in multi-task learning scenarios.\n","authors":["Haowen Wang","Tao Sun","Cong Fan","Jinjie Gu"],"pdf_url":"https://arxiv.org/pdf/2312.03248v1.pdf","comment":"22 pages, 9 figures"},{"id":"http://arxiv.org/abs/2312.03243v1","updated":"2023-12-06T02:31:12Z","published":"2023-12-06T02:31:12Z","title":"Generalizable Neural Physics Solvers by Baldwinian Evolution","summary":" Physics-informed neural networks (PINNs) are at the forefront of scientific\nmachine learning, making possible the creation of machine intelligence that is\ncognizant of physical laws and able to accurately simulate them. In this paper,\nthe potential of discovering PINNs that generalize over an entire family of\nphysics tasks is studied, for the first time, through a biological lens of the\nBaldwin effect. Drawing inspiration from the neurodevelopment of precocial\nspecies that have evolved to learn, predict and react quickly to their\nenvironment, we envision PINNs that are pre-wired with connection strengths\ninducing strong biases towards efficient learning of physics. To this end,\nevolutionary selection pressure (guided by proficiency over a family of tasks)\nis coupled with lifetime learning (to specialize on a smaller subset of those\ntasks) to produce PINNs that demonstrate fast and physics-compliant prediction\ncapabilities across a range of empirically challenging problem instances. The\nBaldwinian approach achieves an order of magnitude improvement in prediction\naccuracy at a fraction of the computation cost compared to state-of-the-art\nresults with PINNs meta-learned by gradient descent. This paper marks a leap\nforward in the meta-learning of PINNs as generalizable physics solvers.\n","authors":["Jian Cheng Wong","Chin Chun Ooi","Abhishek Gupta","Pao-Hsiung Chiu","Joshua Shao Zheng Low","My Ha Dao","Yew-Soon Ong"],"pdf_url":"https://arxiv.org/pdf/2312.03243v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03236v1","updated":"2023-12-06T02:16:44Z","published":"2023-12-06T02:16:44Z","title":"Multicoated and Folded Graph Neural Networks with Strong Lottery Tickets","summary":" The Strong Lottery Ticket Hypothesis (SLTH) demonstrates the existence of\nhigh-performing subnetworks within a randomly initialized model, discoverable\nthrough pruning a convolutional neural network (CNN) without any weight\ntraining. A recent study, called Untrained GNNs Tickets (UGT), expanded SLTH\nfrom CNNs to shallow graph neural networks (GNNs). However, discrepancies\npersist when comparing baseline models with learned dense weights.\nAdditionally, there remains an unexplored area in applying SLTH to deeper GNNs,\nwhich, despite delivering improved accuracy with additional layers, suffer from\nexcessive memory requirements. To address these challenges, this work utilizes\nMulticoated Supermasks (M-Sup), a scalar pruning mask method, and implements it\nin GNNs by proposing a strategy for setting its pruning thresholds adaptively.\nIn the context of deep GNNs, this research uncovers the existence of untrained\nrecurrent networks, which exhibit performance on par with their trained\nfeed-forward counterparts. This paper also introduces the Multi-Stage Folding\nand Unshared Masks methods to expand the search space in terms of both\narchitecture and parameters. 
Through the evaluation of various datasets,\nincluding the Open Graph Benchmark (OGB), this work establishes a triple-win\nscenario for SLTH-based GNNs: by achieving high sparsity, competitive\nperformance, and high memory efficiency with up to 98.7\\% reduction, it\ndemonstrates suitability for energy-efficient graph processing.\n","authors":["Jiale Yan","Hiroaki Ito","Ángel López García-Arias","Yasuyuki Okoshi","Hikari Otsuka","Kazushi Kawamura","Thiem Van Chu","Masato Motomura"],"pdf_url":"https://arxiv.org/pdf/2312.03236v1.pdf","comment":"9 pages, accepted in the Second Learning on Graphs Conference (LoG\n 2023)"},{"id":"http://arxiv.org/abs/2312.00710v2","updated":"2023-12-06T02:00:53Z","published":"2023-12-01T16:42:57Z","title":"SpaCE: The Spatial Confounding Environment","summary":" Spatial confounding poses a significant challenge in scientific studies\ninvolving spatial data, where unobserved spatial variables can influence both\ntreatment and outcome, possibly leading to spurious associations. To address\nthis problem, we introduce SpaCE: The Spatial Confounding Environment, the\nfirst toolkit to provide realistic benchmark datasets and tools for\nsystematically evaluating causal inference methods designed to alleviate\nspatial confounding. Each dataset includes training data, true counterfactuals,\na spatial graph with coordinates, and smoothness and confounding scores\ncharacterizing the effect of a missing spatial confounder. It also includes\nrealistic semi-synthetic outcomes and counterfactuals, generated using\nstate-of-the-art machine learning ensembles, following best practices for\ncausal inference benchmarks. The datasets cover real treatment and covariates\nfrom diverse domains, including climate, health and social sciences. SpaCE\nfacilitates an automated end-to-end pipeline, simplifying data loading,\nexperimental setup, and evaluating machine learning and causal inference\nmodels. The SpaCE project provides several dozens of datasets of diverse sizes\nand spatial complexity. It is publicly available as a Python package,\nencouraging community feedback and contributions.\n","authors":["Mauricio Tec","Ana Trisovic","Michelle Audirac","Sophie Woodward","Jie Kate Hu","Naeem Khoshnevis","Francesca Dominici"],"pdf_url":"https://arxiv.org/pdf/2312.00710v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.15136v2","updated":"2023-12-06T02:00:24Z","published":"2023-05-24T13:31:11Z","title":"ReSync: Riemannian Subgradient-based Robust Rotation Synchronization","summary":" This work presents ReSync, a Riemannian subgradient-based algorithm for\nsolving the robust rotation synchronization problem, which arises in various\nengineering applications. ReSync solves a least-unsquared minimization\nformulation over the rotation group, which is nonsmooth and nonconvex, and aims\nat recovering the underlying rotations directly. We provide strong theoretical\nguarantees for ReSync under the random corruption setting. Specifically, we\nfirst show that the initialization procedure of ReSync yields a proper initial\npoint that lies in a local region around the ground-truth rotations. We next\nestablish the weak sharpness property of the aforementioned formulation and\nthen utilize this property to derive the local linear convergence of ReSync to\nthe ground-truth rotations. By combining these guarantees, we conclude that\nReSync converges linearly to the ground-truth rotations under appropriate\nconditions. 
Experiment results demonstrate the effectiveness of ReSync.\n","authors":["Huikang Liu","Xiao Li","Anthony Man-Cho So"],"pdf_url":"https://arxiv.org/pdf/2305.15136v2.pdf","comment":"Accepted for publication in NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.03231v1","updated":"2023-12-06T01:59:47Z","published":"2023-12-06T01:59:47Z","title":"Deep Multimodal Fusion for Surgical Feedback Classification","summary":" Quantification of real-time informal feedback delivered by an experienced\nsurgeon to a trainee during surgery is important for skill improvements in\nsurgical training. Such feedback in the live operating room is inherently\nmultimodal, consisting of verbal conversations (e.g., questions and answers) as\nwell as non-verbal elements (e.g., through visual cues like pointing to\nanatomic elements). In this work, we leverage a clinically-validated\nfive-category classification of surgical feedback: \"Anatomic\", \"Technical\",\n\"Procedural\", \"Praise\" and \"Visual Aid\". We then develop a multi-label machine\nlearning model to classify these five categories of surgical feedback from\ninputs of text, audio, and video modalities. The ultimate goal of our work is\nto help automate the annotation of real-time contextual surgical feedback at\nscale. Our automated classification of surgical feedback achieves AUCs ranging\nfrom 71.5 to 77.6 with the fusion improving performance by 3.1%. We also show\nthat high-quality manual transcriptions of feedback audio from experts improve\nAUCs to between 76.5 and 96.2, which demonstrates a clear path toward future\nimprovements. Empirically, we find that the Staged training strategy, with\nfirst pre-training each modality separately and then training them jointly, is\nmore effective than training different modalities altogether. We also present\nintuitive findings on the importance of modalities for different feedback\ncategories. This work offers an important first look at the feasibility of\nautomated classification of real-world live surgical feedback based on text,\naudio, and video modalities.\n","authors":["Rafal Kocielnik","Elyssa Y. Wong","Timothy N. Chu","Lydia Lin","De-An Huang","Jiayun Wang","Anima Anandkumar","Andrew J. Hung"],"pdf_url":"https://arxiv.org/pdf/2312.03231v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.18411v2","updated":"2023-12-06T01:45:02Z","published":"2023-05-28T17:09:32Z","title":"Feature-Learning Networks Are Consistent Across Widths At Realistic\n Scales","summary":" We study the effect of width on the dynamics of feature-learning neural\nnetworks across a variety of architectures and datasets. Early in training,\nwide neural networks trained on online data have not only identical loss curves\nbut also agree in their point-wise test predictions throughout training. For\nsimple tasks such as CIFAR-5m this holds throughout training for networks of\nrealistic widths. We also show that structural properties of the models,\nincluding internal representations, preactivation distributions, edge of\nstability phenomena, and large learning rate effects are consistent across\nlarge widths. This motivates the hypothesis that phenomena seen in realistic\nmodels can be captured by infinite-width, feature-learning limits. For harder\ntasks (such as ImageNet and language modeling), and later training times,\nfinite-width deviations grow systematically. Two distinct effects cause these\ndeviations across widths. 
First, the network output has\ninitialization-dependent variance scaling inversely with width, which can be\nremoved by ensembling networks. We observe, however, that ensembles of narrower\nnetworks perform worse than a single wide network. We call this the bias of\nnarrower width. We conclude with a spectral perspective on the origin of this\nfinite-width bias.\n","authors":["Nikhil Vyas","Alexander Atanasov","Blake Bordelon","Depen Morwani","Sabarish Sainathan","Cengiz Pehlevan"],"pdf_url":"https://arxiv.org/pdf/2305.18411v2.pdf","comment":"24 pages, 19 figures. NeurIPS 2023. Revised based on reviewer\n feedback"},{"id":"http://arxiv.org/abs/2104.12909v6","updated":"2023-12-06T01:27:41Z","published":"2021-04-26T23:18:34Z","title":"Algorithm as Experiment: Machine Learning, Market Design, and Policy\n Eligibility Rules","summary":" Algorithms make a growing portion of policy and business decisions. We\ndevelop a treatment-effect estimator using algorithmic decisions as instruments\nfor a class of stochastic and deterministic algorithms. Our estimator is\nconsistent and asymptotically normal for well-defined causal effects. A special\ncase of our setup is multidimensional regression discontinuity designs with\ncomplex boundaries. We apply our estimator to evaluate the Coronavirus Aid,\nRelief, and Economic Security Act, which allocated many billions of dollars\nworth of relief funding to hospitals via an algorithmic rule. The funding is\nshown to have little effect on COVID-19-related hospital activities. Naive\nestimates exhibit selection bias.\n","authors":["Yusuke Narita","Kohei Yata"],"pdf_url":"https://arxiv.org/pdf/2104.12909v6.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03218v1","updated":"2023-12-06T01:16:10Z","published":"2023-12-06T01:16:10Z","title":"Accelerated Gradient Algorithms with Adaptive Subspace Search for\n Instance-Faster Optimization","summary":" Gradient-based minimax optimal algorithms have greatly promoted the\ndevelopment of continuous optimization and machine learning. One seminal work\ndue to Yurii Nesterov [Nes83a] established $\\tilde{\\mathcal{O}}(\\sqrt{L/\\mu})$\ngradient complexity for minimizing an $L$-smooth $\\mu$-strongly convex\nobjective. However, an ideal algorithm would adapt to the explicit complexity\nof a particular objective function and incur faster rates for simpler problems,\ntriggering our reconsideration of two defeats of existing optimization modeling\nand analysis. (i) The worst-case optimality is neither the instance optimality\nnor such one in reality. (ii) Traditional $L$-smoothness condition may not be\nthe primary abstraction/characterization for modern practical problems.\n In this paper, we open up a new way to design and analyze gradient-based\nalgorithms with direct applications in machine learning, including linear\nregression and beyond. We introduce two factors $(\\alpha, \\tau_{\\alpha})$ to\nrefine the description of the degenerated condition of the optimization\nproblems based on the observation that the singular values of Hessian often\ndrop sharply. We design adaptive algorithms that solve simpler problems without\npre-known knowledge with reduced gradient or analogous oracle accesses. The\nalgorithms also improve the state-of-art complexities for several problems in\nmachine learning, thereby solving the open problem of how to design faster\nalgorithms in light of the known complexity lower bounds. 
Specially, with the\n$\\mathcal{O}(1)$-nuclear norm bounded, we achieve an optimal\n$\\tilde{\\mathcal{O}}(\\mu^{-1/3})$ (v.s. $\\tilde{\\mathcal{O}}(\\mu^{-1/2})$)\ngradient complexity for linear regression. We hope this work could invoke the\nrethinking for understanding the difficulty of modern problems in optimization.\n","authors":["Yuanshi Liu","Hanzhen Zhao","Yang Xu","Pengyun Yue","Cong Fang"],"pdf_url":"https://arxiv.org/pdf/2312.03218v1.pdf","comment":"Optimization for Machine Learning"},{"id":"http://arxiv.org/abs/2312.03216v1","updated":"2023-12-06T01:15:34Z","published":"2023-12-06T01:15:34Z","title":"SDSRA: A Skill-Driven Skill-Recombination Algorithm for Efficient Policy\n Learning","summary":" In this paper, we introduce a novel algorithm - the Skill-Driven Skill\nRecombination Algorithm (SDSRA) - an innovative framework that significantly\nenhances the efficiency of achieving maximum entropy in reinforcement learning\ntasks. We find that SDSRA achieves faster convergence compared to the\ntraditional Soft Actor-Critic (SAC) algorithm and produces improved policies.\nBy integrating skill-based strategies within the robust Actor-Critic framework,\nSDSRA demonstrates remarkable adaptability and performance across a wide array\nof complex and diverse benchmarks.\n","authors":["Eric H. Jiang","Andrew Lizarraga"],"pdf_url":"https://arxiv.org/pdf/2312.03216v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03213v1","updated":"2023-12-06T01:02:13Z","published":"2023-12-06T01:02:13Z","title":"Bootstrap Your Own Variance","summary":" Understanding model uncertainty is important for many applications. We\npropose Bootstrap Your Own Variance (BYOV), combining Bootstrap Your Own Latent\n(BYOL), a negative-free Self-Supervised Learning (SSL) algorithm, with Bayes by\nBackprop (BBB), a Bayesian method for estimating model posteriors. We find that\nthe learned predictive std of BYOV vs. a supervised BBB model is well captured\nby a Gaussian distribution, providing preliminary evidence that the learned\nparameter posterior is useful for label free uncertainty estimation. BYOV\nimproves upon the deterministic BYOL baseline (+2.83% test ECE, +1.03% test\nBrier) and presents better calibration and reliability when tested with various\naugmentations (eg: +2.4% test ECE, +1.2% test Brier for Salt & Pepper noise).\n","authors":["Polina Turishcheva","Jason Ramapuram","Sinead Williamson","Dan Busbridge","Eeshan Dhekane","Russ Webb"],"pdf_url":"https://arxiv.org/pdf/2312.03213v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03212v1","updated":"2023-12-06T01:00:07Z","published":"2023-12-06T01:00:07Z","title":"Constrained Bayesian Optimization Under Partial Observations: Balanced\n Improvements and Provable Convergence","summary":" The partially observable constrained optimization problems (POCOPs) impede\ndata-driven optimization techniques since an infeasible solution of POCOPs can\nprovide little information about the objective as well as the constraints. We\nendeavor to design an efficient and provable method for expensive POCOPs under\nthe framework of constrained Bayesian optimization. Our method consists of two\nkey components. Firstly, we present an improved design of the acquisition\nfunctions that introduces balanced exploration during optimization. We\nrigorously study the convergence properties of this design to demonstrate its\neffectiveness. Secondly, we propose a Gaussian process embedding different\nlikelihoods as the surrogate model for a partially observable constraint. 
This\nmodel leads to a more accurate representation of the feasible regions compared\nto traditional classification-based models. Our proposed method is empirically\nstudied on both synthetic and real-world problems. The results demonstrate the\ncompetitiveness of our method for solving POCOPs.\n","authors":["Shengbo Wang","Ke Li"],"pdf_url":"https://arxiv.org/pdf/2312.03212v1.pdf","comment":"26 pages, 8 figures, under review"},{"id":"http://arxiv.org/abs/2204.08620v4","updated":"2023-12-06T00:49:28Z","published":"2022-04-19T02:54:16Z","title":"Quantifying Spatial Under-reporting Disparities in Resident\n Crowdsourcing","summary":" Modern city governance relies heavily on crowdsourcing to identify problems\nsuch as downed trees and power lines. A major concern is that residents do not\nreport problems at the same rates, with heterogeneous reporting delays directly\ntranslating to downstream disparities in how quickly incidents can be\naddressed. Here we develop a method to identify reporting delays without using\nexternal ground-truth data. Our insight is that the rates at which duplicate\nreports are made about the same incident can be leveraged to disambiguate\nwhether an incident has occurred by investigating its reporting rate once it\nhas occurred. We apply our method to over 100,000 resident reports made in New\nYork City and to over 900,000 reports made in Chicago, finding that there are\nsubstantial spatial and socioeconomic disparities in how quickly incidents are\nreported. We further validate our methods using external data and demonstrate\nhow estimating reporting delays leads to practical insights and interventions\nfor a more equitable, efficient government service.\n","authors":["Zhi Liu","Uma Bhandaram","Nikhil Garg"],"pdf_url":"https://arxiv.org/pdf/2204.08620v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.07320v2","updated":"2023-12-06T00:48:53Z","published":"2023-03-13T17:41:57Z","title":"Model-tuning Via Prompts Makes NLP Models Adversarially Robust","summary":" In recent years, NLP practitioners have converged on the following practice:\n(i) import an off-the-shelf pretrained (masked) language model; (ii) append a\nmultilayer perceptron atop the CLS token's hidden representation (with randomly\ninitialized weights); and (iii) fine-tune the entire model on a downstream task\n(MLP-FT). This procedure has produced massive gains on standard NLP benchmarks,\nbut these models remain brittle, even to mild adversarial perturbations. In\nthis work, we demonstrate surprising gains in adversarial robustness enjoyed by\nModel-tuning Via Prompts (MVP), an alternative method of adapting to downstream\ntasks. Rather than appending an MLP head to make output prediction, MVP appends\na prompt template to the input, and makes prediction via text\ninfilling/completion. Across 5 NLP datasets, 4 adversarial attacks, and 3\ndifferent models, MVP improves performance against adversarial substitutions by\nan average of 8% over standard methods and even outperforms adversarial\ntraining-based state-of-art defenses by 3.5%. By combining MVP with adversarial\ntraining, we achieve further improvements in adversarial robustness while\nmaintaining performance on unperturbed examples. Finally, we conduct ablations\nto investigate the mechanism underlying these gains. 
Notably, we find that the\nmain causes of vulnerability of MLP-FT can be attributed to the misalignment\nbetween pre-training and fine-tuning tasks, and the randomly initialized MLP\nparameters.\n","authors":["Mrigank Raman","Pratyush Maini","J. Zico Kolter","Zachary C. Lipton","Danish Pruthi"],"pdf_url":"https://arxiv.org/pdf/2303.07320v2.pdf","comment":"Accepted to the EMNLP 2023 Conference"},{"id":"http://arxiv.org/abs/2307.04749v2","updated":"2023-12-06T00:45:08Z","published":"2023-07-10T17:54:57Z","title":"Divide, Evaluate, and Refine: Evaluating and Improving Text-to-Image\n Alignment with Iterative VQA Feedback","summary":" The field of text-conditioned image generation has made unparalleled progress\nwith the recent advent of latent diffusion models. While remarkable, as the\ncomplexity of given text input increases, the state-of-the-art diffusion models\nmay still fail in generating images which accurately convey the semantics of\nthe given prompt. Furthermore, it has been observed that such misalignments are\noften left undetected by pretrained multi-modal models such as CLIP. To address\nthese problems, in this paper we explore a simple yet effective decompositional\napproach towards both evaluation and improvement of text-to-image alignment. In\nparticular, we first introduce a Decompositional-Alignment-Score which given a\ncomplex prompt decomposes it into a set of disjoint assertions. The alignment\nof each assertion with generated images is then measured using a VQA model.\nFinally, alignment scores for different assertions are combined aposteriori to\ngive the final text-to-image alignment score. Experimental analysis reveals\nthat the proposed alignment metric shows significantly higher correlation with\nhuman ratings as opposed to traditional CLIP, BLIP scores. Furthermore, we also\nfind that the assertion level alignment scores provide a useful feedback which\ncan then be used in a simple iterative procedure to gradually increase the\nexpression of different assertions in the final image outputs. Human user\nstudies indicate that the proposed approach surpasses previous state-of-the-art\nby 8.7% in overall text-to-image alignment accuracy. Project page for our paper\nis available at https://1jsingh.github.io/divide-evaluate-and-refine\n","authors":["Jaskirat Singh","Liang Zheng"],"pdf_url":"https://arxiv.org/pdf/2307.04749v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03196v1","updated":"2023-12-06T00:28:08Z","published":"2023-12-06T00:28:08Z","title":"Domain Invariant Representation Learning and Sleep Dynamics Modeling for\n Automatic Sleep Staging","summary":" Sleep staging has become a critical task in diagnosing and treating sleep\ndisorders to prevent sleep related diseases. With rapidly growing large scale\npublic sleep databases and advances in machine learning, significant progress\nhas been made toward automatic sleep staging. However, previous studies face\nsome critical problems in sleep studies; the heterogeneity of subjects'\nphysiological signals, the inability to extract meaningful information from\nunlabeled sleep signal data to improve predictive performances, the difficulty\nin modeling correlations between sleep stages, and the lack of an effective\nmechanism to quantify predictive uncertainty. In this study, we propose a\nneural network based automatic sleep staging model, named DREAM, to learn\ndomain generalized representations from physiological signals and models sleep\ndynamics. 
DREAM learns sleep related and subject invariant representations from\ndiverse subjects' sleep signal segments and models sleep dynamics by capturing\ninteractions between sequential signal segments and between sleep stages. In\nthe experiments, we demonstrate that DREAM outperforms the existing sleep\nstaging methods on three datasets. The case study demonstrates that our model\ncan learn the generalized decision function resulting in good prediction\nperformances for the new subjects, especially in case there are differences\nbetween testing and training subjects. The usage of unlabeled data shows the\nbenefit of leveraging unlabeled EEG data. Further, uncertainty quantification\ndemonstrates that DREAM provides prediction uncertainty, making the model\nreliable and helping sleep experts in real world applications.\n","authors":["Seungyeon Lee","Thai-Hoang Pham","Zhao Cheng","Ping Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.03196v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2111.09971v2","updated":"2023-12-06T00:09:00Z","published":"2021-11-18T23:21:00Z","title":"Learning Robust Output Control Barrier Functions from Safe Expert\n Demonstrations","summary":" This paper addresses learning safe output feedback control laws from partial\nobservations of expert demonstrations. We assume that a model of the system\ndynamics and a state estimator are available along with corresponding error\nbounds, e.g., estimated from data in practice. We first propose robust output\ncontrol barrier functions (ROCBFs) as a means to guarantee safety, as defined\nthrough controlled forward invariance of a safe set. We then formulate an\noptimization problem to learn ROCBFs from expert demonstrations that exhibit\nsafe system behavior, e.g., data collected from a human operator or an expert\ncontroller. When the parametrization of the ROCBF is linear, then we show that,\nunder mild assumptions, the optimization problem is convex. Along with the\noptimization problem, we provide verifiable conditions in terms of the density\nof the data, smoothness of the system model and state estimator, and the size\nof the error bounds that guarantee validity of the obtained ROCBF. Towards\nobtaining a practical control algorithm, we propose an algorithmic\nimplementation of our theoretical framework that accounts for assumptions made\nin our framework in practice. We empirically validate our algorithm in the\nautonomous driving simulator CARLA and demonstrate how to learn safe control\nlaws from RGB camera images.\n","authors":["Lars Lindemann","Alexander Robey","Lejun Jiang","Satyajeet Das","Stephen Tu","Nikolai Matni"],"pdf_url":"https://arxiv.org/pdf/2111.09971v2.pdf","comment":"Journal paper submission"},{"id":"http://arxiv.org/abs/2312.03194v1","updated":"2023-12-06T00:05:25Z","published":"2023-12-06T00:05:25Z","title":"Corporate Bankruptcy Prediction with Domain-Adapted BERT","summary":" This study performs BERT-based analysis, which is a representative\ncontextualized language model, on corporate disclosure data to predict\nimpending bankruptcies. Prior literature on bankruptcy prediction mainly\nfocuses on developing more sophisticated prediction methodologies with\nfinancial variables. However, in our study, we focus on improving the quality\nof input dataset. Specifically, we employ BERT model to perform sentiment\nanalysis on MD&A disclosures. 
We show that BERT outperforms dictionary-based\npredictions and Word2Vec-based predictions in terms of adjusted R-square in\nlogistic regression, k-nearest neighbor (kNN-5), and linear kernel support\nvector machine (SVM). Further, instead of pre-training the BERT model from\nscratch, we apply self-learning with confidence-based filtering to corporate\ndisclosure data (10-K). We achieve the accuracy rate of 91.56% and demonstrate\nthat the domain adaptation procedure brings a significant improvement in\nprediction accuracy.\n","authors":["Alex Kim","Sangwon Yoon"],"pdf_url":"https://arxiv.org/pdf/2312.03194v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.04120v2","updated":"2023-12-06T23:53:20Z","published":"2023-05-06T19:10:19Z","title":"A Latent Diffusion Model for Protein Structure Generation","summary":" Proteins are complex biomolecules that perform a variety of crucial functions\nwithin living organisms. Designing and generating novel proteins can pave the\nway for many future synthetic biology applications, including drug discovery.\nHowever, it remains a challenging computational task due to the large modeling\nspace of protein structures. In this study, we propose a latent diffusion model\nthat can reduce the complexity of protein modeling while flexibly capturing the\ndistribution of natural protein structures in a condensed latent space.\nSpecifically, we propose an equivariant protein autoencoder that embeds\nproteins into a latent space and then uses an equivariant diffusion model to\nlearn the distribution of the latent protein representations. Experimental\nresults demonstrate that our method can effectively generate novel protein\nbackbone structures with high designability and efficiency. The code will be\nmade publicly available at\nhttps://github.com/divelab/AIRS/tree/main/OpenProt/LatentDiff\n","authors":["Cong Fu","Keqiang Yan","Limei Wang","Wing Yee Au","Michael McThrow","Tao Komikado","Koji Maruhashi","Kanji Uchino","Xiaoning Qian","Shuiwang Ji"],"pdf_url":"https://arxiv.org/pdf/2305.04120v2.pdf","comment":"Accepted by the Second Learning on Graphs Conference (LoG 2023)"},{"id":"http://arxiv.org/abs/2306.00245v2","updated":"2023-12-06T23:46:36Z","published":"2023-05-31T23:39:18Z","title":"From Pixels to UI Actions: Learning to Follow Instructions via Graphical\n User Interfaces","summary":" Much of the previous work towards digital agents for graphical user\ninterfaces (GUIs) has relied on text-based representations (derived from HTML\nor other structured data sources), which are not always readily available.\nThese input representations have been often coupled with custom, task-specific\naction spaces. This paper focuses on creating agents that interact with the\ndigital world using the same conceptual interface that humans commonly use --\nvia pixel-based screenshots and a generic action space corresponding to\nkeyboard and mouse actions. 
Building upon recent progress in pixel-based\npretraining, we show, for the first time, that it is possible for such agents\nto outperform human crowdworkers on the MiniWob++ benchmark of GUI-based\ninstruction following tasks.\n","authors":["Peter Shaw","Mandar Joshi","James Cohan","Jonathan Berant","Panupong Pasupat","Hexiang Hu","Urvashi Khandelwal","Kenton Lee","Kristina Toutanova"],"pdf_url":"https://arxiv.org/pdf/2306.00245v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03951v1","updated":"2023-12-06T23:29:00Z","published":"2023-12-06T23:29:00Z","title":"Understanding the Role of Optimization in Double Descent","summary":" The phenomenon of model-wise double descent, where the test error peaks and\nthen reduces as the model size increases, is an interesting topic that has\nattracted the attention of researchers due to the striking observed gap between\ntheory and practice \\citep{Belkin2018ReconcilingMM}. Additionally, while double\ndescent has been observed in various tasks and architectures, the peak of\ndouble descent can sometimes be noticeably absent or diminished, even without\nexplicit regularization, such as weight decay and early stopping. In this\npaper, we investigate this intriguing phenomenon from the optimization\nperspective and propose a simple optimization-based explanation for why double\ndescent sometimes occurs weakly or not at all. To the best of our knowledge, we\nare the first to demonstrate that many disparate factors contributing to\nmodel-wise double descent (initialization, normalization, batch size, learning\nrate, optimization algorithm) are unified from the viewpoint of optimization:\nmodel-wise double descent is observed if and only if the optimizer can find a\nsufficiently low-loss minimum. These factors directly affect the condition\nnumber of the optimization problem or the optimizer and thus affect the final\nminimum found by the optimizer, reducing or increasing the height of the double\ndescent peak. We conduct a series of controlled experiments on random feature\nmodels and two-layer neural networks under various optimization settings,\ndemonstrating this optimization-based unified view. Our results suggest the\nfollowing implication: Double descent is unlikely to be a problem for\nreal-world machine learning setups. Additionally, our results help explain the\ngap between weak double descent peaks in practice and strong peaks observable\nin carefully designed setups.\n","authors":["Chris Yuhao Liu","Jeffrey Flanigan"],"pdf_url":"https://arxiv.org/pdf/2312.03951v1.pdf","comment":"NeurIPS Workshop 2023 Optimization for Machine Learning"},{"id":"http://arxiv.org/abs/2312.03950v1","updated":"2023-12-06T23:22:49Z","published":"2023-12-06T23:22:49Z","title":"A Scalable and Generalizable Pathloss Map Prediction","summary":" Large-scale channel prediction, i.e., estimation of the pathloss from\ngeographical/morphological/building maps, is an essential component of wireless\nnetwork planning. Ray tracing (RT)-based methods have been widely used for many\nyears, but they require significant computational effort that may become\nprohibitive with the increased network densification and/or use of higher\nfrequencies in B5G/6G systems. In this paper, we propose a data-driven,\nmodel-free pathloss map prediction (PMP) method, called PMNet. PMNet uses a\nsupervised learning approach: it is trained on a limited amount of RT (or\nchannel measurement) data and map data. 
Once trained, PMNet can predict\npathloss over location with high accuracy (an RMSE level of $10^{-2}$) in a few\nmilliseconds. We further extend PMNet by employing transfer learning (TL). TL\nallows PMNet to learn a new network scenario quickly (x5.6 faster training) and\nefficiently (using x4.5 less data) by transferring knowledge from a pre-trained\nmodel, while retaining accuracy. Our results demonstrate that PMNet is a\nscalable and generalizable ML-based PMP method, showing its potential to be\nused in several network optimization applications.\n","authors":["Ju-Hyung Lee","Andreas F. Molisch"],"pdf_url":"https://arxiv.org/pdf/2312.03950v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.09778v2","updated":"2023-12-06T23:18:44Z","published":"2023-08-18T18:58:54Z","title":"Towards Grounded Visual Spatial Reasoning in Multi-Modal Vision Language\n Models","summary":" With pre-training of vision-and-language models (VLMs) on large-scale\ndatasets of image-text pairs, several recent works showed that these\npre-trained models lack fine-grained understanding, such as the ability to\ncount and recognize verbs, attributes, or relationships. The focus of this work\nis to study the ability of these models to understand spatial relations.\nPreviously, this has been tackled using image-text matching (e.g., Visual\nSpatial Reasoning benchmark) or visual question answering (e.g., GQA or VQAv2),\nboth showing poor performance and a large gap compared to human performance. In\nthis work, we use explainability tools to understand the causes of poor\nperformance better and present an alternative fine-grained, compositional\napproach for ranking spatial clauses. We combine the evidence from grounding\nnoun phrases corresponding to objects and their locations to compute the final\nrank of the spatial clause. We demonstrate the approach on representative VLMs\n(such as LXMERT, GPV, and MDETR) and compare and highlight their abilities to\nreason about spatial relationships.\n","authors":["Navid Rajabi","Jana Kosecka"],"pdf_url":"https://arxiv.org/pdf/2308.09778v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03940v1","updated":"2023-12-06T22:43:50Z","published":"2023-12-06T22:43:50Z","title":"PECANN: Parallel Efficient Clustering with Graph-Based Approximate\n Nearest Neighbor Search","summary":" This paper studies density-based clustering of point sets. These methods use\ndense regions of points to detect clusters of arbitrary shapes. In particular,\nwe study variants of density peaks clustering, a popular type of algorithm that\nhas been shown to work well in practice. Our goal is to cluster large\nhigh-dimensional datasets, which are prevalent in practice. Prior solutions are\neither sequential, and cannot scale to large data, or are specialized for\nlow-dimensional data.\n This paper unifies the different variants of density peaks clustering into a\nsingle framework, PECANN, by abstracting out several key steps common to this\nclass of algorithms. One such key step is to find nearest neighbors that\nsatisfy a predicate function, and one of the main contributions of this paper\nis an efficient way to do this predicate search using graph-based approximate\nnearest neighbor search (ANNS). To provide ample parallelism, we propose a\ndoubling search technique that enables points to find an approximate nearest\nneighbor satisfying the predicate in a small number of rounds. 
Our technique\ncan be applied to many existing graph-based ANNS algorithms, which can all be\nplugged into PECANN.\n We implement five clustering algorithms with PECANN and evaluate them on\nsynthetic and real-world datasets with up to 1.28 million points and up to 1024\ndimensions on a 30-core machine with two-way hyper-threading. Compared to the\nstate-of-the-art FASTDP algorithm for high-dimensional density peaks\nclustering, which is sequential, our best algorithm is 45x-734x faster while\nachieving competitive ARI scores. Compared to the state-of-the-art parallel\nDPC-based algorithm, which is optimized for low dimensions, we show that PECANN\nis two orders of magnitude faster. As far as we know, our work is the first to\nevaluate DPC variants on large high-dimensional real-world image and text\nembedding datasets.\n","authors":["Shangdi Yu","Joshua Engels","Yihao Huang","Julian Shun"],"pdf_url":"https://arxiv.org/pdf/2312.03940v1.pdf","comment":null}],"Multimedia":[{"id":"http://arxiv.org/abs/2312.03700v1","updated":"2023-12-06T18:59:19Z","published":"2023-12-06T18:59:19Z","title":"OneLLM: One Framework to Align All Modalities with Language","summary":" Multimodal large language models (MLLMs) have gained significant attention\ndue to their strong multimodal understanding capability. However, existing\nworks rely heavily on modality-specific encoders, which usually differ in\narchitecture and are limited to common modalities. In this paper, we present\nOneLLM, an MLLM that aligns eight modalities to language using a unified\nframework. We achieve this through a unified multimodal encoder and a\nprogressive multimodal alignment pipeline. In detail, we first train an image\nprojection module to connect a vision encoder with LLM. Then, we build a\nuniversal projection module (UPM) by mixing multiple image projection modules\nand dynamic routing. Finally, we progressively align more modalities to LLM\nwith the UPM. To fully leverage the potential of OneLLM in following\ninstructions, we also curated a comprehensive multimodal instruction dataset,\nincluding 2M items from image, audio, video, point cloud, depth/normal map, IMU\nand fMRI brain activity. OneLLM is evaluated on 25 diverse benchmarks,\nencompassing tasks such as multimodal captioning, question answering and\nreasoning, where it delivers excellent performance. Code, data, model and\nonline demo are available at https://github.com/csuhan/OneLLM\n","authors":["Jiaming Han","Kaixiong Gong","Yiyuan Zhang","Jiaqi Wang","Kaipeng Zhang","Dahua Lin","Yu Qiao","Peng Gao","Xiangyu Yue"],"pdf_url":"https://arxiv.org/pdf/2312.03700v1.pdf","comment":"Code: https://github.com/csuhan/OneLLM"},{"id":"http://arxiv.org/abs/2305.14269v2","updated":"2023-12-06T18:21:17Z","published":"2023-05-23T17:20:47Z","title":"Source-Free Domain Adaptation for RGB-D Semantic Segmentation with\n Vision Transformers","summary":" With the increasing availability of depth sensors, multimodal frameworks that\ncombine color information with depth data are gaining interest. However, ground\ntruth data for semantic segmentation is burdensome to provide, thus making\ndomain adaptation a significant research area. Yet most domain adaptation\nmethods are not able to effectively handle multimodal data. Specifically, we\naddress the challenging source-free domain adaptation setting where the\nadaptation is performed without reusing source data. 
We propose MISFIT:\nMultImodal Source-Free Information fusion Transformer, a depth-aware framework\nwhich injects depth data into a segmentation module based on vision\ntransformers at multiple stages, namely at the input, feature and output\nlevels. Color and depth style transfer helps early-stage domain alignment while\nre-wiring self-attention between modalities creates mixed features, allowing\nthe extraction of better semantic content. Furthermore, a depth-based entropy\nminimization strategy is also proposed to adaptively weight regions at\ndifferent distances. Our framework, which is also the first approach using\nRGB-D vision transformers for source-free semantic segmentation, shows\nnoticeable performance improvements with respect to standard strategies.\n","authors":["Giulia Rizzoli","Donald Shenaj","Pietro Zanuttigh"],"pdf_url":"https://arxiv.org/pdf/2305.14269v2.pdf","comment":"WACV 2024, 2nd Workshop on Pretraining (WACVW)"},{"id":"http://arxiv.org/abs/2312.03641v1","updated":"2023-12-06T17:49:57Z","published":"2023-12-06T17:49:57Z","title":"MotionCtrl: A Unified and Flexible Motion Controller for Video\n Generation","summary":" Motions in a video primarily consist of camera motion, induced by camera\nmovement, and object motion, resulting from object movement. Accurate control\nof both camera and object motion is essential for video generation. However,\nexisting works either mainly focus on one type of motion or do not clearly\ndistinguish between the two, limiting their control capabilities and diversity.\nTherefore, this paper presents MotionCtrl, a unified and flexible motion\ncontroller for video generation designed to effectively and independently\ncontrol camera and object motion. The architecture and training strategy of\nMotionCtrl are carefully devised, taking into account the inherent properties\nof camera motion, object motion, and imperfect training data. Compared to\nprevious methods, MotionCtrl offers three main advantages: 1) It effectively\nand independently controls camera motion and object motion, enabling more\nfine-grained motion control and facilitating flexible and diverse combinations\nof both types of motion. 2) Its motion conditions are determined by camera\nposes and trajectories, which are appearance-free and minimally impact the\nappearance or shape of objects in generated videos. 3) It is a relatively\ngeneralizable model that can adapt to a wide array of camera poses and\ntrajectories once trained. Extensive qualitative and quantitative experiments\nhave been conducted to demonstrate the superiority of MotionCtrl over existing\nmethods.\n","authors":["Zhouxia Wang","Ziyang Yuan","Xintao Wang","Tianshui Chen","Menghan Xia","Ping Luo","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2312.03641v1.pdf","comment":"Project Page: https://wzhouxiff.github.io/projects/MotionCtrl/"},{"id":"http://arxiv.org/abs/2312.03511v1","updated":"2023-12-06T14:13:38Z","published":"2023-12-06T14:13:38Z","title":"Kandinsky 3.0 Technical Report","summary":" We present Kandinsky 3.0, a large-scale text-to-image generation model based\non latent diffusion, continuing the series of text-to-image Kandinsky models\nand reflecting our progress to achieve higher quality and realism of image\ngeneration. Compared to previous versions of Kandinsky 2.x, Kandinsky 3.0\nleverages a two times larger U-Net backbone, a ten times larger text encoder\nand removes diffusion mapping. 
We describe the architecture of the model, the\ndata collection procedure, the training technique, and the production system of\nuser interaction. We focus on the key components that, as we have identified as\na result of a large number of experiments, had the most significant impact on\nimproving the quality of our model compared to the others. By our side-by-side\ncomparisons, Kandinsky becomes better in text understanding and works better on\nspecific domains. Project page: https://ai-forever.github.io/Kandinsky-3\n","authors":["Vladimir Arkhipkin","Andrei Filatov","Viacheslav Vasilev","Anastasia Maltseva","Said Azizov","Igor Pavlov","Julia Agafonova","Andrey Kuznetsov","Denis Dimitrov"],"pdf_url":"https://arxiv.org/pdf/2312.03511v1.pdf","comment":"Project page: https://ai-forever.github.io/Kandinsky-3"},{"id":"http://arxiv.org/abs/2312.03410v1","updated":"2023-12-06T10:48:36Z","published":"2023-12-06T10:48:36Z","title":"Detecting Voice Cloning Attacks via Timbre Watermarking","summary":" Nowadays, it is common to release audio content to the public. However, with\nthe rise of voice cloning technology, attackers have the potential to easily\nimpersonate a specific person by utilizing his publicly released audio without\nany permission. Therefore, it becomes significant to detect any potential\nmisuse of the released audio content and protect its timbre from being\nimpersonated. To this end, we introduce a novel concept, \"Timbre Watermarking\",\nwhich embeds watermark information into the target individual's speech,\neventually defeating the voice cloning attacks. To ensure the watermark is\nrobust to the voice cloning model's learning process, we design an end-to-end\nvoice cloning-resistant detection framework. The core idea of our solution is\nto embed and extract the watermark in the frequency domain in a temporally\ninvariant manner. To acquire generalization across different voice cloning\nattacks, we modulate their shared process and integrate it into our framework\nas a distortion layer. Experiments demonstrate that the proposed timbre\nwatermarking can defend against different voice cloning attacks, exhibit strong\nresistance against various adaptive attacks (e.g., reconstruction-based removal\nattacks, watermark overwriting attacks), and achieve practicality in real-world\nservices such as PaddleSpeech, Voice-Cloning-App, and so-vits-svc. In addition,\nablation studies are also conducted to verify the effectiveness of our design.\nSome audio samples are available at\nhttps://timbrewatermarking.github.io/samples.\n","authors":["Chang Liu","Jie Zhang","Tianwei Zhang","Xi Yang","Weiming Zhang","Nenghai Yu"],"pdf_url":"https://arxiv.org/pdf/2312.03410v1.pdf","comment":"NDSS 2024"},{"id":"http://arxiv.org/abs/2307.04749v2","updated":"2023-12-06T00:45:08Z","published":"2023-07-10T17:54:57Z","title":"Divide, Evaluate, and Refine: Evaluating and Improving Text-to-Image\n Alignment with Iterative VQA Feedback","summary":" The field of text-conditioned image generation has made unparalleled progress\nwith the recent advent of latent diffusion models. While remarkable, as the\ncomplexity of given text input increases, the state-of-the-art diffusion models\nmay still fail in generating images which accurately convey the semantics of\nthe given prompt. Furthermore, it has been observed that such misalignments are\noften left undetected by pretrained multi-modal models such as CLIP. 
To address\nthese problems, in this paper we explore a simple yet effective decompositional\napproach towards both evaluation and improvement of text-to-image alignment. In\nparticular, we first introduce a Decompositional-Alignment-Score which given a\ncomplex prompt decomposes it into a set of disjoint assertions. The alignment\nof each assertion with generated images is then measured using a VQA model.\nFinally, alignment scores for different assertions are combined a posteriori to\ngive the final text-to-image alignment score. Experimental analysis reveals\nthat the proposed alignment metric shows significantly higher correlation with\nhuman ratings as opposed to traditional CLIP, BLIP scores. Furthermore, we also\nfind that the assertion level alignment scores provide a useful feedback which\ncan then be used in a simple iterative procedure to gradually increase the\nexpression of different assertions in the final image outputs. Human user\nstudies indicate that the proposed approach surpasses previous state-of-the-art\nby 8.7% in overall text-to-image alignment accuracy. Project page for our paper\nis available at https://1jsingh.github.io/divide-evaluate-and-refine\n","authors":["Jaskirat Singh","Liang Zheng"],"pdf_url":"https://arxiv.org/pdf/2307.04749v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.13840v2","updated":"2023-12-06T14:03:00Z","published":"2023-05-23T09:03:19Z","title":"Control-A-Video: Controllable Text-to-Video Generation with Diffusion\n Models","summary":" Recent advancements in diffusion models have unlocked unprecedented abilities\nin visual creation. However, current text-to-video generation models struggle\nwith the trade-off among movement range, action coherence and object\nconsistency. To mitigate this issue, we present a controllable text-to-video\n(T2V) diffusion model, called Control-A-Video, capable of maintaining\nconsistency while customizable video synthesis. Based on a pre-trained\nconditional text-to-image (T2I) diffusion model, our model aims to generate\nvideos conditioned on a sequence of control signals, such as edge or depth\nmaps. For the purpose of improving object consistency, Control-A-Video\nintegrates motion priors and content priors into video generation. We propose\ntwo motion-adaptive noise initialization strategies, which are based on pixel\nresidual and optical flow, to introduce motion priors from input videos,\nproducing more coherent videos. Moreover, a first-frame conditioned controller\nis proposed to generate videos from content priors of the first frame, which\nfacilitates the semantic alignment with text and allows longer video generation\nin an auto-regressive manner. With the proposed architecture and strategies,\nour model achieves resource-efficient convergence and generates consistent and\ncoherent videos with fine-grained control. 
Extensive experiments demonstrate\nits success in various video generative tasks such as video editing and video\nstyle transfer, outperforming previous methods in terms of consistency and\nquality.\n","authors":["Weifeng Chen","Yatai Ji","Jie Wu","Hefeng Wu","Pan Xie","Jiashi Li","Xin Xia","Xuefeng Xiao","Liang Lin"],"pdf_url":"https://arxiv.org/pdf/2305.13840v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.02238v2","updated":"2023-12-06T05:52:16Z","published":"2023-12-04T09:19:38Z","title":"X-Adapter: Adding Universal Compatibility of Plugins for Upgraded\n Diffusion Model","summary":" We introduce X-Adapter, a universal upgrader to enable the pretrained\nplug-and-play modules (e.g., ControlNet, LoRA) to work directly with the\nupgraded text-to-image diffusion model (e.g., SDXL) without further retraining.\nWe achieve this goal by training an additional network to control the frozen\nupgraded model with the new text-image data pairs. In detail, X-Adapter keeps a\nfrozen copy of the old model to preserve the connectors of different plugins.\nAdditionally, X-Adapter adds trainable mapping layers that bridge the decoders\nfrom models of different versions for feature remapping. The remapped features\nwill be used as guidance for the upgraded model. To enhance the guidance\nability of X-Adapter, we employ a null-text training strategy for the upgraded\nmodel. After training, we also introduce a two-stage denoising strategy to\nalign the initial latents of X-Adapter and the upgraded model. Thanks to our\nstrategies, X-Adapter demonstrates universal compatibility with various plugins\nand also enables plugins of different versions to work together, thereby\nexpanding the functionalities of diffusion community. To verify the\neffectiveness of the proposed method, we conduct extensive experiments and the\nresults show that X-Adapter may facilitate wider application in the upgraded\nfoundational diffusion model.\n","authors":["Lingmin Ran","Xiaodong Cun","Jia-Wei Liu","Rui Zhao","Song Zijie","Xintao Wang","Jussi Keppo","Mike Zheng Shou"],"pdf_url":"https://arxiv.org/pdf/2312.02238v2.pdf","comment":"Project page: https://showlab.github.io/X-Adapter/"}]},"2023-12-07T00:00:00Z":{"Computation and Language":[{"id":"http://arxiv.org/abs/2312.04556v1","updated":"2023-12-07T18:59:29Z","published":"2023-12-07T18:59:29Z","title":"Large Language Models for Mathematicians","summary":" Large language models (LLMs) such as ChatGPT have received immense interest\nfor their general-purpose language understanding and, in particular, their\nability to generate high-quality text or computer code. For many professions,\nLLMs represent an invaluable tool that can speed up and improve the quality of\nwork. In this note, we discuss to what extent they can aid professional\nmathematicians. We first provide a mathematical description of the transformer\nmodel used in all modern language models. Based on recent studies, we then\noutline best practices and potential issues and report on the mathematical\nabilities of language models. 
Finally, we shed light on the potential of LLMs\nto change how mathematicians work.\n","authors":["Simon Frieder","Julius Berner","Philipp Petersen","Thomas Lukasiewicz"],"pdf_url":"https://arxiv.org/pdf/2312.04556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04554v1","updated":"2023-12-07T18:59:22Z","published":"2023-12-07T18:59:22Z","title":"Improved Visual Grounding through Self-Consistent Explanations","summary":" Vision-and-language models trained to match images with text can be combined\nwith visual explanation methods to point to the locations of specific objects\nin an image. Our work shows that the localization --\"grounding\"-- abilities of\nthese models can be further improved by finetuning for self-consistent visual\nexplanations. We propose a strategy for augmenting existing text-image datasets\nwith paraphrases using a large language model, and SelfEQ, a weakly-supervised\nstrategy on visual explanation maps for paraphrases that encourages\nself-consistency. Specifically, for an input textual phrase, we attempt to\ngenerate a paraphrase and finetune the model so that the phrase and paraphrase\nmap to the same region in the image. We posit that this both expands the\nvocabulary that the model is able to handle, and improves the quality of the\nobject locations highlighted by gradient-based visual explanation methods (e.g.\nGradCAM). We demonstrate that SelfEQ improves performance on Flickr30k,\nReferIt, and RefCOCO+ over a strong baseline method and several prior works.\nParticularly, comparing to other methods that do not use any type of box\nannotations, we obtain 84.07% on Flickr30k (an absolute improvement of 4.69%),\n67.40% on ReferIt (an absolute improvement of 7.68%), and 75.10%, 55.49% on\nRefCOCO+ test sets A and B respectively (an absolute improvement of 3.74% on\naverage).\n","authors":["Ruozhen He","Paola Cascante-Bonilla","Ziyan Yang","Alexander C. Berg","Vicente Ordonez"],"pdf_url":"https://arxiv.org/pdf/2312.04554v1.pdf","comment":"Project Page: https://catherine-r-he.github.io/SelfEQ/"},{"id":"http://arxiv.org/abs/2305.03514v2","updated":"2023-12-07T18:37:55Z","published":"2023-04-12T17:33:28Z","title":"Can Large Language Models Transform Computational Social Science?","summary":" Large Language Models (LLMs) are capable of successfully performing many\nlanguage processing tasks zero-shot (without training data). If zero-shot LLMs\ncan also reliably classify and explain social phenomena like persuasiveness and\npolitical ideology, then LLMs could augment the Computational Social Science\n(CSS) pipeline in important ways. This work provides a road map for using LLMs\nas CSS tools. Towards this end, we contribute a set of prompting best practices\nand an extensive evaluation pipeline to measure the zero-shot performance of 13\nlanguage models on 25 representative English CSS benchmarks. On taxonomic\nlabeling tasks (classification), LLMs fail to outperform the best fine-tuned\nmodels but still achieve fair levels of agreement with humans. On free-form\ncoding tasks (generation), LLMs produce explanations that often exceed the\nquality of crowdworkers' gold references. We conclude that the performance of\ntoday's LLMs can augment the CSS research pipeline in two ways: (1) serving as\nzero-shot data annotators on human annotation teams, and (2) bootstrapping\nchallenging creative generation tasks (e.g., explaining the underlying\nattributes of a text). 
In summary, LLMs are poised to meaningfully participate\nin social science analysis in partnership with humans.\n","authors":["Caleb Ziems","William Held","Omar Shaikh","Jiaao Chen","Zhehao Zhang","Diyi Yang"],"pdf_url":"https://arxiv.org/pdf/2305.03514v2.pdf","comment":"To appear in \"Computational Linguistics\" (CL)"},{"id":"http://arxiv.org/abs/2312.04515v1","updated":"2023-12-07T18:34:57Z","published":"2023-12-07T18:34:57Z","title":"Efficient Monotonic Multihead Attention","summary":" We introduce the Efficient Monotonic Multihead Attention (EMMA), a\nstate-of-the-art simultaneous translation model with numerically-stable and\nunbiased monotonic alignment estimation. In addition, we present improved\ntraining and inference strategies, including simultaneous fine-tuning from an\noffline translation model and reduction of monotonic alignment variance. The\nexperimental results demonstrate that the proposed model attains\nstate-of-the-art performance in simultaneous speech-to-text translation on the\nSpanish and English translation task.\n","authors":["Xutai Ma","Anna Sun","Siqi Ouyang","Hirofumi Inaguma","Paden Tomasello"],"pdf_url":"https://arxiv.org/pdf/2312.04515v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04511v1","updated":"2023-12-07T18:32:04Z","published":"2023-12-07T18:32:04Z","title":"An LLM Compiler for Parallel Function Calling","summary":" Large Language Models (LLMs) have shown remarkable results on various complex\nreasoning benchmarks. The reasoning capabilities of LLMs enable them to execute\nfunction calls, using user-provided functions to overcome their inherent\nlimitations, such as knowledge cutoffs, poor arithmetic skills, or lack of\naccess to private data. This development has expanded LLMs' scope to include\nmulti-function calling, where LLMs are equipped with a variety of functions and\nselect the proper functions based on the context. Multi-function calling\nabilities of LLMs have catalyzed LLM-based software development, allowing them\nto tackle more complex problems. However, current methods for multi-function\ncalling often require sequential reasoning and acting for each function which\ncan result in high latency, cost, and sometimes inaccurate behavior. To address\nthis, we introduce LLMCompiler, which executes functions in parallel to\nefficiently orchestrate multi-function calling. Drawing from the principles of\nclassical compilers, LLMCompiler streamlines parallel function calling with\nthree components: (i) an LLM Planner, formulating execution strategies and\ndependencies; (ii) a Task Fetching Unit, dispatching function calling tasks;\nand (iii) an Executor, executing these tasks in parallel. LLMCompiler\nautomatically computes an optimized orchestration for the function calls and\ncan be used with open-source models such as LLaMA-2. We have benchmarked\nLLMCompiler on a range of tasks including cases with non-trivial\ninter-dependency between function calls, as well as cases that require dynamic\nreplanning based on intermediate results. We observe consistent latency speedup\nof up to 3.7x, cost savings of up to 6.7x, and accuracy improvement of up to\n~9% as compared to ReAct. Additionally, LLMCompiler achieves up to 1.35x\nlatency gain over OpenAI's recent parallel function calling, while achieving\nsimilar accuracy.\n","authors":["Sehoon Kim","Suhong Moon","Ryan Tabrizi","Nicholas Lee","Michael W. 
Mahoney","Kurt Keutzer","Amir Gholami"],"pdf_url":"https://arxiv.org/pdf/2312.04511v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04510v1","updated":"2023-12-07T18:30:15Z","published":"2023-12-07T18:30:15Z","title":"A Block Metropolis-Hastings Sampler for Controllable Energy-based Text\n Generation","summary":" Recent work has shown that energy-based language modeling is an effective\nframework for controllable text generation because it enables flexible\nintegration of arbitrary discriminators. However, because energy-based LMs are\nglobally normalized, approximate techniques like Metropolis-Hastings (MH) are\nrequired for inference. Past work has largely explored simple proposal\ndistributions that modify a single token at a time, like in Gibbs sampling. In\nthis paper, we develop a novel MH sampler that, in contrast, proposes re-writes\nof the entire sequence in each step via iterative prompting of a large language\nmodel. Our new sampler (a) allows for more efficient and accurate sampling from\na target distribution and (b) allows generation length to be determined through\nthe sampling procedure rather than fixed in advance, as past work has required.\nWe perform experiments on two controlled generation tasks, showing both\ndownstream performance gains and more accurate target distribution sampling in\ncomparison with single-token proposal techniques.\n","authors":["Jarad Forristal","Niloofar Mireshghallah","Greg Durrett","Taylor Berg-Kirkpatrick"],"pdf_url":"https://arxiv.org/pdf/2312.04510v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.09820v2","updated":"2023-12-07T18:08:34Z","published":"2023-01-24T05:11:17Z","title":"A Stability Analysis of Fine-Tuning a Pre-Trained Model","summary":" Fine-tuning a pre-trained model (such as BERT, ALBERT, RoBERTa, T5, GPT,\netc.) has proven to be one of the most promising paradigms in recent NLP\nresearch. However, numerous recent works indicate that fine-tuning suffers from\nthe instability problem, i.e., tuning the same model under the same setting\nresults in significantly different performance. Many recent works have proposed\ndifferent methods to solve this problem, but there is no theoretical\nunderstanding of why and how these methods work. In this paper, we propose a\nnovel theoretical stability analysis of fine-tuning that focuses on two\ncommonly used settings, namely, full fine-tuning and head tuning. We define the\nstability under each setting and prove the corresponding stability bounds. The\ntheoretical bounds explain why and how several existing methods can stabilize\nthe fine-tuning procedure. In addition to being able to explain most of the\nobserved empirical discoveries, our proposed theoretical analysis framework can\nalso help in the design of effective and provable methods. Based on our theory,\nwe propose three novel strategies to stabilize the fine-tuning procedure,\nnamely, Maximal Margin Regularizer (MMR), Multi-Head Loss (MHLoss), and Self\nUnsupervised Re-Training (SURT). We extensively evaluate our proposed\napproaches on 11 widely used real-world benchmark datasets, as well as hundreds\nof synthetic classification datasets. 
The experiment results show that our\nproposed methods significantly stabilize the fine-tuning procedure and also\ncorroborate our theoretical analysis.\n","authors":["Zihao Fu","Anthony Man-Cho So","Nigel Collier"],"pdf_url":"https://arxiv.org/pdf/2301.09820v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04474v1","updated":"2023-12-07T17:51:43Z","published":"2023-12-07T17:51:43Z","title":"Chain of Code: Reasoning with a Language Model-Augmented Code Emulator","summary":" Code provides a general syntactic structure to build complex programs and\nperform precise computations when paired with a code interpreter -- we\nhypothesize that language models (LMs) can leverage code-writing to improve\nChain of Thought reasoning not only for logic and arithmetic tasks, but also\nfor linguistic ones (and in particular, those that are a mix of both). For\nexample, consider prompting an LM to write code that counts the number of times\nit detects sarcasm in an essay: the LM may struggle to write an implementation\nfor \"detect_sarcasm(string)\" that can be executed by the interpreter (handling\nthe edge cases would be insurmountable). However, LMs may still produce a valid\nsolution if they are used not only to write the code, but also to selectively\n\"emulate\" the interpreter by generating the expected output of\n\"detect_sarcasm(string)\" and other lines of code (e.g., that the interpreter\ncould not compile). In this work, we propose Chain of Code (CoC), a simple yet\nsurprisingly effective extension that improves LM code-driven reasoning. The\nkey idea is to encourage LMs to format linguistic sub-tasks in a program as\nflexible pseudocode that the compiler can explicitly catch undefined behaviors\nand hand off to simulate with an LM (as an \"LMulator\"). Experiments demonstrate\nthat Chain of Code outperforms Chain of Thought and other baselines across a\nvariety of benchmarks; on BIG-Bench Hard, Chain of Code achieves 84%, a gain of\n12% over Chain of Thought. CoC scales well with large and small models alike,\nand broadens the scope of reasoning questions that LMs can correctly answer by\n\"thinking in code\". Project webpage: https://chain-of-code.github.io/.\n","authors":["Chengshu Li","Jacky Liang","Andy Zeng","Xinyun Chen","Karol Hausman","Dorsa Sadigh","Sergey Levine","Li Fei-Fei","Fei Xia","Brian Ichter"],"pdf_url":"https://arxiv.org/pdf/2312.04474v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04469v1","updated":"2023-12-07T17:41:44Z","published":"2023-12-07T17:41:44Z","title":"On the Learnability of Watermarks for Language Models","summary":" Watermarking of language model outputs enables statistical detection of\nmodel-generated text, which has many applications in the responsible deployment\nof language models. Existing watermarking strategies operate by altering the\ndecoder of an existing language model, and the ability for a language model to\ndirectly learn to generate the watermark would have significant implications\nfor the real-world deployment of watermarks. First, learned watermarks could be\nused to build open models that naturally generate watermarked text, allowing\nfor open models to benefit from watermarking. Second, if watermarking is used\nto determine the provenance of generated text, an adversary can hurt the\nreputation of a victim model by spoofing its watermark and generating damaging\nwatermarked text. 
To investigate the learnability of watermarks, we propose\nwatermark distillation, which trains a student model to behave like a teacher\nmodel that uses decoding-based watermarking. We test our approach on three\ndistinct decoding-based watermarking strategies and various hyperparameter\nsettings, finding that models can learn to generate watermarked text with high\ndetectability. We also find limitations to learnability, including the loss of\nwatermarking capabilities under fine-tuning on normal text and high sample\ncomplexity when learning low-distortion watermarks.\n","authors":["Chenchen Gu","Xiang Lisa Li","Percy Liang","Tatsunori Hashimoto"],"pdf_url":"https://arxiv.org/pdf/2312.04469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04455v1","updated":"2023-12-07T17:24:51Z","published":"2023-12-07T17:24:51Z","title":"Fortify the Shortest Stave in Attention: Enhancing Context Awareness of\n Large Language Models for Effective Tool Use","summary":" Recent advancements in large language models (LLMs) have significantly\nexpanded their functionality and skills as tool agents. In this paper, we argue\nthat a waveform pattern in the model's attention allocation has an impact on\nthe tool use performance, which degrades when the position of essential\ninformation hits the trough zone. To address this issue, we propose a novel\ninference method named Attention Buckets. This approach enables LLMs to handle\ncontext by conducting parallel processes, each featuring a unique RoPE angle\nbase that shapes the attention waveform. Attention Buckets ensures that an\nattention trough of a particular process can be compensated with an attention\npeak of another run, reducing the risk of the LLM missing essential information\nresiding within the attention trough. Our extensive experiments on the widely\nrecognized tool use benchmark demonstrate the efficacy of our approach, where a\n7B-parameter open-source model enhanced by Attention Buckets achieves SOTA\nperformance on par with GPT-4.\n","authors":["Yuhan Chen","Ang Lv","Ting-En Lin","Changyu Chen","Yuchuan Wu","Fei Huang","Yongbin Li","Rui Yan"],"pdf_url":"https://arxiv.org/pdf/2312.04455v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04440v1","updated":"2023-12-07T17:06:20Z","published":"2023-12-07T17:06:20Z","title":"OpenAsp: A Benchmark for Multi-document Open Aspect-based Summarization","summary":" The performance of automatic summarization models has improved dramatically\nin recent years. Yet, there is still a gap in meeting specific information\nneeds of users in real-world scenarios, particularly when a targeted summary is\nsought, such as in the useful aspect-based summarization setting targeted in\nthis paper. Previous datasets and studies for this setting have predominantly\nconcentrated on a limited set of pre-defined aspects, focused solely on single\ndocument inputs, or relied on synthetic data. To advance research on more\nrealistic scenarios, we introduce OpenAsp, a benchmark for multi-document\n\\textit{open} aspect-based summarization. This benchmark is created using a\nnovel and cost-effective annotation protocol, by which an open aspect dataset\nis derived from existing generic multi-document summarization datasets. We\nanalyze the properties of OpenAsp showcasing its high-quality content. 
Further,\nwe show that the realistic open-aspect setting realized in OpenAsp poses a\nchallenge for current state-of-the-art summarization models, as well as for\nlarge language models.\n","authors":["Shmuel Amar","Liat Schiff","Ori Ernst","Asi Shefer","Ori Shapira","Ido Dagan"],"pdf_url":"https://arxiv.org/pdf/2312.04440v1.pdf","comment":"EMNLP 2023"},{"id":"http://arxiv.org/abs/2305.06164v2","updated":"2023-12-07T15:56:21Z","published":"2023-05-04T16:04:41Z","title":"Conversational Semantic Parsing using Dynamic Context Graphs","summary":" In this paper we consider the task of conversational semantic parsing over\ngeneral purpose knowledge graphs (KGs) with millions of entities, and thousands\nof relation-types. We focus on models which are capable of interactively\nmapping user utterances into executable logical forms (e.g., Sparql) in the\ncontext of the conversational history. Our key idea is to represent information\nabout an utterance and its context via a subgraph which is created dynamically,\ni.e., the number of nodes varies per utterance. Rather than treating the\nsubgraph as a sequence, we exploit its underlying structure and encode it with\na graph neural network which further allows us to represent a large number of\n(unseen) nodes. Experimental results show that dynamic context modeling is\nsuperior to static approaches, delivering performance improvements across the\nboard (i.e., for simple and complex questions). Our results further confirm\nthat modeling the structure of context is better at processing discourse\ninformation, (i.e., at handling ellipsis and resolving coreference) and longer\ninteractions.\n","authors":["Parag Jain","Mirella Lapata"],"pdf_url":"https://arxiv.org/pdf/2305.06164v2.pdf","comment":"camera ready"},{"id":"http://arxiv.org/abs/2312.04372v1","updated":"2023-12-07T15:43:52Z","published":"2023-12-07T15:43:52Z","title":"LaMPilot: An Open Benchmark Dataset for Autonomous Driving with Language\n Model Programs","summary":" We present LaMPilot, a novel framework for planning in the field of\nautonomous driving, rethinking the task as a code-generation process that\nleverages established behavioral primitives. This approach aims to address the\nchallenge of interpreting and executing spontaneous user instructions such as\n\"overtake the car ahead,\" which have typically posed difficulties for existing\nframeworks. We introduce the LaMPilot benchmark specifically designed to\nquantitatively evaluate the efficacy of Large Language Models (LLMs) in\ntranslating human directives into actionable driving policies. We then evaluate\na wide range of state-of-the-art code generation language models on tasks from\nthe LaMPilot Benchmark. The results of the experiments showed that GPT-4, with\nhuman feedback, achieved an impressive task completion rate of 92.7% and a\nminimal collision rate of 0.9%. To encourage further investigation in this\narea, our code and dataset will be made available.\n","authors":["Yunsheng Ma","Can Cui","Xu Cao","Wenqian Ye","Peiran Liu","Juanwu Lu","Amr Abdelraouf","Rohit Gupta","Kyungtae Han","Aniket Bera","James M. Rehg","Ziran Wang"],"pdf_url":"https://arxiv.org/pdf/2312.04372v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.16733v3","updated":"2023-12-07T15:30:28Z","published":"2023-11-28T12:29:33Z","title":"LLMs for Science: Usage for Code Generation and Data Analysis","summary":" Large language models (LLMs) have been touted to enable increased\nproductivity in many areas of today's work life. 
Scientific research as an area\nof work is no exception: the potential of LLM-based tools to assist in the\ndaily work of scientists has become a highly discussed topic across\ndisciplines. However, we are only at the very onset of this subject of study.\nIt is still unclear how the potential of LLMs will materialise in research\npractice. With this study, we give first empirical evidence on the use of LLMs\nin the research process. We have investigated a set of use cases for LLM-based\ntools in scientific research, and conducted a first study to assess to which\ndegree current tools are helpful. In this paper we report specifically on use\ncases related to software engineering, such as generating application code and\ndeveloping scripts for data analytics. While we studied seemingly simple use\ncases, results across tools differ significantly. Our results highlight the\npromise of LLM-based tools in general, yet we also observe various issues,\nparticularly regarding the integrity of the output these tools provide.\n","authors":["Mohamed Nejjar","Luca Zacharias","Fabian Stiehle","Ingo Weber"],"pdf_url":"https://arxiv.org/pdf/2311.16733v3.pdf","comment":"Preprint; In Submission"},{"id":"http://arxiv.org/abs/2312.04362v1","updated":"2023-12-07T15:29:34Z","published":"2023-12-07T15:29:34Z","title":"PCoQA: Persian Conversational Question Answering Dataset","summary":" Humans seek information regarding a specific topic through performing a\nconversation containing a series of questions and answers. In the pursuit of\nconversational question answering research, we introduce the PCoQA, the first\n\\textbf{P}ersian \\textbf{Co}nversational \\textbf{Q}uestion \\textbf{A}nswering\ndataset, a resource comprising information-seeking dialogs encompassing a total\nof 9,026 contextually-driven questions. Each dialog involves a questioner, a\nresponder, and a document from the Wikipedia; The questioner asks several\ninter-connected questions from the text and the responder provides a span of\nthe document as the answer for each question. PCoQA is designed to present\nnovel challenges compared to previous question answering datasets including\nhaving more open-ended non-factual answers, longer answers, and fewer lexical\noverlaps. This paper not only presents the comprehensive PCoQA dataset but also\nreports the performance of various benchmark models. Our models include\nbaseline models and pre-trained models, which are leveraged to boost the\nperformance of the model. The dataset and benchmarks are available at our\nGithub page.\n","authors":["Hamed Hematian Hemati","Atousa Toghyani","Atena Souri","Sayed Hesam Alavian","Hossein Sameti","Hamid Beigy"],"pdf_url":"https://arxiv.org/pdf/2312.04362v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04350v1","updated":"2023-12-07T15:12:12Z","published":"2023-12-07T15:12:12Z","title":"CLadder: A Benchmark to Assess Causal Reasoning Capabilities of Language\n Models","summary":" The ability to perform causal reasoning is widely considered a core feature\nof intelligence. In this work, we investigate whether large language models\n(LLMs) can coherently reason about causality. Much of the existing work in\nnatural language processing (NLP) focuses on evaluating commonsense causal\nreasoning in LLMs, thus failing to assess whether a model can perform causal\ninference in accordance with a set of well-defined formal rules. 
To address\nthis, we propose a new NLP task, causal inference in natural language, inspired\nby the \"causal inference engine\" postulated by Judea Pearl et al. We compose a\nlarge dataset, CLadder, with 10K samples: based on a collection of causal\ngraphs and queries (associational, interventional, and counterfactual), we\nobtain symbolic questions and ground-truth answers, through an oracle causal\ninference engine. These are then translated into natural language. We evaluate\nmultiple LLMs on our dataset, and we introduce and evaluate a bespoke\nchain-of-thought prompting strategy, CausalCoT. We show that our task is highly\nchallenging for LLMs, and we conduct an in-depth analysis to gain deeper\ninsight into the causal reasoning abilities of LLMs. Our data is open-sourced\nat https://huggingface.co/datasets/causalNLP/cladder, and our code can be found\nat https://github.com/causalNLP/cladder.\n","authors":["Zhijing Jin","Yuen Chen","Felix Leeb","Luigi Gresele","Ojasv Kamal","Zhiheng Lyu","Kevin Blin","Fernando Gonzalez Adauto","Max Kleiman-Weiner","Mrinmaya Sachan","Bernhard Schölkopf"],"pdf_url":"https://arxiv.org/pdf/2312.04350v1.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2309.07311v4","updated":"2023-12-07T15:09:48Z","published":"2023-09-13T20:57:11Z","title":"Sudden Drops in the Loss: Syntax Acquisition, Phase Transitions, and\n Simplicity Bias in MLMs","summary":" Most interpretability research in NLP focuses on understanding the behavior\nand features of a fully trained model. However, certain insights into model\nbehavior may only be accessible by observing the trajectory of the training\nprocess. We present a case study of syntax acquisition in masked language\nmodels (MLMs) that demonstrates how analyzing the evolution of interpretable\nartifacts throughout training deepens our understanding of emergent behavior.\nIn particular, we study Syntactic Attention Structure (SAS), a naturally\nemerging property of MLMs wherein specific Transformer heads tend to focus on\nspecific syntactic relations. We identify a brief window in pretraining when\nmodels abruptly acquire SAS, concurrent with a steep drop in loss. This\nbreakthrough precipitates the subsequent acquisition of linguistic\ncapabilities. We then examine the causal role of SAS by manipulating SAS during\ntraining, and demonstrate that SAS is necessary for the development of\ngrammatical capabilities. We further find that SAS competes with other\nbeneficial traits during training, and that briefly suppressing SAS improves\nmodel quality. These findings offer an interpretation of a real-world example\nof both simplicity bias and breakthrough training dynamics.\n","authors":["Angelica Chen","Ravid Shwartz-Ziv","Kyunghyun Cho","Matthew L. 
Leavitt","Naomi Saphra"],"pdf_url":"https://arxiv.org/pdf/2309.07311v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04348v1","updated":"2023-12-07T15:09:24Z","published":"2023-12-07T15:09:24Z","title":"When Input Integers are Given in the Unary Numeral Representation","summary":" Many NP-complete problems take integers as part of their input instances.\nThese input integers are generally binarized, that is, provided in the form of\nthe \"binary\" numeral representation, and the lengths of such binary forms are\nused as a basis unit to measure the computational complexity of the problems.\nIn sharp contrast, the \"unarization\" (or the \"unary\" numeral representation) of\nnumbers has been known to bring a remarkably different effect onto the\ncomputational complexity of the problems. When no computational-complexity\ndifference is observed between binarization and unarization of instances, on\nthe contrary, the problems are said to be strong NP-complete. This work\nattempts to spotlight an issue of how the unarization of instances affects the\ncomputational complexity of various combinatorial problems. We present numerous\nNP-complete (or even NP-hard) problems, which turn out to be easily solvable\nwhen input integers are represented in unary. We then discuss the computational\ncomplexities of such problems when taking unary-form integer inputs. We hope\nthat a list of such problems signifies the structural differences between\nstrong NP-completeness and non-strong NP-completeness.\n","authors":["Tomoyuki Yamakami"],"pdf_url":"https://arxiv.org/pdf/2312.04348v1.pdf","comment":"(A4, 10pt, 12 pages, 1 figure) This is a preliminary report of the\n current work, which has appeared in the Proceedings of the 24th Italian\n Conference on Theoretical Computer Science (ICTCS 2023), Palermo, Italy,\n September 13--15, 2023, CEUR Workshop Proceedings (CEUR-WS.org)"},{"id":"http://arxiv.org/abs/2312.04344v1","updated":"2023-12-07T15:05:59Z","published":"2023-12-07T15:05:59Z","title":"Enhancing Medical Task Performance in GPT-4V: A Comprehensive Study on\n Prompt Engineering Strategies","summary":" OpenAI's latest large vision-language model (LVLM), GPT-4V(ision), has piqued\nconsiderable interest for its potential in medical applications. Despite its\npromise, recent studies and internal reviews highlight its underperformance in\nspecialized medical tasks. This paper explores the boundary of GPT-4V's\ncapabilities in medicine, particularly in processing complex imaging data from\nendoscopies, CT scans, and MRIs etc. Leveraging open-source datasets, we\nassessed its foundational competencies, identifying substantial areas for\nenhancement. Our research emphasizes prompt engineering, an often-underutilized\nstrategy for improving AI responsiveness. Through iterative testing, we refined\nthe model's prompts, significantly improving its interpretative accuracy and\nrelevance in medical imaging. From our comprehensive evaluations, we distilled\n10 effective prompt engineering techniques, each fortifying GPT-4V's medical\nacumen. These methodical enhancements facilitate more reliable, precise, and\nclinically valuable insights from GPT-4V, advancing its operability in critical\nhealthcare environments. 
Our findings are pivotal for those employing AI in\nmedicine, providing clear, actionable guidance on harnessing GPT-4V's full\ndiagnostic potential.\n","authors":["Pengcheng Chen","Ziyan Huang","Zhongying Deng","Tianbin Li","Yanzhou Su","Haoyu Wang","Jin Ye","Yu Qiao","Junjun He"],"pdf_url":"https://arxiv.org/pdf/2312.04344v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04339v1","updated":"2023-12-07T14:59:15Z","published":"2023-12-07T14:59:15Z","title":"Merging by Matching Models in Task Subspaces","summary":" Model merging aims to cheaply combine individual task-specific models into a\nsingle multitask model. In this work, we view past merging methods as\nleveraging different notions of a ''task subspace'' in which models are matched\nbefore being merged. We connect the task subspace of a given model to its loss\nlandscape and formalize how this approach to model merging can be seen as\nsolving a linear system of equations. While past work has generally been\nlimited to linear systems that have a closed-form solution, we consider using\nthe conjugate gradient method to find a solution. We show that using the\nconjugate gradient method can outperform closed-form solutions, enables merging\nvia linear systems that are otherwise intractable to solve, and flexibly allows\nchoosing from a wide variety of initializations and estimates for the ''task\nsubspace''. We ultimately demonstrate that our merging framework called\n''Matching Models in their Task Subspace'' (MaTS) achieves state-of-the-art\nresults in multitask and intermediate-task model merging. We release all of the\ncode and checkpoints used in our work at https://github.com/r-three/mats.\n","authors":["Derek Tam","Mohit Bansal","Colin Raffel"],"pdf_url":"https://arxiv.org/pdf/2312.04339v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00168v2","updated":"2023-12-07T14:53:48Z","published":"2023-11-30T19:59:19Z","title":"Navigating News Narratives: A Media Bias Analysis Dataset","summary":" The proliferation of biased news narratives across various media platforms\nhas become a prominent challenge, influencing public opinion on critical topics\nlike politics, health, and climate change. This paper introduces the\n\"Navigating News Narratives: A Media Bias Analysis Dataset\", a comprehensive\ndataset to address the urgent need for tools to detect and analyze media bias.\nThis dataset encompasses a broad spectrum of biases, making it a unique and\nvaluable asset in the field of media studies and artificial intelligence. The\ndataset is available at\nhttps://huggingface.co/datasets/newsmediabias/news-bias-full-data.\n","authors":["Shaina Raza"],"pdf_url":"https://arxiv.org/pdf/2312.00168v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04333v1","updated":"2023-12-07T14:50:41Z","published":"2023-12-07T14:50:41Z","title":"Beyond Surface: Probing LLaMA Across Scales and Layers","summary":" This paper presents an in-depth analysis of Large Language Models (LLMs),\nfocusing on LLaMA, a prominent open-source foundational model in natural\nlanguage processing. Instead of assessing LLaMA through its generative output,\nwe design multiple-choice tasks to probe its intrinsic understanding in\nhigh-order tasks such as reasoning and computation. We examine the model\nhorizontally, comparing different sizes, and vertically, assessing different\nlayers. 
We unveil several key and uncommon findings based on the designed\nprobing tasks: (1) Horizontally, enlarging model sizes almost could not\nautomatically impart additional knowledge or computational prowess. Instead, it\ncan enhance reasoning abilities, especially in math problem solving, and helps\nreduce hallucinations, but only beyond certain size thresholds; (2) In vertical\nanalysis, the lower layers of LLaMA lack substantial arithmetic and factual\nknowledge, showcasing logical thinking, multilingual and recognitive abilities,\nwith top layers housing most computational power and real-world knowledge.\n","authors":["Nuo Chen","Ning Wu","Shining Liang","Ming Gong","Linjun Shou","Dongmei Zhang","Jia Li"],"pdf_url":"https://arxiv.org/pdf/2312.04333v1.pdf","comment":"15 pages"},{"id":"http://arxiv.org/abs/2310.16427v2","updated":"2023-12-07T14:39:22Z","published":"2023-10-25T07:47:01Z","title":"PromptAgent: Strategic Planning with Language Models Enables\n Expert-level Prompt Optimization","summary":" Highly effective, task-specific prompts are often heavily engineered by\nexperts to integrate detailed instructions and domain insights based on a deep\nunderstanding of both instincts of large language models (LLMs) and the\nintricacies of the target task. However, automating the generation of such\nexpert-level prompts remains elusive. Existing prompt optimization methods tend\nto overlook the depth of domain knowledge and struggle to efficiently explore\nthe vast space of expert-level prompts. Addressing this, we present\nPromptAgent, an optimization method that autonomously crafts prompts equivalent\nin quality to those handcrafted by experts. At its core, PromptAgent views\nprompt optimization as a strategic planning problem and employs a principled\nplanning algorithm, rooted in Monte Carlo tree search, to strategically\nnavigate the expert-level prompt space. Inspired by human-like trial-and-error\nexploration, PromptAgent induces precise expert-level insights and in-depth\ninstructions by reflecting on model errors and generating constructive error\nfeedback. Such a novel framework allows the agent to iteratively examine\nintermediate prompts (states), refine them based on error feedbacks (actions),\nsimulate future rewards, and search for high-reward paths leading to expert\nprompts. We apply PromptAgent to 12 tasks spanning three practical domains:\nBIG-Bench Hard (BBH), as well as domain-specific and general NLP tasks, showing\nit significantly outperforms strong Chain-of-Thought and recent prompt\noptimization baselines. Extensive analyses emphasize its capability to craft\nexpert-level, detailed, and domain-insightful prompts with great efficiency and\ngeneralizability.\n","authors":["Xinyuan Wang","Chenxi Li","Zhen Wang","Fan Bai","Haotian Luo","Jiayou Zhang","Nebojsa Jojic","Eric P. Xing","Zhiting Hu"],"pdf_url":"https://arxiv.org/pdf/2310.16427v2.pdf","comment":"34 pages, 10 figures"},{"id":"http://arxiv.org/abs/2305.11426v3","updated":"2023-12-07T14:27:29Z","published":"2023-05-19T04:46:04Z","title":"Post Hoc Explanations of Language Models Can Improve Language Models","summary":" Large Language Models (LLMs) have demonstrated remarkable capabilities in\nperforming complex tasks. Moreover, recent research has shown that\nincorporating human-annotated rationales (e.g., Chain-of-Thought prompting)\nduring in-context learning can significantly enhance the performance of these\nmodels, particularly on tasks that require reasoning capabilities. 
However,\nincorporating such rationales poses challenges in terms of scalability as this\nrequires a high degree of human involvement. In this work, we present a novel\nframework, Amplifying Model Performance by Leveraging In-Context Learning with\nPost Hoc Explanations (AMPLIFY), which addresses the aforementioned challenges\nby automating the process of rationale generation. To this end, we leverage\npost hoc explanation methods which output attribution scores (explanations)\ncapturing the influence of each of the input features on model predictions.\nMore specifically, we construct automated natural language rationales that\nembed insights from post hoc explanations to provide corrective signals to\nLLMs. Extensive experimentation with real-world datasets demonstrates that our\nframework, AMPLIFY, leads to prediction accuracy improvements of about 10-25%\nover a wide range of tasks, including those where prior approaches which rely\non human-annotated rationales such as Chain-of-Thought prompting fall short.\nOur work makes one of the first attempts at highlighting the potential of post\nhoc explanations as valuable tools for enhancing the effectiveness of LLMs.\nFurthermore, we conduct additional empirical analyses and ablation studies to\ndemonstrate the impact of each of the components of AMPLIFY, which, in turn,\nleads to critical insights for refining in-context learning.\n","authors":["Satyapriya Krishna","Jiaqi Ma","Dylan Slack","Asma Ghandeharioun","Sameer Singh","Himabindu Lakkaraju"],"pdf_url":"https://arxiv.org/pdf/2305.11426v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04306v1","updated":"2023-12-07T14:04:15Z","published":"2023-12-07T14:04:15Z","title":"nerblackbox: A High-level Library for Named Entity Recognition in Python","summary":" We present nerblackbox, a python library to facilitate the use of\nstate-of-the-art transformer-based models for named entity recognition. It\nprovides simple-to-use yet powerful methods to access data and models from a\nwide range of sources, for fully automated model training and evaluation as\nwell as versatile model inference. While many technical challenges are solved\nand hidden from the user by default, nerblackbox also offers fine-grained\ncontrol and a rich set of customizable features. It is thus targeted both at\napplication-oriented developers as well as machine learning experts and\nresearchers.\n","authors":["Felix Stollenwerk"],"pdf_url":"https://arxiv.org/pdf/2312.04306v1.pdf","comment":"Published in Proceedings of the 3rd Workshop for Natural Language\n Processing Open Source Software (NLP-OSS 2023)"},{"id":"http://arxiv.org/abs/2312.04302v1","updated":"2023-12-07T13:53:29Z","published":"2023-12-07T13:53:29Z","title":"Prompt Highlighter: Interactive Control for Multi-Modal LLMs","summary":" This study targets a critical aspect of multi-modal LLMs' (LLMs&VLMs)\ninference: explicit controllable text generation. Multi-modal LLMs empower\nmulti-modality understanding with the capability of semantic generation yet\nbring less explainability and heavier reliance on prompt contents due to their\nautoregressive generative nature. While manipulating prompt formats could\nimprove outputs, designing specific and precise prompts per task can be\nchallenging and ineffective. To tackle this issue, we introduce a novel\ninference method, Prompt Highlighter, which enables users to highlight specific\nprompt spans to interactively control the focus during generation. 
Motivated by\nthe classifier-free diffusion guidance, we form regular and unconditional\ncontext pairs based on highlighted tokens, demonstrating that the\nautoregressive generation in models can be guided in a classifier-free way.\nNotably, we find that, during inference, guiding the models with highlighted\ntokens through the attention weights leads to more desired outputs. Our\napproach is compatible with current LLMs and VLMs, achieving impressive\ncustomized generation results without training. Experiments confirm its\neffectiveness in focusing on input contexts and generating reliable content.\nWithout tuning on LLaVA-v1.5, our method secured 69.5 in the MMBench test and\n1552.5 in MME-perception. The code is available at:\nhttps://github.com/dvlab-research/Prompt-Highlighter/\n","authors":["Yuechen Zhang","Shengju Qian","Bohao Peng","Shu Liu","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2312.04302v1.pdf","comment":"19 pages. Project Page:\n https://julianjuaner.github.io/projects/PromptHighlighter"},{"id":"http://arxiv.org/abs/2311.05876v2","updated":"2023-12-07T12:42:07Z","published":"2023-11-10T05:24:04Z","title":"Trends in Integration of Knowledge and Large Language Models: A Survey\n and Taxonomy of Methods, Benchmarks, and Applications","summary":" Large language models (LLMs) exhibit superior performance on various natural\nlanguage tasks, but they are susceptible to issues stemming from outdated data\nand domain-specific limitations. In order to address these challenges,\nresearchers have pursued two primary strategies, knowledge editing and\nretrieval augmentation, to enhance LLMs by incorporating external information\nfrom different aspects. Nevertheless, there is still a notable absence of a\ncomprehensive survey. In this paper, we propose a review to discuss the trends\nin integration of knowledge and large language models, including taxonomy of\nmethods, benchmarks, and applications. In addition, we conduct an in-depth\nanalysis of different methods and point out potential research directions in\nthe future. We hope this survey offers the community quick access and a\ncomprehensive overview of this research area, with the intention of inspiring\nfuture research endeavors.\n","authors":["Zhangyin Feng","Weitao Ma","Weijiang Yu","Lei Huang","Haotian Wang","Qianglong Chen","Weihua Peng","Xiaocheng Feng","Bing Qin","Ting liu"],"pdf_url":"https://arxiv.org/pdf/2311.05876v2.pdf","comment":"Work in progress; 22 pages. This work has been submitted to the IEEE\n for possible publication. Copyright may be transferred without notice, after\n which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2312.04262v1","updated":"2023-12-07T12:40:00Z","published":"2023-12-07T12:40:00Z","title":"PsyChat: A Client-Centric Dialogue System for Mental Health Support","summary":" Dialogue systems are increasingly integrated into mental health support to\nhelp clients facilitate exploration, gain insight, take action, and ultimately\nheal themselves. For a dialogue system to be practical and user-friendly, it\nshould be client-centric, focusing on the client's behaviors. However, existing\ndialogue systems publicly available for mental health support often concentrate\nsolely on the counselor's strategies rather than the behaviors expressed by\nclients. This can lead to the implementation of unreasonable or inappropriate\ncounseling strategies and corresponding responses from the dialogue system. 
To\naddress this issue, we propose PsyChat, a client-centric dialogue system that\nprovides psychological support through online chat. The client-centric dialogue\nsystem comprises five modules: client behavior recognition, counselor strategy\nselection, input packer, response generator intentionally fine-tuned to produce\nresponses, and response selection. Both automatic and human evaluations\ndemonstrate the effectiveness and practicality of our proposed dialogue system\nfor real-life mental health support. Furthermore, we employ our proposed\ndialogue system to simulate a real-world client-virtual-counselor interaction\nscenario. The system is capable of predicting the client's behaviors, selecting\nappropriate counselor strategies, and generating accurate and suitable\nresponses, as demonstrated in the scenario.\n","authors":["Huachuan Qiu","Anqi Li","Lizhi Ma","Zhenzhong Lan"],"pdf_url":"https://arxiv.org/pdf/2312.04262v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.00502v2","updated":"2023-12-07T12:16:42Z","published":"2023-11-01T13:08:50Z","title":"Efficient LLM Inference on CPUs","summary":" Large language models (LLMs) have demonstrated remarkable performance and\ntremendous potential across a wide range of tasks. However, deploying these\nmodels has been challenging due to the astronomical amount of model parameters,\nwhich requires a demand for large memory capacity and high memory bandwidth. In\nthis paper, we propose an effective approach that can make the deployment of\nLLMs more efficiently. We support an automatic INT4 weight-only quantization\nflow and design a special LLM runtime with highly-optimized kernels to\naccelerate the LLM inference on CPUs. We demonstrate the general applicability\nof our approach on popular LLMs including Llama2, Llama, GPT-NeoX, and showcase\nthe extreme inference efficiency on CPUs. The code is publicly available at:\nhttps://github.com/intel/intel-extension-for-transformers.\n","authors":["Haihao Shen","Hanwen Chang","Bo Dong","Yu Luo","Hengyu Meng"],"pdf_url":"https://arxiv.org/pdf/2311.00502v2.pdf","comment":"NeurIPS'2023 on Efficient Natural Language and Speech Processing"},{"id":"http://arxiv.org/abs/2312.04219v1","updated":"2023-12-07T11:10:28Z","published":"2023-12-07T11:10:28Z","title":"Swap distance minimization in SOV languages. Cognitive and mathematical\n foundations","summary":" Distance minimization is a general principle of language. A special case of\nthis principle in the domain of word order is swap distance minimization. This\nprinciple predicts that variations from a canonical order that are reached by\nfewer swaps of adjacent constituents are less costly and thus more likely. Here\nwe investigate the principle in the context of the triple formed by subject\n(S), object (O) and verb (V). We introduce the concept of word order rotation\nas a cognitive underpinning of that prediction. When the canonical order of a\nlanguage is SOV, the principle predicts SOV < SVO, OSV < VSO, OVS < VOS, in\norder of increasing cognitive cost. We test the prediction in three flexible\norder SOV languages: Korean (Koreanic), Malayalam (Dravidian), and Sinhalese\n(Indo-European). Evidence of swap distance minimization is found in all three\nlanguages, but it is weaker in Sinhalese. 
Swap distance minimization is\nstronger than a preference for the canonical order in Korean and especially\nMalayalam.\n","authors":["Ramon Ferrer-i-Cancho","Savithry Namboodiripad"],"pdf_url":"https://arxiv.org/pdf/2312.04219v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04193v1","updated":"2023-12-07T10:21:22Z","published":"2023-12-07T10:21:22Z","title":"Language Model Knowledge Distillation for Efficient Question Answering\n in Spanish","summary":" Recent advances in the development of pre-trained Spanish language models has\nled to significant progress in many Natural Language Processing (NLP) tasks,\nsuch as question answering. However, the lack of efficient models imposes a\nbarrier for the adoption of such models in resource-constrained environments.\nTherefore, smaller distilled models for the Spanish language could be proven to\nbe highly scalable and facilitate their further adoption on a variety of tasks\nand scenarios. In this work, we take one step in this direction by developing\nSpanishTinyRoBERTa, a compressed language model based on RoBERTa for efficient\nquestion answering in Spanish. To achieve this, we employ knowledge\ndistillation from a large model onto a lighter model that allows for a wider\nimplementation, even in areas with limited computational resources, whilst\nattaining negligible performance sacrifice. Our experiments show that the dense\ndistilled model can still preserve the performance of its larger counterpart,\nwhile significantly increasing inference speedup. This work serves as a\nstarting point for further research and investigation of model compression\nefforts for Spanish language models across various NLP tasks.\n","authors":["Adrián Bazaga","Pietro Liò","Gos Micklem"],"pdf_url":"https://arxiv.org/pdf/2312.04193v1.pdf","comment":"6 pages, 2 tables"},{"id":"http://arxiv.org/abs/2312.03699v2","updated":"2023-12-07T10:19:27Z","published":"2023-12-06T18:59:11Z","title":"PROMISE: A Framework for Model-Driven Stateful Prompt Orchestration","summary":" The advent of increasingly powerful language models has raised expectations\nfor language-based interactions. However, controlling these models is a\nchallenge, emphasizing the need to be able to investigate the feasibility and\nvalue of their application. We present PROMISE, a framework that facilitates\nthe development of complex language-based interactions with information\nsystems. Its use of state machine modeling concepts enables model-driven,\ndynamic prompt orchestration across hierarchically nested states and\ntransitions. This improves the control of the behavior of language models and\nthus enables their effective and efficient use. We show the benefits of PROMISE\nin the context of application scenarios within health information systems and\ndemonstrate its ability to handle complex interactions.\n","authors":["Wenyuan Wu","Jasmin Heierli","Max Meisterhans","Adrian Moser","Andri Färber","Mateusz Dolata","Elena Gavagnin","Alexandre de Spindler","Gerhard Schwabe"],"pdf_url":"https://arxiv.org/pdf/2312.03699v2.pdf","comment":"Minor revision regards wording"},{"id":"http://arxiv.org/abs/2312.03549v2","updated":"2023-12-07T09:26:07Z","published":"2023-12-06T15:27:26Z","title":"Holmes: Towards Distributed Training Across Clusters with Heterogeneous\n NIC Environment","summary":" Large language models (LLMs) such as GPT-3, OPT, and LLaMA have demonstrated\nremarkable accuracy in a wide range of tasks. 
However, training these models\ncan incur significant expenses, often requiring tens of thousands of GPUs for\nmonths of continuous operation. Typically, this training is carried out in\nspecialized GPU clusters equipped with homogeneous high-speed Remote Direct\nMemory Access (RDMA) network interface cards (NICs). The acquisition and\nmaintenance of such dedicated clusters is challenging. Current LLM training\nframeworks, like Megatron-LM and Megatron-DeepSpeed, focus primarily on\noptimizing training within homogeneous cluster settings. In this paper, we\nintroduce Holmes, a training framework for LLMs that employs thoughtfully\ncrafted data and model parallelism strategies over the heterogeneous NIC\nenvironment. Our primary technical contribution lies in a novel scheduling\nmethod that intelligently allocates distinct computational tasklets in LLM\ntraining to specific groups of GPU devices based on the characteristics of\ntheir connected NICs. Furthermore, our proposed framework, utilizing pipeline\nparallel techniques, demonstrates scalability to multiple GPU clusters, even in\nscenarios without high-speed interconnects between nodes in distinct clusters.\nWe conducted comprehensive experiments that involved various scenarios in the\nheterogeneous NIC environment. In most cases, our framework achieves\nperformance levels close to those achievable with homogeneous RDMA-capable\nnetworks (InfiniBand or RoCE), significantly exceeding training efficiency\nwithin the pure Ethernet environment. Additionally, we verified that our\nframework outperforms other mainstream LLM frameworks under heterogeneous NIC\nenvironment in terms of training efficiency and can be seamlessly integrated\nwith them.\n","authors":["Fei Yang","Shuang Peng","Ning Sun","Fangyu Wang","Ke Tan","Fu Wu","Jiezhong Qiu","Aimin Pan"],"pdf_url":"https://arxiv.org/pdf/2312.03549v2.pdf","comment":"14 pages"},{"id":"http://arxiv.org/abs/2312.04134v1","updated":"2023-12-07T08:48:54Z","published":"2023-12-07T08:48:54Z","title":"Using a Large Language Model to generate a Design Structure Matrix","summary":" The Design Structure Matrix (DSM) is an established method used in dependency\nmodelling, especially in the design of complex engineering systems. The\ngeneration of DSM is traditionally carried out through manual means and can\ninvolve interviewing experts to elicit critical system elements and the\nrelationships between them. Such manual approaches can be time-consuming and\ncostly. This paper presents a workflow that uses a Large Language Model (LLM)\nto support the generation of DSM and improve productivity. A prototype of the\nworkflow was developed in this work and applied on a diesel engine DSM\npublished previously. It was found that the prototype could reproduce 357 out\nof 462 DSM entries published (i.e. 77.3%), suggesting that the work can aid DSM\ngeneration. A no-code version of the prototype is made available online to\nsupport future research.\n","authors":["Edwin C. Y. Koh"],"pdf_url":"https://arxiv.org/pdf/2312.04134v1.pdf","comment":"16 pages, 7 Figures, 6 Tables"},{"id":"http://arxiv.org/abs/2310.05470v2","updated":"2023-12-07T08:48:36Z","published":"2023-10-09T07:27:15Z","title":"Generative Judge for Evaluating Alignment","summary":" The rapid development of Large Language Models (LLMs) has substantially\nexpanded the range of tasks they can address. 
In the field of Natural Language\nProcessing (NLP), researchers have shifted their focus from conventional NLP\ntasks (e.g., sequence tagging and parsing) towards tasks that revolve around\naligning with human needs (e.g., brainstorming and email writing). This shift\nin task distribution imposes new requirements on evaluating these aligned\nmodels regarding generality (i.e., assessing performance across diverse\nscenarios), flexibility (i.e., examining under different protocols), and\ninterpretability (i.e., scrutinizing models with explanations). In this paper,\nwe propose a generative judge with 13B parameters, Auto-J, designed to address\nthese challenges. Our model is trained on user queries and LLM-generated\nresponses under massive real-world scenarios and accommodates diverse\nevaluation protocols (e.g., pairwise response comparison and single-response\nevaluation) with well-structured natural language critiques. To demonstrate the\nefficacy of our approach, we construct a new testbed covering 58 different\nscenarios. Experimentally, Auto-J outperforms a series of strong competitors,\nincluding both open-source and closed-source models, by a large margin. We also\nprovide detailed analysis and case studies to further reveal the potential of\nour method and make a variety of resources public at\nhttps://github.com/GAIR-NLP/auto-j.\n","authors":["Junlong Li","Shichao Sun","Weizhe Yuan","Run-Ze Fan","Hai Zhao","Pengfei Liu"],"pdf_url":"https://arxiv.org/pdf/2310.05470v2.pdf","comment":"Fix typos in Table 1"},{"id":"http://arxiv.org/abs/2312.04127v1","updated":"2023-12-07T08:29:58Z","published":"2023-12-07T08:29:58Z","title":"Analyzing the Inherent Response Tendency of LLMs: Real-World\n Instructions-Driven Jailbreak","summary":" Extensive work has been devoted to improving the safety mechanism of Large\nLanguage Models (LLMs). However, in specific scenarios, LLMs still generate\nharmful responses when faced with malicious instructions, a phenomenon referred\nto as \"Jailbreak Attack\". In our research, we introduce a novel jailbreak\nattack method (\\textbf{RADIAL}), which consists of two steps: 1) Inherent\nResponse Tendency Analysis: we analyze the inherent affirmation and rejection\ntendency of LLMs to react to real-world instructions. 2) Real-World\nInstructions-Driven Jailbreak: based on our analysis, we strategically choose\nseveral real-world instructions and embed malicious instructions into them to\namplify the LLM's potential to generate harmful responses. On three open-source\nhuman-aligned LLMs, our method achieves excellent jailbreak attack performance\nfor both Chinese and English malicious instructions. Besides, we guided\ndetailed ablation experiments and verified the effectiveness of our core idea\n\"Inherent Response Tendency Analysis\". 
Our exploration also exposes the\nvulnerability of LLMs to being induced into generating more detailed harmful\nresponses in subsequent rounds of dialogue.\n","authors":["Yanrui Du","Sendong Zhao","Ming Ma","Yuhan Chen","Bing Qin"],"pdf_url":"https://arxiv.org/pdf/2312.04127v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2208.08089v2","updated":"2023-12-07T08:22:34Z","published":"2022-08-17T06:05:41Z","title":"Constrained Few-Shot Learning: Human-Like Low Sample Complexity Learning\n and Non-Episodic Text Classification","summary":" Few-shot learning (FSL) is an emergent paradigm of learning that attempts to\nlearn to reason with low sample complexity to mimic the way humans learn,\ngeneralise and extrapolate from only a few seen examples. While FSL attempts to\nmimic these human characteristics, fundamentally, the task of FSL as\nconventionally formulated using meta-learning with episodic-based training does\nnot in actuality align with how humans acquire and reason with knowledge. FSL\nwith episodic training, while only requires $K$ instances of each test class,\nstill requires a large number of labelled training instances from disjoint\nclasses. In this paper, we introduce the novel task of constrained few-shot\nlearning (CFSL), a special case of FSL where $M$, the number of instances of\neach training class is constrained such that $M \\leq K$ thus applying a similar\nrestriction during FSL training and test. We propose a method for CFSL\nleveraging Cat2Vec using a novel categorical contrastive loss inspired by\ncognitive theories such as fuzzy trace theory and prototype theory.\n","authors":["Jaron Mar","Jiamou Liu"],"pdf_url":"https://arxiv.org/pdf/2208.08089v2.pdf","comment":"Add additional references Update various sections for clarity"},{"id":"http://arxiv.org/abs/2312.04103v1","updated":"2023-12-07T07:37:15Z","published":"2023-12-07T07:37:15Z","title":"Enhancing the Rationale-Input Alignment for Self-explaining\n Rationalization","summary":" Rationalization empowers deep learning models with self-explaining\ncapabilities through a cooperative game, where a generator selects a\nsemantically consistent subset of the input as a rationale, and a subsequent\npredictor makes predictions based on the selected rationale. In this paper, we\ndiscover that rationalization is prone to a problem named \\emph{rationale\nshift}, which arises from the algorithmic bias of the cooperative game.\nRationale shift refers to a situation where the semantics of the selected\nrationale may deviate from the original input, but the predictor still produces\naccurate predictions based on the deviation, resulting in a compromised\ngenerator with misleading feedback.\n To address this issue, we first demonstrate the importance of the alignment\nbetween the rationale and the full input through both empirical observations\nand theoretical analysis. Subsequently, we introduce a novel approach called\nDAR (\\textbf{D}iscriminatively \\textbf{A}ligned \\textbf{R}ationalization),\nwhich utilizes an auxiliary module pretrained on the full input to\ndiscriminatively align the selected rationale and the original input. We\ntheoretically illustrate how DAR accomplishes the desired alignment, thereby\novercoming the rationale shift problem. The experiments on two widely used\nreal-world benchmarks show that the proposed method significantly improves the\nexplanation quality (measured by the overlap between the model-selected\nexplanation and the human-annotated rationale) as compared to state-of-the-art\ntechniques. 
Additionally, results on two synthetic settings further validate\nthe effectiveness of DAR in addressing the rationale shift problem.\n","authors":["Wei Liu","Haozhao Wang","Jun Wang","Zhiying Deng","YuanKai Zhang","Cheng Wang","Ruixuan Li"],"pdf_url":"https://arxiv.org/pdf/2312.04103v1.pdf","comment":"Accept at ICDE 2024"},{"id":"http://arxiv.org/abs/2310.05140v3","updated":"2023-12-07T07:05:50Z","published":"2023-10-08T12:21:24Z","title":"Harnessing the Power of Large Language Models for Empathetic Response\n Generation: Empirical Investigations and Improvements","summary":" Empathetic dialogue is an indispensable part of building harmonious social\nrelationships and contributes to the development of a helpful AI. Previous\napproaches are mainly based on fine small-scale language models. With the\nadvent of ChatGPT, the application effect of large language models (LLMs) in\nthis field has attracted great attention. This work empirically investigates\nthe performance of LLMs in generating empathetic responses and proposes three\nimprovement methods of semantically similar in-context learning, two-stage\ninteractive generation, and combination with the knowledge base. Extensive\nexperiments show that LLMs can significantly benefit from our proposed methods\nand is able to achieve state-of-the-art performance in both automatic and human\nevaluations. Additionally, we explore the possibility of GPT-4 simulating human\nevaluators.\n","authors":["Yushan Qian","Wei-Nan Zhang","Ting Liu"],"pdf_url":"https://arxiv.org/pdf/2310.05140v3.pdf","comment":"the Findings of EMNLP 2023"},{"id":"http://arxiv.org/abs/2212.14578v2","updated":"2023-12-07T06:38:10Z","published":"2022-12-30T07:37:40Z","title":"MAUVE Scores for Generative Models: Theory and Practice","summary":" Generative artificial intelligence has made significant strides, producing\ntext indistinguishable from human prose and remarkably photorealistic images.\nAutomatically measuring how close the generated data distribution is to the\ntarget distribution is central to diagnosing existing models and developing\nbetter ones. We present MAUVE, a family of comparison measures between pairs of\ndistributions such as those encountered in the generative modeling of text or\nimages. These scores are statistical summaries of divergence frontiers\ncapturing two types of errors in generative modeling. We explore three\napproaches to statistically estimate these scores: vector quantization,\nnon-parametric estimation, and classifier-based estimation. We provide\nstatistical bounds for the vector quantization approach.\n Empirically, we find that the proposed scores paired with a range of\n$f$-divergences and statistical estimation methods can quantify the gaps\nbetween the distributions of human-written text and those of modern neural\nlanguage models by correlating with human judgments and identifying known\nproperties of the generated texts. We demonstrate in the vision domain that\nMAUVE can identify known properties of generated images on par with or better\nthan existing metrics. 
In conclusion, we present practical recommendations for\nusing MAUVE effectively with language and image modalities.\n","authors":["Krishna Pillutla","Lang Liu","John Thickstun","Sean Welleck","Swabha Swayamdipta","Rowan Zellers","Sewoong Oh","Yejin Choi","Zaid Harchaoui"],"pdf_url":"https://arxiv.org/pdf/2212.14578v2.pdf","comment":"Published in Journal of Machine Learning Research"},{"id":"http://arxiv.org/abs/2305.07372v3","updated":"2023-12-07T06:34:30Z","published":"2023-05-12T10:45:29Z","title":"Interactive Text-to-SQL Generation via Editable Step-by-Step\n Explanations","summary":" Relational databases play an important role in this Big Data era. However, it\nis challenging for non-experts to fully unleash the analytical power of\nrelational databases, since they are not familiar with database languages such\nas SQL. Many techniques have been proposed to automatically generate SQL from\nnatural language, but they suffer from two issues: (1) they still make many\nmistakes, particularly for complex queries, and (2) they do not provide a\nflexible way for non-expert users to validate and refine the incorrect queries.\nTo address these issues, we introduce a new interaction mechanism that allows\nusers directly edit a step-by-step explanation of an incorrect SQL to fix SQL\nerrors. Experiments on the Spider benchmark show that our approach outperforms\nthree SOTA approaches by at least 31.6% in terms of execution accuracy. A user\nstudy with 24 participants further shows that our approach helped users solve\nsignificantly more SQL tasks with less time and higher confidence,\ndemonstrating its potential to expand access to databases, particularly for\nnon-experts.\n","authors":["Yuan Tian","Zheng Zhang","Zheng Ning","Toby Jia-Jun Li","Jonathan K. Kummerfeld","Tianyi Zhang"],"pdf_url":"https://arxiv.org/pdf/2305.07372v3.pdf","comment":"ARR AE score of 4"},{"id":"http://arxiv.org/abs/2312.04068v1","updated":"2023-12-07T06:23:17Z","published":"2023-12-07T06:23:17Z","title":"Making Translators Privacy-aware on the User's Side","summary":" We propose PRISM to enable users of machine translation systems to preserve\nthe privacy of data on their own initiative. There is a growing demand to apply\nmachine translation systems to data that require privacy protection. While\nseveral machine translation engines claim to prioritize privacy, the extent and\nspecifics of such protection are largely ambiguous. First, there is often a\nlack of clarity on how and to what degree the data is protected. Even if\nservice providers believe they have sufficient safeguards in place,\nsophisticated adversaries might still extract sensitive information. Second,\nvulnerabilities may exist outside of these protective measures, such as within\ncommunication channels, potentially leading to data leakage. As a result, users\nare hesitant to utilize machine translation engines for data demanding high\nlevels of privacy protection, thereby missing out on their benefits. PRISM\nresolves this problem. Instead of relying on the translation service to keep\ndata safe, PRISM provides the means to protect data on the user's side. This\napproach ensures that even machine translation engines with inadequate privacy\nmeasures can be used securely. For platforms already equipped with privacy\nsafeguards, PRISM acts as an additional protection layer, reinforcing their\nsecurity furthermore. PRISM adds these privacy features without significantly\ncompromising translation accuracy. 
Our experiments demonstrate the\neffectiveness of PRISM using real-world translators, T5 and ChatGPT\n(GPT-3.5-turbo), and the datasets with two languages. PRISM effectively\nbalances privacy protection with translation accuracy.\n","authors":["Ryoma Sato"],"pdf_url":"https://arxiv.org/pdf/2312.04068v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04059v1","updated":"2023-12-07T05:45:24Z","published":"2023-12-07T05:45:24Z","title":"Comparing Large Language Model AI and Human-Generated Coaching Messages\n for Behavioral Weight Loss","summary":" Automated coaching messages for weight control can save time and costs, but\ntheir repetitive, generic nature may limit their effectiveness compared to\nhuman coaching. Large language model (LLM) based artificial intelligence (AI)\nchatbots, like ChatGPT, could offer more personalized and novel messages to\naddress repetition with their data-processing abilities. While LLM AI\ndemonstrates promise to encourage healthier lifestyles, studies have yet to\nexamine the feasibility and acceptability of LLM-based BWL coaching. 87 adults\nin a weight-loss trial rated ten coaching messages' helpfulness (five\nhuman-written, five ChatGPT-generated) using a 5-point Likert scale, providing\nadditional open-ended feedback to justify their ratings. Participants also\nidentified which messages they believed were AI-generated. The evaluation\noccurred in two phases: messages in Phase 1 were perceived as impersonal and\nnegative, prompting revisions for Phase 2 messages. In Phase 1, AI-generated\nmessages were rated less helpful than human-written ones, with 66 percent\nreceiving a helpfulness rating of 3 or higher. However, in Phase 2, the AI\nmessages matched the human-written ones regarding helpfulness, with 82% scoring\nthree or above. Additionally, 50% were misidentified as human-written,\nsuggesting AI's sophistication in mimicking human-generated content. A thematic\nanalysis of open-ended feedback revealed that participants appreciated AI's\nempathy and personalized suggestions but found them more formulaic, less\nauthentic, and too data-focused. This study reveals the preliminary feasibility\nand acceptability of LLM AIs, like ChatGPT, in crafting potentially effective\nweight control coaching messages. Our findings also underscore areas for future\nenhancement.\n","authors":["Zhuoran Huang","Michael P. Berry","Christina Chwyl","Gary Hsieh","Jing Wei","Evan M. Forman"],"pdf_url":"https://arxiv.org/pdf/2312.04059v1.pdf","comment":"29 pages, 5 figures"},{"id":"http://arxiv.org/abs/2309.03409v2","updated":"2023-12-07T05:25:15Z","published":"2023-09-07T00:07:15Z","title":"Large Language Models as Optimizers","summary":" Optimization is ubiquitous. While derivative-based algorithms have been\npowerful tools for various problems, the absence of gradient imposes challenges\non many real-world applications. In this work, we propose Optimization by\nPROmpting (OPRO), a simple and effective approach to leverage large language\nmodels (LLMs) as optimizers, where the optimization task is described in\nnatural language. In each optimization step, the LLM generates new solutions\nfrom the prompt that contains previously generated solutions with their values,\nthen the new solutions are evaluated and added to the prompt for the next\noptimization step. We first showcase OPRO on linear regression and traveling\nsalesman problems, then move on to prompt optimization where the goal is to\nfind instructions that maximize the task accuracy. 
With a variety of LLMs, we\ndemonstrate that the best prompts optimized by OPRO outperform human-designed\nprompts by up to 8% on GSM8K, and by up to 50% on Big-Bench Hard tasks. Code at\nhttps://github.com/google-deepmind/opro.\n","authors":["Chengrun Yang","Xuezhi Wang","Yifeng Lu","Hanxiao Liu","Quoc V. Le","Denny Zhou","Xinyun Chen"],"pdf_url":"https://arxiv.org/pdf/2309.03409v2.pdf","comment":"42 pages, 26 figures, 15 tables. Code at\n https://github.com/google-deepmind/opro"},{"id":"http://arxiv.org/abs/2312.04052v1","updated":"2023-12-07T05:20:15Z","published":"2023-12-07T05:20:15Z","title":"Multimodal Misinformation Detection in a South African Social Media\n Environment","summary":" With the constant spread of misinformation on social media networks, a need\nhas arisen to continuously assess the veracity of digital content. This need\nhas inspired numerous research efforts on the development of misinformation\ndetection (MD) models. However, many models do not use all information\navailable to them and existing research contains a lack of relevant datasets to\ntrain the models, specifically within the South African social media\nenvironment. The aim of this paper is to investigate the transferability of\nknowledge of a MD model between different contextual environments. This\nresearch contributes a multimodal MD model capable of functioning in the South\nAfrican social media environment, as well as introduces a South African\nmisinformation dataset. The model makes use of multiple sources of information\nfor misinformation detection, namely: textual and visual elements. It uses\nbidirectional encoder representations from transformers (BERT) as the textual\nencoder and a residual network (ResNet) as the visual encoder. The model is\ntrained and evaluated on the Fakeddit dataset and a South African\nmisinformation dataset. Results show that using South African samples in the\ntraining of the model increases model performance, in a South African\ncontextual environment, and that a multimodal model retains significantly more\nknowledge than both the textual and visual unimodal models. Our study suggests\nthat the performance of a misinformation detection model is influenced by the\ncultural nuances of its operating environment and multimodal models assist in\nthe transferability of knowledge between different contextual environments.\nTherefore, local data should be incorporated into the training process of a\nmisinformation detection model in order to optimize model performance.\n","authors":["Amica De Jager","Vukosi Marivate","Abioudun Modupe"],"pdf_url":"https://arxiv.org/pdf/2312.04052v1.pdf","comment":"Artificial Intelligence Research. SACAIR 2023"},{"id":"http://arxiv.org/abs/2312.00273v2","updated":"2023-12-07T04:37:47Z","published":"2023-12-01T01:22:46Z","title":"Mark My Words: Analyzing and Evaluating Language Model Watermarks","summary":" The capabilities of large language models have grown significantly in recent\nyears and so too have concerns about their misuse. In this context, the ability\nto distinguish machine-generated text from human-authored content becomes\nimportant. Prior works have proposed numerous schemes to watermark text, which\nwould benefit from a systematic evaluation framework. This work focuses on text\nwatermarking techniques - as opposed to image watermarks - and proposes\nMARKMYWORDS, a comprehensive benchmark for them under different tasks as well\nas practical attacks. We focus on three main metrics: quality, size (e.g. 
the\nnumber of tokens needed to detect a watermark), and tamper-resistance. Current\nwatermarking techniques are good enough to be deployed: Kirchenbauer et al. [1]\ncan watermark Llama2-7B-chat with no perceivable loss in quality, the watermark\ncan be detected with fewer than 100 tokens, and the scheme offers good\ntamper-resistance to simple attacks. We argue that watermark\nindistinguishability, a criteria emphasized in some prior works, is too strong\na requirement: schemes that slightly modify logit distributions outperform\ntheir indistinguishable counterparts with no noticeable loss in generation\nquality. We publicly release our benchmark\n(https://github.com/wagner-group/MarkMyWords)\n","authors":["Julien Piet","Chawin Sitawarin","Vivian Fang","Norman Mu","David Wagner"],"pdf_url":"https://arxiv.org/pdf/2312.00273v2.pdf","comment":"18 pages, 11 figures"},{"id":"http://arxiv.org/abs/2312.04032v1","updated":"2023-12-07T04:23:36Z","published":"2023-12-07T04:23:36Z","title":"RoAST: Robustifying Language Models via Adversarial Perturbation with\n Selective Training","summary":" Fine-tuning pre-trained language models (LMs) has become the de facto\nstandard in many NLP tasks. Nevertheless, fine-tuned LMs are still prone to\nrobustness issues, such as adversarial robustness and model calibration.\nSeveral perspectives of robustness for LMs have been studied independently, but\nlacking a unified consideration in multiple perspectives. In this paper, we\npropose Robustifying LMs via Adversarial perturbation with Selective Training\n(RoAST), a simple yet effective fine-tuning technique to enhance the\nmulti-perspective robustness of LMs in a unified way. RoAST effectively\nincorporates two important sources for the model robustness, robustness on the\nperturbed inputs and generalizable knowledge in pre-trained LMs. To be\nspecific, RoAST introduces adversarial perturbation during fine-tuning while\nthe model parameters are selectively updated upon their relative importance to\nminimize unnecessary deviation. Under a unified evaluation of fine-tuned LMs by\nincorporating four representative perspectives of model robustness, we\ndemonstrate the effectiveness of RoAST compared to state-of-the-art fine-tuning\nmethods on six different types of LMs, which indicates its usefulness in\npractice.\n","authors":["Jaehyung Kim","Yuning Mao","Rui Hou","Hanchao Yu","Davis Liang","Pascale Fung","Qifan Wang","Fuli Feng","Lifu Huang","Madian Khabsa"],"pdf_url":"https://arxiv.org/pdf/2312.04032v1.pdf","comment":"33 pages, accepted at EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2310.01441v2","updated":"2023-12-07T04:10:01Z","published":"2023-09-30T20:18:50Z","title":"UPAR: A Kantian-Inspired Prompting Framework for Enhancing Large\n Language Model Capabilities","summary":" Large Language Models (LLMs) have demonstrated impressive inferential\ncapabilities, with numerous research endeavors devoted to enhancing this\ncapacity through prompting. Despite these efforts, a unified epistemological\nfoundation is still conspicuously absent. Drawing inspiration from Kant's a\npriori philosophy, we propose the UPAR prompting framework, designed to emulate\nthe structure of human cognition within LLMs. The UPAR framework is delineated\ninto four phases: \"Understand\", \"Plan\", \"Act\", and \"Reflect\", enabling the\nextraction of structured information from complex contexts, prior planning of\nsolutions, execution according to plan, and self-reflection. 
This structure\nsignificantly augments the explainability and accuracy of LLM inference,\nproducing a human-understandable and inspectable inferential trajectory.\nFurthermore, our work offers an epistemological foundation for existing\nprompting techniques, allowing for a possible systematic integration of these\nmethods. With GPT-4, our approach elevates the accuracy from COT baseline of\n22.92% to 58.33% in a challenging subset of GSM8K, and from 67.91% to 75.40% in\nthe causal judgment task. Without using few-shot examples or external tools,\nUPAR significantly outperforms existing prompting methods on SCIBENCH, a\nchallenging dataset containing collegiate-level mathematics, chemistry, and\nphysics scientific problems.\n","authors":["Hejia Geng","Boxun Xu","Peng Li"],"pdf_url":"https://arxiv.org/pdf/2310.01441v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.06458v2","updated":"2023-12-07T04:04:31Z","published":"2023-03-11T17:14:33Z","title":"ZeroNLG: Aligning and Autoencoding Domains for Zero-Shot Multimodal and\n Multilingual Natural Language Generation","summary":" Natural Language Generation (NLG) accepts input data in the form of images,\nvideos, or text and generates corresponding natural language text as output.\nExisting NLG methods mainly adopt a supervised approach and rely heavily on\ncoupled data-to-text pairs. However, for many targeted scenarios and for\nnon-English languages, sufficient quantities of labeled data are often not\navailable. To relax the dependency on labeled data of downstream tasks, we\npropose an intuitive and effective zero-shot learning framework, ZeroNLG, which\ncan deal with multiple NLG tasks, including image-to-text (image captioning),\nvideo-to-text (video captioning), and text-to-text (neural machine\ntranslation), across English, Chinese, German, and French within a unified\nframework. ZeroNLG does not require any labeled downstream pairs for training.\nDuring training, ZeroNLG (i) projects different domains (across modalities and\nlanguages) to corresponding coordinates in a shared common latent space; (ii)\nbridges different domains by aligning their corresponding coordinates in this\nspace; and (iii) builds an unsupervised multilingual auto-encoder to learn to\ngenerate text by reconstructing the input text given its coordinate in shared\nlatent space. Consequently, during inference, based on the data-to-text\npipeline, ZeroNLG can generate target sentences across different languages\ngiven the coordinate of input data in the common space. Within this unified\nframework, given visual (imaging or video) data as input, ZeroNLG can perform\nzero-shot visual captioning; given textual sentences as input, ZeroNLG can\nperform zero-shot machine translation. We present the results of extensive\nexperiments on twelve NLG tasks, showing that, without using any labeled\ndownstream pairs for training, ZeroNLG generates high-quality and believable\noutputs and significantly outperforms existing zero-shot methods.\n","authors":["Bang Yang","Fenglin Liu","Yuexian Zou","Xian Wu","Yaowei Wang","David A. 
Clifton"],"pdf_url":"https://arxiv.org/pdf/2303.06458v2.pdf","comment":"Our code and data are available at\n https://github.com/yangbang18/ZeroNLG"},{"id":"http://arxiv.org/abs/2312.04021v1","updated":"2023-12-07T03:37:39Z","published":"2023-12-07T03:37:39Z","title":"A Study on the Calibration of In-context Learning","summary":" Modern auto-regressive language models are trained to minimize log loss on\nbroad data by predicting the next token so they are expected to get calibrated\nanswers when framing a problem as a next-token prediction task. We study this\nfor in-context learning (ICL), a widely used way to adapt frozen large language\nmodels (LLMs) via crafting prompts, and investigate the trade-offs between\nperformance and calibration on a wide range of natural language understanding\nand reasoning tasks. We conduct extensive experiments to show that such\ntrade-offs may get worse as we increase model size, incorporate more ICL\nexamples, and fine-tune models using instruction, dialog, or reinforcement\nlearning from human feedback (RLHF) on carefully curated datasets. Furthermore,\nwe find that common recalibration techniques that are widely effective such as\ntemperature scaling provide limited gains in calibration errors, suggesting\nthat new methods may be required for settings where models are expected to be\nreliable.\n","authors":["Hanlin Zhang","Yi-Fan Zhang","Yaodong Yu","Dhruv Madeka","Dean Foster","Eric Xing","Hima Lakkaraju","Sham Kakade"],"pdf_url":"https://arxiv.org/pdf/2312.04021v1.pdf","comment":"Spotlight Talk at NeurIPS 2023 Workshop on Failure Modes in the Age\n of Foundation Models"},{"id":"http://arxiv.org/abs/2312.03987v1","updated":"2023-12-07T02:09:27Z","published":"2023-12-07T02:09:27Z","title":"Cost-Effective In-Context Learning for Entity Resolution: A Design Space\n Exploration","summary":" Entity resolution (ER) is an important data integration task with a wide\nspectrum of applications. The state-of-the-art solutions on ER rely on\npre-trained language models (PLMs), which require fine-tuning on a lot of\nlabeled matching/non-matching entity pairs. Recently, large languages models\n(LLMs), such as GPT-4, have shown the ability to perform many tasks without\ntuning model parameters, which is known as in-context learning (ICL) that\nfacilitates effective learning from a few labeled input context demonstrations.\nHowever, existing ICL approaches to ER typically necessitate providing a task\ndescription and a set of demonstrations for each entity pair and thus have\nlimitations on the monetary cost of interfacing LLMs. To address the problem,\nin this paper, we provide a comprehensive study to investigate how to develop a\ncost-effective batch prompting approach to ER. We introduce a framework BATCHER\nconsisting of demonstration selection and question batching and explore\ndifferent design choices that support batch prompting for ER. We also devise a\ncovering-based demonstration selection strategy that achieves an effective\nbalance between matching accuracy and monetary cost. We conduct a thorough\nevaluation to explore the design space and evaluate our proposed strategies.\nThrough extensive experiments, we find that batch prompting is very\ncost-effective for ER, compared with not only PLM-based methods fine-tuned with\nextensive labeled data but also LLM-based methods with manually designed\nprompting. 
We also provide guidance for selecting appropriate design choices\nfor batch prompting.\n","authors":["Meihao Fan","Xiaoyue Han","Ju Fan","Chengliang Chai","Nan Tang","Guoliang Li","Xiaoyong Du"],"pdf_url":"https://arxiv.org/pdf/2312.03987v1.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2202.09517v2","updated":"2023-12-07T01:07:13Z","published":"2022-02-19T03:48:20Z","title":"Deep Learning for Hate Speech Detection: A Comparative Study","summary":" Automated hate speech detection is an important tool in combating the spread\nof hate speech, particularly in social media. Numerous methods have been\ndeveloped for the task, including a recent proliferation of deep-learning based\napproaches. A variety of datasets have also been developed, exemplifying\nvarious manifestations of the hate-speech detection problem. We present here a\nlarge-scale empirical comparison of deep and shallow hate-speech detection\nmethods, mediated through the three most commonly used datasets. Our goal is to\nilluminate progress in the area, and identify strengths and weaknesses in the\ncurrent state-of-the-art. We particularly focus our analysis on measures of\npractical performance, including detection accuracy, computational efficiency,\ncapability in using pre-trained models, and domain generalization. In doing so\nwe aim to provide guidance as to the use of hate-speech detection in practice,\nquantify the state-of-the-art, and identify future research directions. Code\nand dataset are available at\nhttps://github.com/jmjmalik22/Hate-Speech-Detection.\n","authors":["Jitendra Singh Malik","Hezhe Qiao","Guansong Pang","Anton van den Hengel"],"pdf_url":"https://arxiv.org/pdf/2202.09517v2.pdf","comment":"18 pages, 4 figures, and 6 tables"},{"id":"http://arxiv.org/abs/2312.02931v2","updated":"2023-12-07T00:37:29Z","published":"2023-12-05T18:03:13Z","title":"WhisBERT: Multimodal Text-Audio Language Modeling on 100M Words","summary":" Training on multiple modalities of input can augment the capabilities of a\nlanguage model. Here, we ask whether such a training regime can improve the\nquality and efficiency of these systems as well. We focus on text--audio and\nintroduce Whisbert, which is inspired by the text--image approach of FLAVA\n(Singh et al., 2022). In accordance with Babylm guidelines (Warstadt et al.,\n2023), we pretrain Whisbert on a dataset comprising only 100 million words plus\ntheir corresponding speech from the word-aligned version of the People's Speech\ndataset (Galvez et al., 2021). To assess the impact of multimodality, we\ncompare versions of the model that are trained on text only and on both audio\nand text simultaneously. We find that while Whisbert is able to perform well on\nmultimodal masked modeling and surpasses the Babylm baselines in most benchmark\ntasks, it struggles to optimize its complex objective and outperform its\ntext-only Whisbert baseline.\n","authors":["Lukas Wolf","Greta Tuckute","Klemen Kotar","Eghbal Hosseini","Tamar Regev","Ethan Wilcox","Alex Warstadt"],"pdf_url":"https://arxiv.org/pdf/2312.02931v2.pdf","comment":"Published at the BabyLM Challenge, a shared task co-sponsored by CMCL\n 2023 and CoNLL 2023, hosted by EMNLP 2023"}],"Computer Vision and Pattern Recognition":[{"id":"http://arxiv.org/abs/2312.04567v1","updated":"2023-12-07T18:59:59Z","published":"2023-12-07T18:59:59Z","title":"Scaling Laws of Synthetic Images for Model Training ... 
for Now","summary":" Recent significant advances in text-to-image models unlock the possibility of\ntraining vision systems using synthetic images, potentially overcoming the\ndifficulty of collecting curated data at scale. It is unclear, however, how\nthese models behave at scale, as more synthetic data is added to the training\nset. In this paper we study the scaling laws of synthetic images generated by\nstate of the art text-to-image models, for the training of supervised models:\nimage classifiers with label supervision, and CLIP with language supervision.\nWe identify several factors, including text prompts, classifier-free guidance\nscale, and types of text-to-image models, that significantly affect scaling\nbehavior. After tuning these factors, we observe that synthetic images\ndemonstrate a scaling trend similar to, but slightly less effective than, real\nimages in CLIP training, while they significantly underperform in scaling when\ntraining supervised image classifiers. Our analysis indicates that the main\nreason for this underperformance is the inability of off-the-shelf\ntext-to-image models to generate certain concepts, a limitation that\nsignificantly impairs the training of image classifiers. Our findings also\nsuggest that scaling synthetic data can be particularly effective in scenarios\nsuch as: (1) when there is a limited supply of real images for a supervised\nproblem (e.g., fewer than 0.5 million images in ImageNet), (2) when the\nevaluation dataset diverges significantly from the training data, indicating\nthe out-of-distribution scenario, or (3) when synthetic data is used in\nconjunction with real images, as demonstrated in the training of CLIP models.\n","authors":["Lijie Fan","Kaifeng Chen","Dilip Krishnan","Dina Katabi","Phillip Isola","Yonglong Tian"],"pdf_url":"https://arxiv.org/pdf/2312.04567v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04566v1","updated":"2023-12-07T18:59:58Z","published":"2023-12-07T18:59:58Z","title":"Gen2Det: Generate to Detect","summary":" Recently diffusion models have shown improvement in synthetic image quality\nas well as better control in generation. We motivate and present Gen2Det, a\nsimple modular pipeline to create synthetic training data for object detection\nfor free by leveraging state-of-the-art grounded image generation methods.\nUnlike existing works which generate individual object instances, require\nidentifying foreground followed by pasting on other images, we simplify to\ndirectly generating scene-centric images. In addition to the synthetic data,\nGen2Det also proposes a suite of techniques to best utilize the generated data,\nincluding image-level filtering, instance-level filtering, and better training\nrecipe to account for imperfections in the generation. Using Gen2Det, we show\nhealthy improvements on object detection and segmentation tasks under various\nsettings and agnostic to detection methods. In the long-tailed detection\nsetting on LVIS, Gen2Det improves the performance on rare categories by a large\nmargin while also significantly improving the performance on other categories,\ne.g. we see an improvement of 2.13 Box AP and 1.84 Mask AP over just training\non real data on LVIS with Mask R-CNN. In the low-data regime setting on COCO,\nGen2Det consistently improves both Box and Mask AP by 2.27 and 1.85 points. In\nthe most general detection setting, Gen2Det still demonstrates robust\nperformance gains, e.g. 
it improves the Box and Mask AP on COCO by 0.45 and\n0.32 points.\n","authors":["Saksham Suri","Fanyi Xiao","Animesh Sinha","Sean Chang Culatana","Raghuraman Krishnamoorthi","Chenchen Zhu","Abhinav Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2312.04566v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04565v1","updated":"2023-12-07T18:59:56Z","published":"2023-12-07T18:59:56Z","title":"MuRF: Multi-Baseline Radiance Fields","summary":" We present Multi-Baseline Radiance Fields (MuRF), a general feed-forward\napproach to solving sparse view synthesis under multiple different baseline\nsettings (small and large baselines, and different number of input views). To\nrender a target novel view, we discretize the 3D space into planes parallel to\nthe target image plane, and accordingly construct a target view frustum volume.\nSuch a target volume representation is spatially aligned with the target view,\nwhich effectively aggregates relevant information from the input views for\nhigh-quality rendering. It also facilitates subsequent radiance field\nregression with a convolutional network thanks to its axis-aligned nature. The\n3D context modeled by the convolutional network enables our method to synthesis\nsharper scene structures than prior works. Our MuRF achieves state-of-the-art\nperformance across multiple different baseline settings and diverse scenarios\nranging from simple objects (DTU) to complex indoor and outdoor scenes\n(RealEstate10K and LLFF). We also show promising zero-shot generalization\nabilities on the Mip-NeRF 360 dataset, demonstrating the general applicability\nof MuRF.\n","authors":["Haofei Xu","Anpei Chen","Yuedong Chen","Christos Sakaridis","Yulun Zhang","Marc Pollefeys","Andreas Geiger","Fisher Yu"],"pdf_url":"https://arxiv.org/pdf/2312.04565v1.pdf","comment":"Project page: https://haofeixu.github.io/murf/"},{"id":"http://arxiv.org/abs/2312.04564v1","updated":"2023-12-07T18:59:55Z","published":"2023-12-07T18:59:55Z","title":"EAGLES: Efficient Accelerated 3D Gaussians with Lightweight EncodingS","summary":" Recently, 3D Gaussian splatting (3D-GS) has gained popularity in novel-view\nscene synthesis. It addresses the challenges of lengthy training times and slow\nrendering speeds associated with Neural Radiance Fields (NeRFs). Through rapid,\ndifferentiable rasterization of 3D Gaussians, 3D-GS achieves real-time\nrendering and accelerated training. They, however, demand substantial memory\nresources for both training and storage, as they require millions of Gaussians\nin their point cloud representation for each scene. We present a technique\nutilizing quantized embeddings to significantly reduce memory storage\nrequirements and a coarse-to-fine training strategy for a faster and more\nstable optimization of the Gaussian point clouds. Our approach results in scene\nrepresentations with fewer Gaussians and quantized representations, leading to\nfaster training times and rendering speeds for real-time rendering of high\nresolution scenes. We reduce memory by more than an order of magnitude all\nwhile maintaining the reconstruction quality. We validate the effectiveness of\nour approach on a variety of datasets and scenes preserving the visual quality\nwhile consuming 10-20x less memory and faster training/inference speed. 
Project\npage and code is available https://efficientgaussian.github.io\n","authors":["Sharath Girish","Kamal Gupta","Abhinav Shrivastava"],"pdf_url":"https://arxiv.org/pdf/2312.04564v1.pdf","comment":"Website: https://efficientgaussian.github.io Code:\n https://github.com/Sharath-girish/efficientgaussian"},{"id":"http://arxiv.org/abs/2312.04563v1","updated":"2023-12-07T18:59:52Z","published":"2023-12-07T18:59:52Z","title":"Visual Geometry Grounded Deep Structure From Motion","summary":" Structure-from-motion (SfM) is a long-standing problem in the computer vision\ncommunity, which aims to reconstruct the camera poses and 3D structure of a\nscene from a set of unconstrained 2D images. Classical frameworks solve this\nproblem in an incremental manner by detecting and matching keypoints,\nregistering images, triangulating 3D points, and conducting bundle adjustment.\nRecent research efforts have predominantly revolved around harnessing the power\nof deep learning techniques to enhance specific elements (e.g., keypoint\nmatching), but are still based on the original, non-differentiable pipeline.\nInstead, we propose a new deep pipeline VGGSfM, where each component is fully\ndifferentiable and thus can be trained in an end-to-end manner. To this end, we\nintroduce new mechanisms and simplifications. First, we build on recent\nadvances in deep 2D point tracking to extract reliable pixel-accurate tracks,\nwhich eliminates the need for chaining pairwise matches. Furthermore, we\nrecover all cameras simultaneously based on the image and track features\ninstead of gradually registering cameras. Finally, we optimise the cameras and\ntriangulate 3D points via a differentiable bundle adjustment layer. We attain\nstate-of-the-art performance on three popular datasets, CO3D, IMC Phototourism,\nand ETH3D.\n","authors":["Jianyuan Wang","Nikita Karaev","Christian Rupprecht","David Novotny"],"pdf_url":"https://arxiv.org/pdf/2312.04563v1.pdf","comment":"8 figures. Project page: https://vggsfm.github.io/"},{"id":"http://arxiv.org/abs/2312.04560v1","updated":"2023-12-07T18:59:41Z","published":"2023-12-07T18:59:41Z","title":"NeRFiller: Completing Scenes via Generative 3D Inpainting","summary":" We propose NeRFiller, an approach that completes missing portions of a 3D\ncapture via generative 3D inpainting using off-the-shelf 2D visual generative\nmodels. Often parts of a captured 3D scene or object are missing due to mesh\nreconstruction failures or a lack of observations (e.g., contact regions, such\nas the bottom of objects, or hard-to-reach areas). We approach this challenging\n3D inpainting problem by leveraging a 2D inpainting diffusion model. We\nidentify a surprising behavior of these models, where they generate more 3D\nconsistent inpaints when images form a 2$\\times$2 grid, and show how to\ngeneralize this behavior to more than four images. We then present an iterative\nframework to distill these inpainted regions into a single consistent 3D scene.\nIn contrast to related works, we focus on completing scenes rather than\ndeleting foreground objects, and our approach does not require tight 2D object\nmasks or text. We compare our approach to relevant baselines adapted to our\nsetting on a variety of scenes, where NeRFiller creates the most 3D consistent\nand plausible scene completions. 
Our project page is at\nhttps://ethanweber.me/nerfiller.\n","authors":["Ethan Weber","Aleksander Hołyński","Varun Jampani","Saurabh Saxena","Noah Snavely","Abhishek Kar","Angjoo Kanazawa"],"pdf_url":"https://arxiv.org/pdf/2312.04560v1.pdf","comment":"Project page: https://ethanweber.me/nerfiller"},{"id":"http://arxiv.org/abs/2312.04561v1","updated":"2023-12-07T18:59:41Z","published":"2023-12-07T18:59:41Z","title":"GenDeF: Learning Generative Deformation Field for Video Generation","summary":" We offer a new perspective on approaching the task of video generation.\nInstead of directly synthesizing a sequence of frames, we propose to render a\nvideo by warping one static image with a generative deformation field (GenDeF).\nSuch a pipeline enjoys three appealing advantages. First, we can sufficiently\nreuse a well-trained image generator to synthesize the static image (also\ncalled canonical image), alleviating the difficulty in producing a video and\nthereby resulting in better visual quality. Second, we can easily convert a\ndeformation field to optical flows, making it possible to apply explicit\nstructural regularizations for motion modeling, leading to temporally\nconsistent results. Third, the disentanglement between content and motion\nallows users to process a synthesized video through processing its\ncorresponding static image without any tuning, facilitating many applications\nlike video editing, keypoint tracking, and video segmentation. Both qualitative\nand quantitative results on three common video generation benchmarks\ndemonstrate the superiority of our GenDeF method.\n","authors":["Wen Wang","Kecheng Zheng","Qiuyu Wang","Hao Chen","Zifan Shi","Ceyuan Yang","Yujun Shen","Chunhua Shen"],"pdf_url":"https://arxiv.org/pdf/2312.04561v1.pdf","comment":"Project page: https://aim-uofa.github.io/GenDeF/"},{"id":"http://arxiv.org/abs/2312.04559v1","updated":"2023-12-07T18:59:33Z","published":"2023-12-07T18:59:33Z","title":"PrimDiffusion: Volumetric Primitives Diffusion for 3D Human Generation","summary":" We present PrimDiffusion, the first diffusion-based framework for 3D human\ngeneration. Devising diffusion models for 3D human generation is difficult due\nto the intensive computational cost of 3D representations and the articulated\ntopology of 3D humans. To tackle these challenges, our key insight is operating\nthe denoising diffusion process directly on a set of volumetric primitives,\nwhich models the human body as a number of small volumes with radiance and\nkinematic information. This volumetric primitives representation marries the\ncapacity of volumetric representations with the efficiency of primitive-based\nrendering. Our PrimDiffusion framework has three appealing properties: 1)\ncompact and expressive parameter space for the diffusion model, 2) flexible 3D\nrepresentation that incorporates human prior, and 3) decoder-free rendering for\nefficient novel-view and novel-pose synthesis. Extensive experiments validate\nthat PrimDiffusion outperforms state-of-the-art methods in 3D human generation.\nNotably, compared to GAN-based methods, our PrimDiffusion supports real-time\nrendering of high-quality 3D humans at a resolution of $512\\times512$ once the\ndenoising process is done. 
We also demonstrate the flexibility of our framework\non training-free conditional generation such as texture transfer and 3D\ninpainting.\n","authors":["Zhaoxi Chen","Fangzhou Hong","Haiyi Mei","Guangcong Wang","Lei Yang","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2312.04559v1.pdf","comment":"NeurIPS 2023; Project page\n https://frozenburning.github.io/projects/primdiffusion/ Code available at\n https://github.com/FrozenBurning/PrimDiffusion"},{"id":"http://arxiv.org/abs/2312.04558v1","updated":"2023-12-07T18:59:31Z","published":"2023-12-07T18:59:31Z","title":"MonoGaussianAvatar: Monocular Gaussian Point-based Head Avatar","summary":" The ability to animate photo-realistic head avatars reconstructed from\nmonocular portrait video sequences represents a crucial step in bridging the\ngap between the virtual and real worlds. Recent advancements in head avatar\ntechniques, including explicit 3D morphable meshes (3DMM), point clouds, and\nneural implicit representation have been exploited for this ongoing research.\nHowever, 3DMM-based methods are constrained by their fixed topologies,\npoint-based approaches suffer from a heavy training burden due to the extensive\nquantity of points involved, and the last ones suffer from limitations in\ndeformation flexibility and rendering efficiency. In response to these\nchallenges, we propose MonoGaussianAvatar (Monocular Gaussian Point-based Head\nAvatar), a novel approach that harnesses 3D Gaussian point representation\ncoupled with a Gaussian deformation field to learn explicit head avatars from\nmonocular portrait videos. We define our head avatars with Gaussian points\ncharacterized by adaptable shapes, enabling flexible topology. These points\nexhibit movement with a Gaussian deformation field in alignment with the target\npose and expression of a person, facilitating efficient deformation.\nAdditionally, the Gaussian points have controllable shape, size, color, and\nopacity combined with Gaussian splatting, allowing for efficient training and\nrendering. Experiments demonstrate the superior performance of our method,\nwhich achieves state-of-the-art results among previous methods.\n","authors":["Yufan Chen","Lizhen Wang","Qijing Li","Hongjiang Xiao","Shengping Zhang","Hongxun Yao","Yebin Liu"],"pdf_url":"https://arxiv.org/pdf/2312.04558v1.pdf","comment":"The link to our projectpage is\n https://yufan1012.github.io/MonoGaussianAvatar"},{"id":"http://arxiv.org/abs/2312.04557v1","updated":"2023-12-07T18:59:30Z","published":"2023-12-07T18:59:30Z","title":"GenTron: Delving Deep into Diffusion Transformers for Image and Video\n Generation","summary":" In this study, we explore Transformer-based diffusion models for image and\nvideo generation. Despite the dominance of Transformer architectures in various\nfields due to their flexibility and scalability, the visual generative domain\nprimarily utilizes CNN-based U-Net architectures, particularly in\ndiffusion-based models. We introduce GenTron, a family of Generative models\nemploying Transformer-based diffusion, to address this gap. Our initial step\nwas to adapt Diffusion Transformers (DiTs) from class to text conditioning, a\nprocess involving thorough empirical exploration of the conditioning mechanism.\nWe then scale GenTron from approximately 900M to over 3B parameters, observing\nsignificant improvements in visual quality. Furthermore, we extend GenTron to\ntext-to-video generation, incorporating novel motion-free guidance to enhance\nvideo quality. 
In human evaluations against SDXL, GenTron achieves a 51.1% win\nrate in visual quality (with a 19.8% draw rate), and a 42.3% win rate in text\nalignment (with a 42.9% draw rate). GenTron also excels in the T2I-CompBench,\nunderscoring its strengths in compositional generation. We believe this work\nwill provide meaningful insights and serve as a valuable reference for future\nresearch.\n","authors":["Shoufa Chen","Mengmeng Xu","Jiawei Ren","Yuren Cong","Sen He","Yanping Xie","Animesh Sinha","Ping Luo","Tao Xiang","Juan-Manuel Perez-Rua"],"pdf_url":"https://arxiv.org/pdf/2312.04557v1.pdf","comment":"Technical report. Website:\n https://www.shoufachen.com/gentron_website/"},{"id":"http://arxiv.org/abs/2312.04554v1","updated":"2023-12-07T18:59:22Z","published":"2023-12-07T18:59:22Z","title":"Improved Visual Grounding through Self-Consistent Explanations","summary":" Vision-and-language models trained to match images with text can be combined\nwith visual explanation methods to point to the locations of specific objects\nin an image. Our work shows that the localization --\"grounding\"-- abilities of\nthese models can be further improved by finetuning for self-consistent visual\nexplanations. We propose a strategy for augmenting existing text-image datasets\nwith paraphrases using a large language model, and SelfEQ, a weakly-supervised\nstrategy on visual explanation maps for paraphrases that encourages\nself-consistency. Specifically, for an input textual phrase, we attempt to\ngenerate a paraphrase and finetune the model so that the phrase and paraphrase\nmap to the same region in the image. We posit that this both expands the\nvocabulary that the model is able to handle, and improves the quality of the\nobject locations highlighted by gradient-based visual explanation methods (e.g.\nGradCAM). We demonstrate that SelfEQ improves performance on Flickr30k,\nReferIt, and RefCOCO+ over a strong baseline method and several prior works.\nParticularly, comparing to other methods that do not use any type of box\nannotations, we obtain 84.07% on Flickr30k (an absolute improvement of 4.69%),\n67.40% on ReferIt (an absolute improvement of 7.68%), and 75.10%, 55.49% on\nRefCOCO+ test sets A and B respectively (an absolute improvement of 3.74% on\naverage).\n","authors":["Ruozhen He","Paola Cascante-Bonilla","Ziyan Yang","Alexander C. Berg","Vicente Ordonez"],"pdf_url":"https://arxiv.org/pdf/2312.04554v1.pdf","comment":"Project Page: https://catherine-r-he.github.io/SelfEQ/"},{"id":"http://arxiv.org/abs/2312.04553v1","updated":"2023-12-07T18:59:21Z","published":"2023-12-07T18:59:21Z","title":"SPIDeRS: Structured Polarization for Invisible Depth and Reflectance\n Sensing","summary":" Can we capture shape and reflectance in stealth? Such capability would be\nvaluable for many application domains in vision, xR, robotics, and HCI. We\nintroduce Structured Polarization, the first depth and reflectance sensing\nmethod using patterns of polarized light (SPIDeRS). The key idea is to modulate\nthe angle of linear polarization (AoLP) of projected light at each pixel. The\nuse of polarization makes it invisible and lets us recover not only depth but\nalso directly surface normals and even reflectance. We implement SPIDeRS with a\nliquid crystal spatial light modulator (SLM) and a polarimetric camera. We\nderive a novel method for robustly extracting the projected structured\npolarization pattern from the polarimetric object appearance. 
We evaluate the\neffectiveness of SPIDeRS by applying it to a number of real-world objects. The\nresults show that our method successfully reconstructs object shapes of various\nmaterials and is robust to diffuse reflection and ambient light. We also\ndemonstrate relighting using recovered surface normals and reflectance. We\nbelieve SPIDeRS opens a new avenue of polarization use in visual sensing.\n","authors":["Tomoki Ichikawa","Shohei Nobuhara","Ko Nishino"],"pdf_url":"https://arxiv.org/pdf/2312.04553v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04552v1","updated":"2023-12-07T18:59:20Z","published":"2023-12-07T18:59:20Z","title":"Generating Illustrated Instructions","summary":" We introduce the new task of generating Illustrated Instructions, i.e.,\nvisual instructions customized to a user's needs. We identify desiderata unique\nto this task, and formalize it through a suite of automatic and human\nevaluation metrics, designed to measure the validity, consistency, and efficacy\nof the generations. We combine the power of large language models (LLMs)\ntogether with strong text-to-image generation diffusion models to propose a\nsimple approach called StackedDiffusion, which generates such illustrated\ninstructions given text as input. The resulting model strongly outperforms\nbaseline approaches and state-of-the-art multimodal LLMs; and in 30% of cases,\nusers even prefer it to human-generated articles. Most notably, it enables\nvarious new and exciting applications far beyond what static articles on the\nweb can provide, such as personalized instructions complete with intermediate\nsteps and pictures in response to a user's individual situation.\n","authors":["Sachit Menon","Ishan Misra","Rohit Girdhar"],"pdf_url":"https://arxiv.org/pdf/2312.04552v1.pdf","comment":"Project website:\n http://facebookresearch.github.io/IllustratedInstructions"},{"id":"http://arxiv.org/abs/2312.04551v1","updated":"2023-12-07T18:59:18Z","published":"2023-12-07T18:59:18Z","title":"Free3D: Consistent Novel View Synthesis without 3D Representation","summary":" We introduce Free3D, a simple approach designed for open-set novel view\nsynthesis (NVS) from a single image. Similar to Zero-1-to-3, we start from a\npre-trained 2D image generator for generalization, and fine-tune it for NVS.\nCompared to recent and concurrent works, we obtain significant improvements\nwithout resorting to an explicit 3D representation, which is slow and\nmemory-consuming or training an additional 3D network. We do so by encoding\nbetter the target camera pose via a new per-pixel ray conditioning\nnormalization (RCN) layer. The latter injects pose information in the\nunderlying 2D image generator by telling each pixel its specific viewing\ndirection. We also improve multi-view consistency via a light-weight multi-view\nattention layer and multi-view noise sharing. We train Free3D on the Objaverse\ndataset and demonstrate excellent generalization to various new categories in\nseveral new datasets, including OminiObject3D and GSO. We hope our simple and\neffective approach will serve as a solid baseline and help future research in\nNVS with more accuracy pose. 
The project page is available at\nhttps://chuanxiaz.com/free3d/.\n","authors":["Chuanxia Zheng","Andrea Vedaldi"],"pdf_url":"https://arxiv.org/pdf/2312.04551v1.pdf","comment":"webpage: https://chuanxiaz.com/free3d/"},{"id":"http://arxiv.org/abs/2312.04548v1","updated":"2023-12-07T18:59:14Z","published":"2023-12-07T18:59:14Z","title":"Multiview Aerial Visual Recognition (MAVREC): Can Multi-view Improve\n Aerial Visual Perception?","summary":" Despite the commercial abundance of UAVs, aerial data acquisition remains\nchallenging, and the existing Asia and North America-centric open-source UAV\ndatasets are small-scale or low-resolution and lack diversity in scene\ncontextuality. Additionally, the color content of the scenes, solar-zenith\nangle, and population density of different geographies influence the data\ndiversity. These two factors conjointly render suboptimal aerial-visual\nperception of the deep neural network (DNN) models trained primarily on the\nground-view data, including the open-world foundational models.\n To pave the way for a transformative era of aerial detection, we present\nMultiview Aerial Visual RECognition or MAVREC, a video dataset where we record\nsynchronized scenes from different perspectives -- ground camera and\ndrone-mounted camera. MAVREC consists of around 2.5 hours of industry-standard\n2.7K resolution video sequences, more than 0.5 million frames, and 1.1 million\nannotated bounding boxes. This makes MAVREC the largest ground and aerial-view\ndataset, and the fourth largest among all drone-based datasets across all\nmodalities and tasks. Through our extensive benchmarking on MAVREC, we\nrecognize that augmenting object detectors with ground-view images from the\ncorresponding geographical location is a superior pre-training strategy for\naerial detection. Building on this strategy, we benchmark MAVREC with a\ncurriculum-based semi-supervised object detection approach that leverages\nlabeled (ground and aerial) and unlabeled (only aerial) images to enhance the\naerial detection. We publicly release the MAVREC dataset:\nhttps://mavrec.github.io.\n","authors":["Aritra Dutta","Srijan Das","Jacob Nielsen","Rajatsubhra Chakraborty","Mubarak Shah"],"pdf_url":"https://arxiv.org/pdf/2312.04548v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04549v1","updated":"2023-12-07T18:59:14Z","published":"2023-12-07T18:59:14Z","title":"PlayFusion: Skill Acquisition via Diffusion from Language-Annotated Play","summary":" Learning from unstructured and uncurated data has become the dominant\nparadigm for generative approaches in language and vision. Such unstructured\nand unguided behavior data, commonly known as play, is also easier to collect\nin robotics but much more difficult to learn from due to its inherently\nmultimodal, noisy, and suboptimal nature. In this paper, we study this problem\nof learning goal-directed skill policies from unstructured play data which is\nlabeled with language in hindsight. Specifically, we leverage advances in\ndiffusion models to learn a multi-task diffusion model to extract robotic\nskills from play data. Using a conditional denoising diffusion process in the\nspace of states and actions, we can gracefully handle the complexity and\nmultimodality of play data and generate diverse and interesting robot\nbehaviors. To make diffusion models more useful for skill learning, we\nencourage robotic agents to acquire a vocabulary of skills by introducing\ndiscrete bottlenecks into the conditional behavior generation process. 
In our\nexperiments, we demonstrate the effectiveness of our approach across a wide\nvariety of environments in both simulation and the real world. Results\nvisualizations and videos at https://play-fusion.github.io\n","authors":["Lili Chen","Shikhar Bahl","Deepak Pathak"],"pdf_url":"https://arxiv.org/pdf/2312.04549v1.pdf","comment":"In CoRL 2023. Website at https://play-fusion.github.io"},{"id":"http://arxiv.org/abs/2312.04547v1","updated":"2023-12-07T18:58:59Z","published":"2023-12-07T18:58:59Z","title":"Digital Life Project: Autonomous 3D Characters with Social Intelligence","summary":" In this work, we present Digital Life Project, a framework utilizing language\nas the universal medium to build autonomous 3D characters, who are capable of\nengaging in social interactions and expressing with articulated body motions,\nthereby simulating life in a digital environment. Our framework comprises two\nprimary components: 1) SocioMind: a meticulously crafted digital brain that\nmodels personalities with systematic few-shot exemplars, incorporates a\nreflection process based on psychology principles, and emulates autonomy by\ninitiating dialogue topics; 2) MoMat-MoGen: a text-driven motion synthesis\nparadigm for controlling the character's digital body. It integrates motion\nmatching, a proven industry technique to ensure motion quality, with\ncutting-edge advancements in motion generation for diversity. Extensive\nexperiments demonstrate that each module achieves state-of-the-art performance\nin its respective domain. Collectively, they enable virtual characters to\ninitiate and sustain dialogues autonomously, while evolving their\nsocio-psychological states. Concurrently, these characters can perform\ncontextually relevant bodily movements. Additionally, a motion captioning\nmodule further allows the virtual character to recognize and appropriately\nrespond to human players' actions. Homepage: https://digital-life-project.com/\n","authors":["Zhongang Cai","Jianping Jiang","Zhongfei Qing","Xinying Guo","Mingyuan Zhang","Zhengyu Lin","Haiyi Mei","Chen Wei","Ruisi Wang","Wanqi Yin","Xiangyu Fan","Han Du","Liang Pan","Peng Gao","Zhitao Yang","Yang Gao","Jiaqi Li","Tianxiang Ren","Yukun Wei","Xiaogang Wang","Chen Change Loy","Lei Yang","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2312.04547v1.pdf","comment":"Homepage: https://digital-life-project.com/"},{"id":"http://arxiv.org/abs/2302.01330v3","updated":"2023-12-07T18:58:30Z","published":"2023-02-02T18:59:16Z","title":"SceneDreamer: Unbounded 3D Scene Generation from 2D Image Collections","summary":" In this work, we present SceneDreamer, an unconditional generative model for\nunbounded 3D scenes, which synthesizes large-scale 3D landscapes from random\nnoise. Our framework is learned from in-the-wild 2D image collections only,\nwithout any 3D annotations. At the core of SceneDreamer is a principled\nlearning paradigm comprising 1) an efficient yet expressive 3D scene\nrepresentation, 2) a generative scene parameterization, and 3) an effective\nrenderer that can leverage the knowledge from 2D images. Our approach begins\nwith an efficient bird's-eye-view (BEV) representation generated from simplex\nnoise, which includes a height field for surface elevation and a semantic field\nfor detailed scene semantics. This BEV scene representation enables 1)\nrepresenting a 3D scene with quadratic complexity, 2) disentangled geometry and\nsemantics, and 3) efficient training. 
Moreover, we propose a novel generative\nneural hash grid to parameterize the latent space based on 3D positions and\nscene semantics, aiming to encode generalizable features across various scenes.\nLastly, a neural volumetric renderer, learned from 2D image collections through\nadversarial training, is employed to produce photorealistic images. Extensive\nexperiments demonstrate the effectiveness of SceneDreamer and superiority over\nstate-of-the-art methods in generating vivid yet diverse unbounded 3D worlds.\n","authors":["Zhaoxi Chen","Guangcong Wang","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2302.01330v3.pdf","comment":"IEEE Transactions on Pattern Analysis & Machine Intelligence (TPAMI)\n 2023; Project Page https://scene-dreamer.github.io/ Code\n https://github.com/FrozenBurning/SceneDreamer"},{"id":"http://arxiv.org/abs/2312.04543v1","updated":"2023-12-07T18:58:09Z","published":"2023-12-07T18:58:09Z","title":"HyperDreamer: Hyper-Realistic 3D Content Generation and Editing from a\n Single Image","summary":" 3D content creation from a single image is a long-standing yet highly\ndesirable task. Recent advances introduce 2D diffusion priors, yielding\nreasonable results. However, existing methods are not hyper-realistic enough\nfor post-generation usage, as users cannot view, render and edit the resulting\n3D content from a full range. To address these challenges, we introduce\nHyperDreamer with several key designs and appealing properties: 1) Viewable:\n360 degree mesh modeling with high-resolution textures enables the creation of\nvisually compelling 3D models from a full range of observation points. 2)\nRenderable: Fine-grained semantic segmentation and data-driven priors are\nincorporated as guidance to learn reasonable albedo, roughness, and specular\nproperties of the materials, enabling semantic-aware arbitrary material\nestimation. 3) Editable: For a generated model or their own data, users can\ninteractively select any region via a few clicks and efficiently edit the\ntexture with text-based guidance. Extensive experiments demonstrate the\neffectiveness of HyperDreamer in modeling region-aware materials with\nhigh-resolution textures and enabling user-friendly editing. We believe that\nHyperDreamer holds promise for advancing 3D content creation and finding\napplications in various domains.\n","authors":["Tong Wu","Zhibing Li","Shuai Yang","Pan Zhang","Xinggang Pan","Jiaqi Wang","Dahua Lin","Ziwei Liu"],"pdf_url":"https://arxiv.org/pdf/2312.04543v1.pdf","comment":"SIGGRAPH Asia 2023 (conference track). Project page:\n https://ys-imtech.github.io/HyperDreamer/"},{"id":"http://arxiv.org/abs/2312.04540v1","updated":"2023-12-07T18:57:03Z","published":"2023-12-07T18:57:03Z","title":"Sim-to-Real Causal Transfer: A Metric Learning Approach to\n Causally-Aware Interaction Representations","summary":" Modeling spatial-temporal interactions among neighboring agents is at the\nheart of multi-agent problems such as motion forecasting and crowd navigation.\nDespite notable progress, it remains unclear to which extent modern\nrepresentations can capture the causal relationships behind agent interactions.\nIn this work, we take an in-depth look at the causal awareness of these\nrepresentations, from computational formalism to real-world practice. First, we\ncast doubt on the notion of non-causal robustness studied in the recent\nCausalAgents benchmark. 
We show that recent representations are already\npartially resilient to perturbations of non-causal agents, and yet modeling\nindirect causal effects involving mediator agents remains challenging. To\naddress this challenge, we introduce a metric learning approach that\nregularizes latent representations with causal annotations. Our controlled\nexperiments show that this approach not only leads to higher degrees of causal\nawareness but also yields stronger out-of-distribution robustness. To further\noperationalize it in practice, we propose a sim-to-real causal transfer method\nvia cross-domain multi-task learning. Experiments on pedestrian datasets show\nthat our method can substantially boost generalization, even in the absence of\nreal-world causal annotations. We hope our work provides a new perspective on\nthe challenges and potential pathways towards causally-aware representations of\nmulti-agent interactions. Our code is available at\nhttps://github.com/socialcausality.\n","authors":["Yuejiang Liu","Ahmad Rahimi","Po-Chien Luan","Frano Rajič","Alexandre Alahi"],"pdf_url":"https://arxiv.org/pdf/2312.04540v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2312.04539v1","updated":"2023-12-07T18:55:52Z","published":"2023-12-07T18:55:52Z","title":"Self-Guided Open-Vocabulary Semantic Segmentation","summary":" Vision-Language Models (VLMs) have emerged as promising tools for open-ended\nimage understanding tasks, including open vocabulary segmentation. Yet, direct\napplication of such VLMs to segmentation is non-trivial, since VLMs are trained\nwith image-text pairs and naturally lack pixel-level granularity. Recent works\nhave made advancements in bridging this gap, often by leveraging the shared\nimage-text space in which the image and a provided text prompt are represented.\nIn this paper, we challenge the capabilities of VLMs further and tackle\nopen-vocabulary segmentation without the need for any textual input. To this\nend, we propose a novel Self-Guided Semantic Segmentation (Self-Seg) framework.\nSelf-Seg is capable of automatically detecting relevant class names from\nclustered BLIP embeddings and using these for accurate semantic segmentation.\nIn addition, we propose an LLM-based Open-Vocabulary Evaluator (LOVE) to\neffectively assess predicted open-vocabulary class names. We achieve\nstate-of-the-art results on Pascal VOC, ADE20K and CityScapes for\nopen-vocabulary segmentation without given class names, as well as competitive\nperformance with methods where class names are given. All code and data will be\nreleased.\n","authors":["Osman Ülger","Maksymilian Kulicki","Yuki Asano","Martin R. Oswald"],"pdf_url":"https://arxiv.org/pdf/2312.04539v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04534v1","updated":"2023-12-07T18:53:18Z","published":"2023-12-07T18:53:18Z","title":"PICTURE: PhotorealistIC virtual Try-on from UnconstRained dEsigns","summary":" In this paper, we propose a novel virtual try-on from unconstrained designs\n(ucVTON) task to enable photorealistic synthesis of personalized composite\nclothing on input human images. Unlike prior arts constrained by specific input\ntypes, our method allows flexible specification of style (text or image) and\ntexture (full garment, cropped sections, or texture patches) conditions. To\naddress the entanglement challenge when using full garment images as\nconditions, we develop a two-stage pipeline with explicit disentanglement of\nstyle and texture. 
In the first stage, we generate a human parsing map\nreflecting the desired style conditioned on the input. In the second stage, we\ncomposite textures onto the parsing map areas based on the texture input. To\nrepresent complex and non-stationary textures that have never been achieved in\nprevious fashion editing works, we first propose extracting hierarchical and\nbalanced CLIP features and applying position encoding in VTON. Experiments\ndemonstrate superior synthesis quality and personalization enabled by our\nmethod. The flexible control over style and texture mixing brings virtual\ntry-on to a new level of user experience for online shopping and fashion\ndesign.\n","authors":["Shuliang Ning","Duomin Wang","Yipeng Qin","Zirong Jin","Baoyuan Wang","Xiaoguang Han"],"pdf_url":"https://arxiv.org/pdf/2312.04534v1.pdf","comment":"Project page: https://ningshuliang.github.io/2023/Arxiv/index.html"},{"id":"http://arxiv.org/abs/2303.13472v2","updated":"2023-12-07T18:52:41Z","published":"2023-03-23T17:43:17Z","title":"Plotting Behind the Scenes: Towards Learnable Game Engines","summary":" Neural video game simulators emerged as powerful tools to generate and edit\nvideos. Their idea is to represent games as the evolution of an environment's\nstate driven by the actions of its agents. While such a paradigm enables users\nto play a game action-by-action, its rigidity precludes more semantic forms of\ncontrol. To overcome this limitation, we augment game models with prompts\nspecified as a set of natural language actions and desired states. The result-a\nPromptable Game Model (PGM)-makes it possible for a user to play the game by\nprompting it with high- and low-level action sequences. Most captivatingly, our\nPGM unlocks the director's mode, where the game is played by specifying goals\nfor the agents in the form of a prompt. This requires learning \"game AI\",\nencapsulated by our animation model, to navigate the scene using high-level\nconstraints, play against an adversary, and devise a strategy to win a point.\nTo render the resulting state, we use a compositional NeRF representation\nencapsulated in our synthesis model. To foster future research, we present\nnewly collected, annotated and calibrated Tennis and Minecraft datasets. Our\nmethod significantly outperforms existing neural video game simulators in terms\nof rendering quality and unlocks applications beyond the capabilities of the\ncurrent state of the art. Our framework, data, and models are available at\nhttps://snap-research.github.io/promptable-game-models/.\n","authors":["Willi Menapace","Aliaksandr Siarohin","Stéphane Lathuilière","Panos Achlioptas","Vladislav Golyanik","Sergey Tulyakov","Elisa Ricci"],"pdf_url":"https://arxiv.org/pdf/2303.13472v2.pdf","comment":"ACM Transactions on Graphics \\c{opyright} Copyright is held by the\n owner/author(s) 2023. This is the author's version of the work. It is posted\n here for your personal use. Not for redistribution. The definitive Version of\n Record was published in ACM Transactions on Graphics,\n http://dx.doi.org/10.1145/3635705"},{"id":"http://arxiv.org/abs/2312.04533v1","updated":"2023-12-07T18:51:19Z","published":"2023-12-07T18:51:19Z","title":"Dream2Real: Zero-Shot 3D Object Rearrangement with Vision-Language\n Models","summary":" We introduce Dream2Real, a robotics framework which integrates\nvision-language models (VLMs) trained on 2D data into a 3D object rearrangement\npipeline. 
This is achieved by the robot autonomously constructing a 3D\nrepresentation of the scene, where objects can be rearranged virtually and an\nimage of the resulting arrangement rendered. These renders are evaluated by a\nVLM, so that the arrangement which best satisfies the user instruction is\nselected and recreated in the real world with pick-and-place. This enables\nlanguage-conditioned rearrangement to be performed zero-shot, without needing\nto collect a training dataset of example arrangements. Results on a series of\nreal-world tasks show that this framework is robust to distractors,\ncontrollable by language, capable of understanding complex multi-object\nrelations, and readily applicable to both tabletop and 6-DoF rearrangement\ntasks.\n","authors":["Ivan Kapelyukh","Yifei Ren","Ignacio Alzugaray","Edward Johns"],"pdf_url":"https://arxiv.org/pdf/2312.04533v1.pdf","comment":"Project webpage with videos: https://www.robot-learning.uk/dream2real"},{"id":"http://arxiv.org/abs/2312.04530v1","updated":"2023-12-07T18:50:01Z","published":"2023-12-07T18:50:01Z","title":"Camera Height Doesn't Change: Unsupervised Monocular Scale-Aware\n Road-Scene Depth Estimation","summary":" Monocular depth estimators either require explicit scale supervision through\nauxiliary sensors or suffer from scale ambiguity, which renders them difficult\nto deploy in downstream applications. A possible source of scale is the sizes\nof objects found in the scene, but inaccurate localization makes them difficult\nto exploit. In this paper, we introduce a novel scale-aware monocular depth\nestimation method called StableCamH that does not require any auxiliary sensor\nor supervision. The key idea is to exploit prior knowledge of object heights in\nthe scene but aggregate the height cues into a single invariant measure common\nto all frames in a road video sequence, namely the camera height. By\nformulating monocular depth estimation as camera height optimization, we\nachieve robust and accurate unsupervised end-to-end training. To realize\nStableCamH, we devise a novel learning-based size prior that can directly\nconvert car appearance into its dimensions. Extensive experiments on KITTI and\nCityscapes show the effectiveness of StableCamH, its state-of-the-art accuracy\ncompared with related methods, and its generalizability. The training framework\nof StableCamH can be used for any monocular depth estimation method and will\nhopefully become a fundamental building block for further work.\n","authors":["Genki Kinoshita","Ko Nishino"],"pdf_url":"https://arxiv.org/pdf/2312.04530v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04529v1","updated":"2023-12-07T18:50:00Z","published":"2023-12-07T18:50:00Z","title":"Diffusion Reflectance Map: Single-Image Stochastic Inverse Rendering of\n Illumination and Reflectance","summary":" Reflectance bounds the frequency spectrum of illumination in the object\nappearance. In this paper, we introduce the first stochastic inverse rendering\nmethod, which recovers the full frequency spectrum of an illumination jointly\nwith the object reflectance from a single image. Our key idea is to solve this\nblind inverse problem in the reflectance map, an appearance representation\ninvariant to the underlying geometry, by learning to reverse the image\nformation with a novel diffusion model which we refer to as the Diffusion\nReflectance Map Network (DRMNet). 
Given an observed reflectance map converted\nand completed from the single input image, DRMNet generates a reflectance map\ncorresponding to a perfect mirror sphere while jointly estimating the\nreflectance. The forward process can be understood as gradually filtering a\nnatural illumination with lower and lower frequency reflectance and additive\nGaussian noise. DRMNet learns to invert this process with two subnetworks,\nIllNet and RefNet, which work in concert towards this joint estimation. The\nnetwork is trained on an extensive synthetic dataset and is demonstrated to\ngeneralize to real images, showing state-of-the-art accuracy on established\ndatasets.\n","authors":["Yuto Enyo","Ko Nishino"],"pdf_url":"https://arxiv.org/pdf/2312.04529v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04527v1","updated":"2023-12-07T18:46:47Z","published":"2023-12-07T18:46:47Z","title":"Correspondences of the Third Kind: Camera Pose Estimation from Object\n Reflection","summary":" Computer vision has long relied on two kinds of correspondences: pixel\ncorrespondences in images and 3D correspondences on object surfaces. Is there\nanother kind, and if there is, what can they do for us? In this paper, we\nintroduce correspondences of the third kind we call reflection correspondences\nand show that they can help estimate camera pose by just looking at objects\nwithout relying on the background. Reflection correspondences are point\ncorrespondences in the reflected world, i.e., the scene reflected by the object\nsurface. The object geometry and reflectance alters the scene geometrically and\nradiometrically, respectively, causing incorrect pixel correspondences.\nGeometry recovered from each image is also hampered by distortions, namely\ngeneralized bas-relief ambiguity, leading to erroneous 3D correspondences. We\nshow that reflection correspondences can resolve the ambiguities arising from\nthese distortions. We introduce a neural correspondence estimator and a RANSAC\nalgorithm that fully leverages all three kinds of correspondences for robust\nand accurate joint camera pose and object shape estimation just from the object\nappearance. The method expands the horizon of numerous downstream tasks,\nincluding camera pose estimation for appearance modeling (e.g., NeRF) and\nmotion estimation of reflective objects (e.g., cars on the road), to name a\nfew, as it relieves the requirement of overlapping background.\n","authors":["Kohei Yamashita","Vincent Lepetit","Ko Nishino"],"pdf_url":"https://arxiv.org/pdf/2312.04527v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04524v1","updated":"2023-12-07T18:43:45Z","published":"2023-12-07T18:43:45Z","title":"RAVE: Randomized Noise Shuffling for Fast and Consistent Video Editing\n with Diffusion Models","summary":" Recent advancements in diffusion-based models have demonstrated significant\nsuccess in generating images from text. However, video editing models have not\nyet reached the same level of visual quality and user control. To address this,\nwe introduce RAVE, a zero-shot video editing method that leverages pre-trained\ntext-to-image diffusion models without additional training. RAVE takes an input\nvideo and a text prompt to produce high-quality videos while preserving the\noriginal motion and semantic structure. It employs a novel noise shuffling\nstrategy, leveraging spatio-temporal interactions between frames, to produce\ntemporally consistent videos faster than existing methods. 
It is also efficient\nin terms of memory requirements, allowing it to handle longer videos. RAVE is\ncapable of a wide range of edits, from local attribute modifications to shape\ntransformations. In order to demonstrate the versatility of RAVE, we create a\ncomprehensive video evaluation dataset ranging from object-focused scenes to\ncomplex human activities like dancing and typing, and dynamic scenes featuring\nswimming fish and boats. Our qualitative and quantitative experiments highlight\nthe effectiveness of RAVE in diverse video editing scenarios compared to\nexisting methods. Our code, dataset and videos can be found at\nhttps://rave-video.github.io.\n","authors":["Ozgur Kara","Bariscan Kurtkaya","Hidir Yesiltepe","James M. Rehg","Pinar Yanardag"],"pdf_url":"https://arxiv.org/pdf/2312.04524v1.pdf","comment":"Project webpage: https://rave-video.github.io , Github:\n http://github.com/rehg-lab/RAVE"},{"id":"http://arxiv.org/abs/2312.04521v1","updated":"2023-12-07T18:41:21Z","published":"2023-12-07T18:41:21Z","title":"Multimodal Industrial Anomaly Detection by Crossmodal Feature Mapping","summary":" The paper explores the industrial multimodal Anomaly Detection (AD) task,\nwhich exploits point clouds and RGB images to localize anomalies. We introduce\na novel light and fast framework that learns to map features from one modality\nto the other on nominal samples. At test time, anomalies are detected by\npinpointing inconsistencies between observed and mapped features. Extensive\nexperiments show that our approach achieves state-of-the-art detection and\nsegmentation performance in both the standard and few-shot settings on the\nMVTec 3D-AD dataset while achieving faster inference and occupying less memory\nthan previous multimodal AD methods. Moreover, we propose a layer-pruning\ntechnique to improve memory and time efficiency with a marginal sacrifice in\nperformance.\n","authors":["Alex Costanzino","Pierluigi Zama Ramirez","Giuseppe Lisanti","Luigi Di Stefano"],"pdf_url":"https://arxiv.org/pdf/2312.04521v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04519v1","updated":"2023-12-07T18:38:39Z","published":"2023-12-07T18:38:39Z","title":"Bootstrapping Autonomous Radars with Self-Supervised Learning","summary":" The perception of autonomous vehicles using radars has attracted increased\nresearch interest due to its ability to operate in fog and bad weather. However,\ntraining radar models is hindered by the cost and difficulty of annotating\nlarge-scale radar data. To overcome this bottleneck, we propose a\nself-supervised learning framework to leverage the large amount of unlabeled\nradar data to pre-train radar-only embeddings for self-driving perception\ntasks. The proposed method combines radar-to-radar and radar-to-vision\ncontrastive losses to learn a general representation from unlabeled radar\nheatmaps paired with their corresponding camera images. 
When used for\ndownstream object detection, we demonstrate that the proposed self-supervision\nframework can improve the accuracy of state-of-the-art supervised baselines by\n5.8% in mAP.\n","authors":["Yiduo Hao","Sohrab Madani","Junfeng Guan","Mohammed Alloulah","Saurabh Gupta","Haitham Hassanieh"],"pdf_url":"https://arxiv.org/pdf/2312.04519v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04494v1","updated":"2023-12-07T18:13:42Z","published":"2023-12-07T18:13:42Z","title":"AVA: Towards Autonomous Visualization Agents through Visual\n Perception-Driven Decision-Making","summary":" With recent advances in multi-modal foundation models, the previously\ntext-only large language models (LLM) have evolved to incorporate visual input,\nopening up unprecedented opportunities for various applications in\nvisualization. Our work explores the utilization of the visual perception\nability of multi-modal LLMs to develop Autonomous Visualization Agents (AVAs)\nthat can interpret and accomplish user-defined visualization objectives through\nnatural language. We propose the first framework for the design of AVAs and\npresent several usage scenarios intended to demonstrate the general\napplicability of the proposed paradigm. The addition of visual perception\nallows AVAs to act as the virtual visualization assistant for domain experts\nwho may lack the knowledge or expertise in fine-tuning visualization outputs.\nOur preliminary exploration and proof-of-concept agents suggest that this\napproach can be widely applicable whenever the choices of appropriate\nvisualization parameters require the interpretation of previous visual output.\nFeedback from unstructured interviews with experts in AI research, medical\nvisualization, and radiology has been incorporated, highlighting the\npracticality and potential of AVAs. Our study indicates that AVAs represent a\ngeneral paradigm for designing intelligent visualization systems that can\nachieve high-level visualization goals, which pave the way for developing\nexpert-level visualization agents in the future.\n","authors":["Shusen Liu","Haichao Miao","Zhimin Li","Matthew Olson","Valerio Pascucci","Peer-Timo Bremer"],"pdf_url":"https://arxiv.org/pdf/2312.04494v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.03907v2","updated":"2023-12-07T18:04:30Z","published":"2023-05-06T02:53:13Z","title":"Listen to Look into the Future: Audio-Visual Egocentric Gaze\n Anticipation","summary":" Egocentric gaze anticipation serves as a key building block for the emerging\ncapability of Augmented Reality. Notably, gaze behavior is driven by both\nvisual cues and audio signals during daily activities. Motivated by this\nobservation, we introduce the first model that leverages both the video and\naudio modalities for egocentric gaze anticipation. Specifically, we propose a\nContrastive Spatial-Temporal Separable (CSTS) fusion approach that adopts two\nmodules to separately capture audio-visual correlations in spatial and temporal\ndimensions, and applies a contrastive loss on the re-weighted audio-visual\nfeatures from fusion modules for representation learning. We conduct extensive\nablation studies and thorough analysis using two egocentric video datasets:\nEgo4D and Aria, to validate our model design. 
We also demonstrate that our model\noutperforms prior state-of-the-art methods by at least +1.9% and +1.6%.\nMoreover, we provide visualizations to show the gaze anticipation results and\nprovide additional insights into audio-visual representation learning.\n","authors":["Bolin Lai","Fiona Ryan","Wenqi Jia","Miao Liu","James M. Rehg"],"pdf_url":"https://arxiv.org/pdf/2305.03907v2.pdf","comment":"17 pages"},{"id":"http://arxiv.org/abs/2312.04484v1","updated":"2023-12-07T17:59:53Z","published":"2023-12-07T17:59:53Z","title":"FRNet: Frustum-Range Networks for Scalable LiDAR Segmentation","summary":" LiDAR segmentation is crucial for autonomous driving systems. The recent\nrange-view approaches are promising for real-time processing. However, they\nsuffer inevitably from corrupted contextual information and rely heavily on\npost-processing techniques for prediction refinement. In this work, we propose\na simple yet powerful FRNet that restores the contextual information of the\nrange image pixels with corresponding frustum LiDAR points. Firstly, a frustum\nfeature encoder module is used to extract per-point features within the frustum\nregion, which preserves scene consistency and is crucial for point-level\npredictions. Next, a frustum-point fusion module is introduced to update\nper-point features hierarchically, which enables each point to extract more\nsurrounding information via the frustum features. Finally, a head fusion module\nis used to fuse features at different levels for final semantic prediction.\nExtensive experiments on four popular LiDAR segmentation benchmarks under\nvarious task setups demonstrate our superiority. FRNet achieves competitive\nperformance while maintaining high efficiency. The code is publicly available.\n","authors":["Xiang Xu","Lingdong Kong","Hui Shuai","Qingshan Liu"],"pdf_url":"https://arxiv.org/pdf/2312.04484v1.pdf","comment":"Preprint; 20 pages, 9 figures, 9 tables; Code at\n https://github.com/Xiangxu-0103/FRNet"},{"id":"http://arxiv.org/abs/2312.04483v1","updated":"2023-12-07T17:59:07Z","published":"2023-12-07T17:59:07Z","title":"Hierarchical Spatio-temporal Decoupling for Text-to-Video Generation","summary":" Despite diffusion models having shown powerful abilities to generate\nphotorealistic images, generating videos that are realistic and diverse still\nremains in its infancy. One of the key reasons is that current methods\nintertwine spatial content and temporal dynamics together, leading to a notably\nincreased complexity of text-to-video generation (T2V). In this work, we\npropose HiGen, a diffusion model-based method that improves performance by\ndecoupling the spatial and temporal factors of videos from two perspectives,\ni.e., structure level and content level. At the structure level, we decompose\nthe T2V task into two steps, including spatial reasoning and temporal\nreasoning, using a unified denoiser. Specifically, we generate spatially\ncoherent priors using text during spatial reasoning and then generate\ntemporally coherent motions from these priors during temporal reasoning. At the\ncontent level, we extract two subtle cues from the content of the input video\nthat can express motion and appearance changes, respectively. These two cues\nthen guide the model's training for generating videos, enabling flexible\ncontent variations and enhancing temporal stability. Through the decoupled\nparadigm, HiGen can effectively reduce the complexity of this task and generate\nrealistic videos with semantic accuracy and motion stability. 
Extensive\nexperiments demonstrate the superior performance of HiGen over the\nstate-of-the-art T2V methods.\n","authors":["Zhiwu Qing","Shiwei Zhang","Jiayu Wang","Xiang Wang","Yujie Wei","Yingya Zhang","Changxin Gao","Nong Sang"],"pdf_url":"https://arxiv.org/pdf/2312.04483v1.pdf","comment":"Project page: https://higen-t2v.github.io/"},{"id":"http://arxiv.org/abs/2104.12928v4","updated":"2023-12-07T17:58:04Z","published":"2021-04-27T01:02:15Z","title":"If your data distribution shifts, use self-learning","summary":" We demonstrate that self-learning techniques like entropy minimization and\npseudo-labeling are simple and effective at improving performance of a deployed\ncomputer vision model under systematic domain shifts. We conduct a wide range\nof large-scale experiments and show consistent improvements irrespective of the\nmodel architecture, the pre-training technique or the type of distribution\nshift. At the same time, self-learning is simple to use in practice because it\ndoes not require knowledge or access to the original training data or scheme,\nis robust to hyperparameter choices, is straightforward to implement and\nrequires only a few adaptation epochs. This makes self-learning techniques\nhighly attractive for any practitioner who applies machine learning algorithms\nin the real world. We present state-of-the-art adaptation results on CIFAR10-C\n(8.5% error), ImageNet-C (22.0% mCE), ImageNet-R (17.4% error) and ImageNet-A\n(14.8% error), theoretically study the dynamics of self-supervised adaptation\nmethods and propose a new classification dataset (ImageNet-D) which is\nchallenging even with adaptation.\n","authors":["Evgenia Rusak","Steffen Schneider","George Pachitariu","Luisa Eck","Peter Gehler","Oliver Bringmann","Wieland Brendel","Matthias Bethge"],"pdf_url":"https://arxiv.org/pdf/2104.12928v4.pdf","comment":"Web: https://domainadaptation.org/selflearning"},{"id":"http://arxiv.org/abs/2312.04479v1","updated":"2023-12-07T17:53:02Z","published":"2023-12-07T17:53:02Z","title":"GSGFormer: Generative Social Graph Transformer for Multimodal Pedestrian\n Trajectory Prediction","summary":" Pedestrian trajectory prediction, vital for self-driving cars and\nsocially-aware robots, is complicated due to intricate interactions between\npedestrians, their environment, and other Vulnerable Road Users. This paper\npresents GSGFormer, an innovative generative model adept at predicting\npedestrian trajectories by considering these complex interactions and offering\na plethora of potential modal behaviors. We incorporate a heterogeneous graph\nneural network to capture interactions between pedestrians, semantic maps, and\npotential destinations. The Transformer module extracts temporal features,\nwhile our novel CVAE-Residual-GMM module promotes diverse behavioral modality\ngeneration. Through evaluations on multiple public datasets, GSGFormer not only\noutperforms leading methods with ample data but also remains competitive when\ndata is limited.\n","authors":["Zhongchang Luo","Marion Robin","Pavan Vasishta"],"pdf_url":"https://arxiv.org/pdf/2312.04479v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.00971v2","updated":"2023-12-07T17:49:30Z","published":"2023-06-01T17:58:44Z","title":"ViCo: Plug-and-play Visual Condition for Personalized Text-to-image\n Generation","summary":" Personalized text-to-image generation using diffusion models has recently\nemerged and garnered significant interest. 
This task learns a novel concept\n(e.g., a unique toy), illustrated in a handful of images, into a generative\nmodel that captures fine visual details and generates photorealistic images\nbased on textual embeddings. In this paper, we present ViCo, a novel\nlightweight plug-and-play method that seamlessly integrates visual condition\ninto personalized text-to-image generation. ViCo stands out for its unique\nfeature of not requiring any fine-tuning of the original diffusion model\nparameters, thereby facilitating more flexible and scalable model deployment.\nThis key advantage distinguishes ViCo from most existing models that\nnecessitate partial or full diffusion fine-tuning. ViCo incorporates an image\nattention module that conditions the diffusion process on patch-wise visual\nsemantics, and an attention-based object mask that comes at no extra cost from\nthe attention module. Despite only requiring light parameter training (~6%\ncompared to the diffusion U-Net), ViCo delivers performance that is on par\nwith, or even surpasses, all state-of-the-art models, both qualitatively and\nquantitatively. This underscores the efficacy of ViCo, making it a highly\npromising solution for personalized text-to-image generation without the need\nfor diffusion model fine-tuning. Code: https://github.com/haoosz/ViCo\n","authors":["Shaozhe Hao","Kai Han","Shihao Zhao","Kwan-Yee K. Wong"],"pdf_url":"https://arxiv.org/pdf/2306.00971v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2312.04466v1","updated":"2023-12-07T17:39:25Z","published":"2023-12-07T17:39:25Z","title":"Emotional Speech-driven 3D Body Animation via Disentangled Latent\n Diffusion","summary":" Existing methods for synthesizing 3D human gestures from speech have shown\npromising results, but they do not explicitly model the impact of emotions on\nthe generated gestures. Instead, these methods directly output animations from\nspeech without control over the expressed emotion. To address this limitation,\nwe present AMUSE, an emotional speech-driven body animation model based on\nlatent diffusion. Our observation is that content (i.e., gestures related to\nspeech rhythm and word utterances), emotion, and personal style are separable.\nTo account for this, AMUSE maps the driving audio to three disentangled latent\nvectors: one for content, one for emotion, and one for personal style. A latent\ndiffusion model, trained to generate gesture motion sequences, is then\nconditioned on these latent vectors. Once trained, AMUSE synthesizes 3D human\ngestures directly from speech with control over the expressed emotions and\nstyle by combining the content from the driving speech with the emotion and\nstyle of another speech sequence. Randomly sampling the noise of the diffusion\nmodel further generates variations of the gesture with the same emotional\nexpressivity. Qualitative, quantitative, and perceptual evaluations demonstrate\nthat AMUSE outputs realistic gesture sequences. Compared to the state of the\nart, the generated gestures are better synchronized with the speech content and\nbetter represent the emotion expressed by the input speech. Our project website\nis amuse.is.tue.mpg.de.\n","authors":["Kiran Chhatre","Radek Daněček","Nikos Athanasiou","Giorgio Becherini","Christopher Peters","Michael J. 
Black","Timo Bolkart"],"pdf_url":"https://arxiv.org/pdf/2312.04466v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04465v1","updated":"2023-12-07T17:35:49Z","published":"2023-12-07T17:35:49Z","title":"FitDiff: Robust monocular 3D facial shape and reflectance estimation\n using Diffusion Models","summary":" The remarkable progress in 3D face reconstruction has resulted in high-detail\nand photorealistic facial representations. Recently, Diffusion Models have\nrevolutionized the capabilities of generative methods by achieving far better\nperformance than GANs. In this work, we present FitDiff, a diffusion-based 3D\nfacial avatar generative model. This model accurately generates relightable\nfacial avatars, utilizing an identity embedding extracted from an \"in-the-wild\"\n2D facial image. Our multi-modal diffusion model concurrently outputs facial\nreflectance maps (diffuse and specular albedo and normals) and shapes,\nshowcasing great generalization capabilities. It is solely trained on an\nannotated subset of a public facial dataset, paired with 3D reconstructions. We\nrevisit the typical 3D facial fitting approach by guiding a reverse diffusion\nprocess using perceptual and face recognition losses. Being the first LDM\nconditioned on face recognition embeddings, FitDiff reconstructs relightable\nhuman avatars, that can be used as-is in common rendering engines, starting\nonly from an unconstrained facial image, and achieving state-of-the-art\nperformance.\n","authors":["Stathis Galanakis","Alexandros Lattas","Stylianos Moschoglou","Stefanos Zafeiriou"],"pdf_url":"https://arxiv.org/pdf/2312.04465v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04461v1","updated":"2023-12-07T17:32:29Z","published":"2023-12-07T17:32:29Z","title":"PhotoMaker: Customizing Realistic Human Photos via Stacked ID Embedding","summary":" Recent advances in text-to-image generation have made remarkable progress in\nsynthesizing realistic human photos conditioned on given text prompts. However,\nexisting personalized generation methods cannot simultaneously satisfy the\nrequirements of high efficiency, promising identity (ID) fidelity, and flexible\ntext controllability. In this work, we introduce PhotoMaker, an efficient\npersonalized text-to-image generation method, which mainly encodes an arbitrary\nnumber of input ID images into a stack ID embedding for preserving ID\ninformation. Such an embedding, serving as a unified ID representation, can not\nonly encapsulate the characteristics of the same input ID comprehensively, but\nalso accommodate the characteristics of different IDs for subsequent\nintegration. This paves the way for more intriguing and practically valuable\napplications. Besides, to drive the training of our PhotoMaker, we propose an\nID-oriented data construction pipeline to assemble the training data. Under the\nnourishment of the dataset constructed through the proposed pipeline, our\nPhotoMaker demonstrates better ID preservation ability than test-time\nfine-tuning based methods, yet provides significant speed improvements,\nhigh-quality generation results, strong generalization capabilities, and a wide\nrange of applications. 
Our project page is available at\nhttps://photo-maker.github.io/\n","authors":["Zhen Li","Mingdeng Cao","Xintao Wang","Zhongang Qi","Ming-Ming Cheng","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2312.04461v1.pdf","comment":"Tech report; Project page: https://photo-maker.github.io/"},{"id":"http://arxiv.org/abs/2312.04433v1","updated":"2023-12-07T16:57:26Z","published":"2023-12-07T16:57:26Z","title":"DreamVideo: Composing Your Dream Videos with Customized Subject and\n Motion","summary":" Customized generation using diffusion models has made impressive progress in\nimage generation, but remains unsatisfactory in the challenging video\ngeneration task, as it requires the controllability of both subjects and\nmotions. To that end, we present DreamVideo, a novel approach to generating\npersonalized videos from a few static images of the desired subject and a few\nvideos of target motion. DreamVideo decouples this task into two stages,\nsubject learning and motion learning, by leveraging a pre-trained video\ndiffusion model. The subject learning aims to accurately capture the fine\nappearance of the subject from provided images, which is achieved by combining\ntextual inversion and fine-tuning of our carefully designed identity adapter.\nIn motion learning, we architect a motion adapter and fine-tune it on the given\nvideos to effectively model the target motion pattern. Combining these two\nlightweight and efficient adapters allows for flexible customization of any\nsubject with any motion. Extensive experimental results demonstrate the\nsuperior performance of our DreamVideo over the state-of-the-art methods for\ncustomized video generation. Our project page is at\nhttps://dreamvideo-t2v.github.io.\n","authors":["Yujie Wei","Shiwei Zhang","Zhiwu Qing","Hangjie Yuan","Zhiheng Liu","Yu Liu","Yingya Zhang","Jingren Zhou","Hongming Shan"],"pdf_url":"https://arxiv.org/pdf/2312.04433v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04429v1","updated":"2023-12-07T16:55:04Z","published":"2023-12-07T16:55:04Z","title":"Approximate Caching for Efficiently Serving Diffusion Models","summary":" Text-to-image generation using diffusion models has seen explosive popularity\nowing to their ability to produce high quality images adhering to text\nprompts. However, production-grade diffusion model serving is a resource\nintensive task that not only requires high-end GPUs, which are expensive, but also\nincurs considerable latency. In this paper, we introduce a technique called\napproximate-caching that can reduce such iterative denoising steps for an image\ngeneration based on a prompt by reusing intermediate noise states created\nduring a prior image generation for similar prompts. Based on this idea, we\npresent an end-to-end text-to-image system, Nirvana, that uses\napproximate-caching with a novel cache management policy Least Computationally\nBeneficial and Frequently Used (LCBFU) to provide % GPU compute savings, 19.8%\nend-to-end latency reduction and 19% dollar savings, on average, on two real\nproduction workloads. 
We further present an extensive characterization of real\nproduction text-to-image prompts from the perspective of caching, popularity\nand reuse of intermediate states in a large production environment.\n","authors":["Shubham Agarwal","Subrata Mitra","Sarthak Chakraborty","Srikrishna Karanam","Koyel Mukherjee","Shiv Saini"],"pdf_url":"https://arxiv.org/pdf/2312.04429v1.pdf","comment":"Accepted at NSDI'24"},{"id":"http://arxiv.org/abs/2312.04424v1","updated":"2023-12-07T16:49:09Z","published":"2023-12-07T16:49:09Z","title":"Cascade-Zero123: One Image to Highly Consistent 3D with Self-Prompted\n Nearby Views","summary":" Synthesizing multi-view 3D from one single image is a significant and\nchallenging task. For this goal, Zero-1-to-3 methods aim to extend a 2D latent\ndiffusion model to the 3D scope. These approaches generate the target-view\nimage with a single-view source image and the camera pose as condition\ninformation. However, the one-to-one manner adopted in Zero-1-to-3 incurs\nchallenges for building geometric and visual consistency across views,\nespecially for complex objects. We propose a cascade generation framework\nconstructed with two Zero-1-to-3 models, named Cascade-Zero123, to tackle this\nissue, which progressively extracts 3D information from the source image.\nSpecifically, a self-prompting mechanism is designed to generate several nearby\nviews at first. These views are then fed into the second-stage model along with\nthe source image as generation conditions. With self-prompted multiple views as\nthe supplementary information, our Cascade-Zero123 generates more highly\nconsistent novel-view images than Zero-1-to-3. The promotion is significant for\nvarious complex and challenging scenes, involving insects, humans, transparent\nobjects, and stacked multiple objects etc. The project page is at\nhttps://cascadezero123.github.io/.\n","authors":["Yabo Chen","Jiemin Fang","Yuyang Huang","Taoran Yi","Xiaopeng Zhang","Lingxi Xie","Xinggang Wang","Wenrui Dai","Hongkai Xiong","Qi Tian"],"pdf_url":"https://arxiv.org/pdf/2312.04424v1.pdf","comment":"Project page: https://cascadezero123.github.io/"},{"id":"http://arxiv.org/abs/2303.17859v4","updated":"2023-12-07T16:44:44Z","published":"2023-03-31T07:39:12Z","title":"MapFormer: Boosting Change Detection by Using Pre-change Information","summary":" Change detection in remote sensing imagery is essential for a variety of\napplications such as urban planning, disaster management, and climate research.\nHowever, existing methods for identifying semantically changed areas overlook\nthe availability of semantic information in the form of existing maps\ndescribing features of the earth's surface. In this paper, we leverage this\ninformation for change detection in bi-temporal images. We show that the simple\nintegration of the additional information via concatenation of latent\nrepresentations suffices to significantly outperform state-of-the-art change\ndetection methods. Motivated by this observation, we propose the new task of\n*Conditional Change Detection*, where pre-change semantic information is used\nas input next to bi-temporal images. To fully exploit the extra information, we\npropose *MapFormer*, a novel architecture based on a multi-modal feature fusion\nmodule that allows for feature processing conditioned on the available semantic\ninformation. We further employ a supervised, cross-modal contrastive loss to\nguide the learning of visual representations. 
Our approach outperforms existing\nchange detection methods by an absolute 11.7\\% and 18.4\\% in terms of binary\nchange IoU on DynamicEarthNet and HRSCD, respectively. Furthermore, we\ndemonstrate the robustness of our approach to the quality of the pre-change\nsemantic information and the absence of pre-change imagery. The code is available\nat https://github.com/mxbh/mapformer.\n","authors":["Maximilian Bernhard","Niklas Strauß","Matthias Schubert"],"pdf_url":"https://arxiv.org/pdf/2303.17859v4.pdf","comment":"accepted at ICCV 2023"},{"id":"http://arxiv.org/abs/2301.00752v4","updated":"2023-12-07T16:42:32Z","published":"2023-01-02T16:51:40Z","title":"Point Cloud-based Proactive Link Quality Prediction for Millimeter-wave\n Communications","summary":" This study demonstrates the feasibility of point cloud-based proactive link\nquality prediction for millimeter-wave (mmWave) communications. Previous\nstudies have proposed machine learning-based methods to predict received signal\nstrength for future time periods using time series of depth images to mitigate\nthe line-of-sight (LOS) path blockage by pedestrians in mmWave communication.\nHowever, these image-based methods have limited applicability due to privacy\nconcerns as camera images may contain sensitive information. This study\nproposes a point cloud-based method for mmWave link quality prediction and\ndemonstrates its feasibility through experiments. Point clouds represent\nthree-dimensional (3D) spaces as a set of points and are sparser and less\nlikely to contain sensitive information than camera images. Additionally, point\nclouds provide 3D position and motion information, which is necessary for\nunderstanding the radio propagation environment involving pedestrians. This\nstudy designs the mmWave link quality prediction method and conducts realistic\nindoor experiments, where the link quality fluctuates significantly due to\nhuman blockage, using commercially available IEEE 802.11ad-based 60 GHz\nwireless LAN devices and Kinect v2 RGB-D camera and Velodyne VLP-16 light\ndetection and ranging (LiDAR) for point cloud acquisition. The experimental\nresults showed that our proposed method can predict future large attenuation of\nmmWave received signal strength and throughput induced by the LOS path blockage\nby pedestrians with comparable or superior accuracy to image-based prediction\nmethods. Hence, our point cloud-based method can serve as a viable alternative\nto image-based methods.\n","authors":["Shoki Ohta","Takayuki Nishio","Riichi Kudo","Kahoko Takahashi","Hisashi Nagata"],"pdf_url":"https://arxiv.org/pdf/2301.00752v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.07207v5","updated":"2023-12-07T16:36:15Z","published":"2022-12-14T13:10:27Z","title":"MAELi: Masked Autoencoder for Large-Scale LiDAR Point Clouds","summary":" The sensing process of large-scale LiDAR point clouds inevitably causes large\nblind spots, i.e. regions not visible to the sensor. We demonstrate how these\ninherent sampling properties can be effectively utilized for self-supervised\nrepresentation learning by designing a highly effective pre-training framework\nthat considerably reduces the need for tedious 3D annotations to train\nstate-of-the-art object detectors. Our Masked AutoEncoder for LiDAR point\nclouds (MAELi) intuitively leverages the sparsity of LiDAR point clouds in both\nthe encoder and decoder during reconstruction. 
This results in more expressive\nand useful initialization, which can be directly applied to downstream\nperception tasks, such as 3D object detection or semantic segmentation for\nautonomous driving. In a novel reconstruction approach, MAELi distinguishes\nbetween empty and occluded space and employs a new masking strategy that\ntargets the LiDAR's inherent spherical projection. Thereby, without any ground\ntruth whatsoever and trained on single frames only, MAELi obtains an\nunderstanding of the underlying 3D scene geometry and semantics. To demonstrate\nthe potential of MAELi, we pre-train backbones in an end-to-end manner and show\nthe effectiveness of our unsupervised pre-trained weights on the tasks of 3D\nobject detection and semantic segmentation.\n","authors":["Georg Krispel","David Schinagl","Christian Fruhwirth-Reisinger","Horst Possegger","Horst Bischof"],"pdf_url":"https://arxiv.org/pdf/2212.07207v5.pdf","comment":"Accepted to WACV 2024, 16 pages"},{"id":"http://arxiv.org/abs/2312.04410v1","updated":"2023-12-07T16:26:23Z","published":"2023-12-07T16:26:23Z","title":"Smooth Diffusion: Crafting Smooth Latent Spaces in Diffusion Models","summary":" Recently, diffusion models have made remarkable progress in text-to-image\n(T2I) generation, synthesizing images with high fidelity and diverse contents.\nDespite this advancement, latent space smoothness within diffusion models\nremains largely unexplored. Smooth latent spaces ensure that a perturbation on\nan input latent corresponds to a steady change in the output image. This\nproperty proves beneficial in downstream tasks, including image interpolation,\ninversion, and editing. In this work, we expose the non-smoothness of diffusion\nlatent spaces by observing noticeable visual fluctuations resulting from minor\nlatent variations. To tackle this issue, we propose Smooth Diffusion, a new\ncategory of diffusion models that can be simultaneously high-performing and\nsmooth. Specifically, we introduce Step-wise Variation Regularization to\nenforce the proportion between the variations of an arbitrary input latent and\nthat of the output image is a constant at any diffusion training step. In\naddition, we devise an interpolation standard deviation (ISTD) metric to\neffectively assess the latent space smoothness of a diffusion model. Extensive\nquantitative and qualitative experiments demonstrate that Smooth Diffusion\nstands out as a more desirable solution not only in T2I generation but also\nacross various downstream tasks. Smooth Diffusion is implemented as a\nplug-and-play Smooth-LoRA to work with various community models. Code is\navailable at https://github.com/SHI-Labs/Smooth-Diffusion.\n","authors":["Jiayi Guo","Xingqian Xu","Yifan Pu","Zanlin Ni","Chaofei Wang","Manushree Vasu","Shiji Song","Gao Huang","Humphrey Shi"],"pdf_url":"https://arxiv.org/pdf/2312.04410v1.pdf","comment":"GitHub: https://github.com/SHI-Labs/Smooth-Diffusion"},{"id":"http://arxiv.org/abs/2312.04403v1","updated":"2023-12-07T16:16:50Z","published":"2023-12-07T16:16:50Z","title":"OT-Attack: Enhancing Adversarial Transferability of Vision-Language\n Models via Optimal Transport Optimization","summary":" Vision-language pre-training (VLP) models demonstrate impressive abilities in\nprocessing both images and text. However, they are vulnerable to multi-modal\nadversarial examples (AEs). Investigating the generation of\nhigh-transferability adversarial examples is crucial for uncovering VLP models'\nvulnerabilities in practical scenarios. 
Recent works have indicated that\nleveraging data augmentation and image-text modal interactions can enhance the\ntransferability of adversarial examples for VLP models significantly. However,\nthey do not consider the optimal alignment problem between data-augmented\nimage-text pairs. This oversight leads to adversarial examples that are overly\ntailored to the source model, thus limiting improvements in transferability. In\nour research, we first explore the interplay between image sets produced\nthrough data augmentation and their corresponding text sets. We find that\naugmented image samples can align optimally with certain texts while exhibiting\nless relevance to others. Motivated by this, we propose an Optimal\nTransport-based Adversarial Attack, dubbed OT-Attack. The proposed method\nformulates the features of image and text sets as two distinct distributions\nand employs optimal transport theory to determine the most efficient mapping\nbetween them. This optimal mapping informs our generation of adversarial\nexamples to effectively counteract the overfitting issues. Extensive\nexperiments across various network architectures and datasets in image-text\nmatching tasks reveal that our OT-Attack outperforms existing state-of-the-art\nmethods in terms of adversarial transferability.\n","authors":["Dongchen Han","Xiaojun Jia","Yang Bai","Jindong Gu","Yang Liu","Xiaochun Cao"],"pdf_url":"https://arxiv.org/pdf/2312.04403v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04398v1","updated":"2023-12-07T16:10:10Z","published":"2023-12-07T16:10:10Z","title":"Intelligent Anomaly Detection for Lane Rendering Using Transformer with\n Self-Supervised Pre-Training and Customized Fine-Tuning","summary":" The burgeoning navigation services using digital maps provide great\nconvenience to drivers. Nevertheless, the presence of anomalies in lane\nrendering map images occasionally introduces potential hazards, as such\nanomalies can be misleading to human drivers and consequently contribute to\nunsafe driving conditions. In response to this concern and to accurately and\neffectively detect the anomalies, this paper transforms lane rendering image\nanomaly detection into a classification problem and proposes a four-phase\npipeline consisting of data pre-processing, self-supervised pre-training with\nthe masked image modeling (MiM) method, customized fine-tuning using\ncross-entropy based loss with label smoothing, and post-processing to tackle it\nleveraging state-of-the-art deep learning techniques, especially those\ninvolving Transformer models. Various experiments verify the effectiveness of\nthe proposed pipeline. Results indicate that the proposed pipeline exhibits\nsuperior performance in lane rendering image anomaly detection, and notably,\nthe self-supervised pre-training with MiM can greatly enhance the detection\naccuracy while significantly reducing the total training time. For instance,\nemploying the Swin Transformer with Uniform Masking as self-supervised\npre-training (Swin-Trans-UM) yielded a heightened accuracy of 94.77% and an\nimproved Area Under The Curve (AUC) score of 0.9743 compared with the pure Swin\nTransformer without pre-training (Swin-Trans) with an accuracy of 94.01% and an\nAUC of 0.9498. The fine-tuning epochs were dramatically reduced to 41 from the\noriginal 280. 
In conclusion, the proposed pipeline, with its incorporation of\nself-supervised pre-training using MiM and other advanced deep learning\ntechniques, emerges as a robust solution for enhancing the accuracy and\nefficiency of lane rendering image anomaly detection in digital navigation\nsystems.\n","authors":["Yongqi Dong","Xingmin Lu","Ruohan Li","Wei Song","Bart van Arem","Haneen Farah"],"pdf_url":"https://arxiv.org/pdf/2312.04398v1.pdf","comment":"20 pages, 6 figures, accepted by the 103rd Transportation Research\n Board (TRB) Annual Meeting, under review by Transportation Research Record:\n Journal of the Transportation Research Board"},{"id":"http://arxiv.org/abs/2312.04393v1","updated":"2023-12-07T16:06:31Z","published":"2023-12-07T16:06:31Z","title":"PhysHOI: Physics-Based Imitation of Dynamic Human-Object Interaction","summary":" Humans interact with objects all the time. Enabling a humanoid to learn\nhuman-object interaction (HOI) is a key step for future smart animation and\nintelligent robotics systems. However, recent progress in physics-based HOI\nrequires carefully designed task-specific rewards, making the system unscalable\nand labor-intensive. This work focuses on dynamic HOI imitation: teaching\nhumanoid dynamic interaction skills through imitating kinematic HOI\ndemonstrations. It is quite challenging because of the complexity of the\ninteraction between body parts and objects and the lack of dynamic HOI data. To\nhandle the above issues, we present PhysHOI, the first physics-based whole-body\nHOI imitation approach without task-specific reward designs. Except for the\nkinematic HOI representations of humans and objects, we introduce the contact\ngraph to model the contact relations between body parts and objects explicitly.\nA contact graph reward is also designed, which proved to be critical for\nprecise HOI imitation. Based on the key designs, PhysHOI can imitate diverse\nHOI tasks simply yet effectively without prior knowledge. To make up for the\nlack of dynamic HOI scenarios in this area, we introduce the BallPlay dataset\nthat contains eight whole-body basketball skills. We validate PhysHOI on\ndiverse HOI tasks, including whole-body grasping and basketball skills.\n","authors":["Yinhuai Wang","Jing Lin","Ailing Zeng","Zhengyi Luo","Jian Zhang","Lei Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.04393v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04385v1","updated":"2023-12-07T15:55:31Z","published":"2023-12-07T15:55:31Z","title":"AniRes2D: Anisotropic Residual-enhanced Diffusion for 2D MR\n Super-Resolution","summary":" Anisotropic low-resolution (LR) magnetic resonance (MR) images are fast to\nobtain but hinder automated processing. We propose to use denoising diffusion\nprobabilistic models (DDPMs) to super-resolve these 2D-acquired LR MR slices.\nThis paper introduces AniRes2D, a novel approach combining DDPM with a residual\nprediction for 2D super-resolution (SR). Results demonstrate that AniRes2D\noutperforms several other DDPM-based models in quantitative metrics, visual\nquality, and out-of-domain evaluation. We use a trained AniRes2D to\nsuper-resolve 3D volumes slice by slice, where comparative quantitative results\nand reduced skull aliasing are achieved compared to a recent state-of-the-art\nself-supervised 3D super-resolution method. Furthermore, we explored the use of\nnoise conditioning augmentation (NCA) as an alternative augmentation technique\nfor DDPM-based SR models, but it was found to reduce performance. 
Our findings\ncontribute valuable insights to the application of DDPMs for SR of anisotropic\nMR images.\n","authors":["Zejun Wu","Samuel W. Remedios","Blake E. Dewey","Aaron Carass","Jerry L. Prince"],"pdf_url":"https://arxiv.org/pdf/2312.04385v1.pdf","comment":"Accepted for presentation at SPIE Medical Imaging 2024, Clinical and\n Biomedical Imaging"},{"id":"http://arxiv.org/abs/2312.04369v1","updated":"2023-12-07T15:40:36Z","published":"2023-12-07T15:40:36Z","title":"SingingHead: A Large-scale 4D Dataset for Singing Head Animation","summary":" Singing, as a common facial movement second only to talking, can be regarded\nas a universal language across ethnicities and cultures, and plays an important\nrole in emotional communication, art, and entertainment. However, it is often\noverlooked in the field of audio-driven facial animation due to the lack of\nsinging head datasets and the domain gap between singing and talking in rhythm\nand amplitude. To this end, we collect a high-quality large-scale singing head\ndataset, SingingHead, which consists of more than 27 hours of synchronized\nsinging video, 3D facial motion, singing audio, and background music from 76\nindividuals and 8 types of music. Along with the SingingHead dataset, we argue\nthat 3D and 2D facial animation tasks can be solved together, and propose a\nunified singing facial animation framework named UniSinger to achieve both\nsinging audio-driven 3D singing head animation and 2D singing portrait video\nsynthesis. Extensive comparative experiments with both SOTA 3D facial animation\nand 2D portrait animation methods demonstrate the necessity of singing-specific\ndatasets in singing head animation tasks and the promising performance of our\nunified facial animation framework.\n","authors":["Sijing Wu","Yunhao Li","Weitian Zhang","Jun Jia","Yucheng Zhu","Yichao Yan","Guangtao Zhai"],"pdf_url":"https://arxiv.org/pdf/2312.04369v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04364v1","updated":"2023-12-07T15:35:42Z","published":"2023-12-07T15:35:42Z","title":"DemoCaricature: Democratising Caricature Generation with a Rough Sketch","summary":" In this paper, we democratise caricature generation, empowering individuals\nto effortlessly craft personalised caricatures with just a photo and a\nconceptual sketch. Our objective is to strike a delicate balance between\nabstraction and identity, while preserving the creativity and subjectivity\ninherent in a sketch. To achieve this, we present Explicit Rank-1 Model Editing\nalongside single-image personalisation, selectively applying nuanced edits to\ncross-attention layers for a seamless merge of identity and style.\nAdditionally, we propose Random Mask Reconstruction to enhance robustness,\ndirecting the model to focus on distinctive identity and style features.\nCrucially, our aim is not to replace artists but to eliminate accessibility\nbarriers, allowing enthusiasts to engage in the artistry.\n","authors":["Dar-Yen Chen","Subhadeep Koley","Aneeshan Sain","Pinaki Nath Chowdhury","Tao Xiang","Ayan Kumar Bhunia","Yi-Zhe Song"],"pdf_url":"https://arxiv.org/pdf/2312.04364v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.19784v2","updated":"2023-12-07T15:22:07Z","published":"2023-10-30T17:50:14Z","title":"CustomNet: Zero-shot Object Customization with Variable-Viewpoints in\n Text-to-Image Diffusion Models","summary":" Incorporating a customized object into image generation presents an\nattractive feature in text-to-image generation. 
However, existing\noptimization-based and encoder-based methods are hindered by drawbacks such as\ntime-consuming optimization, insufficient identity preservation, and a\nprevalent copy-pasting effect. To overcome these limitations, we introduce\nCustomNet, a novel object customization approach that explicitly incorporates\n3D novel view synthesis capabilities into the object customization process.\nThis integration facilitates the adjustment of spatial position relationships\nand viewpoints, yielding diverse outputs while effectively preserving object\nidentity. Moreover, we introduce delicate designs to enable location control\nand flexible background control through textual descriptions or specific\nuser-defined images, overcoming the limitations of existing 3D novel view\nsynthesis methods. We further leverage a dataset construction pipeline that can\nbetter handle real-world objects and complex backgrounds. Equipped with these\ndesigns, our method facilitates zero-shot object customization without\ntest-time optimization, offering simultaneous control over the viewpoints,\nlocation, and background. As a result, our CustomNet ensures enhanced identity\npreservation and generates diverse, harmonious outputs.\n","authors":["Ziyang Yuan","Mingdeng Cao","Xintao Wang","Zhongang Qi","Chun Yuan","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2310.19784v2.pdf","comment":"Project webpage available at https://jiangyzy.github.io/CustomNet/"},{"id":"http://arxiv.org/abs/2312.04344v1","updated":"2023-12-07T15:05:59Z","published":"2023-12-07T15:05:59Z","title":"Enhancing Medical Task Performance in GPT-4V: A Comprehensive Study on\n Prompt Engineering Strategies","summary":" OpenAI's latest large vision-language model (LVLM), GPT-4V(ision), has piqued\nconsiderable interest for its potential in medical applications. Despite its\npromise, recent studies and internal reviews highlight its underperformance in\nspecialized medical tasks. This paper explores the boundary of GPT-4V's\ncapabilities in medicine, particularly in processing complex imaging data from\nendoscopies, CT scans, and MRIs etc. Leveraging open-source datasets, we\nassessed its foundational competencies, identifying substantial areas for\nenhancement. Our research emphasizes prompt engineering, an often-underutilized\nstrategy for improving AI responsiveness. Through iterative testing, we refined\nthe model's prompts, significantly improving its interpretative accuracy and\nrelevance in medical imaging. From our comprehensive evaluations, we distilled\n10 effective prompt engineering techniques, each fortifying GPT-4V's medical\nacumen. These methodical enhancements facilitate more reliable, precise, and\nclinically valuable insights from GPT-4V, advancing its operability in critical\nhealthcare environments. 
Our findings are pivotal for those employing AI in\nmedicine, providing clear, actionable guidance on harnessing GPT-4V's full\ndiagnostic potential.\n","authors":["Pengcheng Chen","Ziyan Huang","Zhongying Deng","Tianbin Li","Yanzhou Su","Haoyu Wang","Jin Ye","Yu Qiao","Junjun He"],"pdf_url":"https://arxiv.org/pdf/2312.04344v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04337v1","updated":"2023-12-07T14:55:13Z","published":"2023-12-07T14:55:13Z","title":"Multi-View Unsupervised Image Generation with Cross Attention Guidance","summary":" The growing interest in novel view synthesis, driven by Neural Radiance Field\n(NeRF) models, is hindered by scalability issues due to their reliance on\nprecisely annotated multi-view images. Recent models address this by\nfine-tuning large text2image diffusion models on synthetic multi-view data.\nDespite robust zero-shot generalization, they may need post-processing and can\nface quality issues due to the synthetic-real domain gap. This paper introduces\na novel pipeline for unsupervised training of a pose-conditioned diffusion\nmodel on single-category datasets. With the help of pretrained self-supervised\nVision Transformers (DINOv2), we identify object poses by clustering the\ndataset through comparing visibility and locations of specific object parts.\nThe pose-conditioned diffusion model, trained on pose labels, and equipped with\ncross-frame attention at inference time ensures cross-view consistency, that is\nfurther aided by our novel hard-attention guidance. Our model, MIRAGE,\nsurpasses prior work in novel view synthesis on real images. Furthermore,\nMIRAGE is robust to diverse textures and geometries, as demonstrated with our\nexperiments on synthetic images generated with pretrained Stable Diffusion.\n","authors":["Llukman Cerkezi","Aram Davtyan","Sepehr Sameni","Paolo Favaro"],"pdf_url":"https://arxiv.org/pdf/2312.04337v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04334v1","updated":"2023-12-07T14:51:12Z","published":"2023-12-07T14:51:12Z","title":"Towards a Perceptual Evaluation Framework for Lighting Estimation","summary":" Progress in lighting estimation is tracked by computing existing image\nquality assessment (IQA) metrics on images from standard datasets. While this\nmay appear to be a reasonable approach, we demonstrate that doing so does not\ncorrelate to human preference when the estimated lighting is used to relight a\nvirtual scene into a real photograph. To study this, we design a controlled\npsychophysical experiment where human observers must choose their preference\namongst rendered scenes lit using a set of lighting estimation algorithms\nselected from the recent literature, and use it to analyse how these algorithms\nperform according to human perception. Then, we demonstrate that none of the\nmost popular IQA metrics from the literature, taken individually, correctly\nrepresent human perception. Finally, we show that by learning a combination of\nexisting IQA metrics, we can more accurately represent human preference. 
This\nprovides a new perceptual framework to help evaluate future lighting estimation\nalgorithms.\n","authors":["Justine Giroux","Mohammad Reza Karimi Dastjerdi","Yannick Hold-Geoffroy","Javier Vazquez-Corral","Jean-François Lalonde"],"pdf_url":"https://arxiv.org/pdf/2312.04334v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04328v1","updated":"2023-12-07T14:40:05Z","published":"2023-12-07T14:40:05Z","title":"A Multi-scale Information Integration Framework for Infrared and Visible\n Image Fusion","summary":" Infrared and visible image fusion aims at generating a fused image containing\nthe intensity and detail information of source images, and the key issue is\neffectively measuring and integrating the complementary information of\nmulti-modality images from the same scene. Existing methods mostly adopt a\nsimple weight in the loss function to decide the information retention of each\nmodality rather than adaptively measuring complementary information for\ndifferent image pairs. In this study, we propose a multi-scale dual attention\n(MDA) framework for infrared and visible image fusion, which is designed to\nmeasure and integrate complementary information in both structure and loss\nfunction at the image and patch level. In our method, the residual downsample\nblock first decomposes source images into three scales. Then, the dual attention\nfusion block integrates complementary information and generates a spatial and\nchannel attention map at each scale for feature fusion. Finally, the output\nimage is reconstructed by the residual reconstruction block. The loss function\nconsists of three parts at the image, feature, and patch level, of which\nthe image-level and patch-level parts are computed based on the\nweights generated by the complementary information measurement. In addition, to\nconstrain the pixel intensity distribution between the output and infrared\nimage, a style loss is added. Our fusion results are robust and informative\nacross different scenarios. Qualitative and quantitative results on two\ndatasets illustrate that our method is able to preserve both thermal radiation\nand detailed information from two modalities and achieve results comparable\nwith other state-of-the-art methods. Ablation experiments show the\neffectiveness of our information integration architecture and of the adaptive\nmeasurement of complementary information retention in the loss function.\n","authors":["Guang Yang","Jie Li","Hanxiao Lei","Xinbo Gao"],"pdf_url":"https://arxiv.org/pdf/2312.04328v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04326v1","updated":"2023-12-07T14:37:01Z","published":"2023-12-07T14:37:01Z","title":"iDesigner: A High-Resolution and Complex-Prompt Following Text-to-Image\n Diffusion Model for Interior Design","summary":" With the open-sourcing of text-to-image models (T2I) such as stable diffusion\n(SD) and stable diffusion XL (SD-XL), there is an influx of models fine-tuned\nin specific domains based on the open-source SD model, such as in anime,\ncharacter portraits, etc. However, there are few specialized models in certain\ndomains, such as interior design, which is attributed to the complex textual\ndescriptions and detailed visual elements inherent in design, alongside the\nnecessity for adaptable resolution. Therefore, text-to-image models for\ninterior design are required to have outstanding prompt-following capabilities,\nas well as iterative collaboration with design professionals to achieve the\ndesired outcome. 
In this paper, we collect and optimize text-image data in the\ndesign field and continue training in both English and Chinese on the basis of\nthe open-source CLIP model. We also proposed a fine-tuning strategy with\ncurriculum learning and reinforcement learning from CLIP feedback to enhance\nthe prompt-following capabilities of our approach so as to improve the quality\nof image generation. The experimental results on the collected dataset\ndemonstrate the effectiveness of the proposed approach, which achieves\nimpressive results and outperforms strong baselines.\n","authors":["Ruyi Gan","Xiaojun Wu","Junyu Lu","Yuanhe Tian","Dixiang Zhang","Ziwei Wu","Renliang Sun","Chang Liu","Jiaxing Zhang","Pingjian Zhang","Yan Song"],"pdf_url":"https://arxiv.org/pdf/2312.04326v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.06641v2","updated":"2023-12-07T14:24:52Z","published":"2023-10-10T14:04:32Z","title":"How (not) to ensemble LVLMs for VQA","summary":" This paper studies ensembling in the era of Large Vision-Language Models\n(LVLMs). Ensembling is a classical method to combine different models to get\nincreased performance. In the recent work on Encyclopedic-VQA the authors\nexamine a wide variety of models to solve their task: from vanilla LVLMs, to\nmodels including the caption as extra context, to models augmented with\nLens-based retrieval of Wikipedia pages. Intuitively these models are highly\ncomplementary, which should make them ideal for ensembling. Indeed, an oracle\nexperiment shows potential gains from 48.8% accuracy (the best single model)\nall the way up to 67% (best possible ensemble). So it is a trivial exercise to\ncreate an ensemble with substantial real gains. Or is it?\n","authors":["Lisa Alazraki","Lluis Castrejon","Mostafa Dehghani","Fantine Huot","Jasper Uijlings","Thomas Mensink"],"pdf_url":"https://arxiv.org/pdf/2310.06641v2.pdf","comment":"4th I Can't Believe It's Not Better Workshop (co-located with NeurIPS\n 2023)"},{"id":"http://arxiv.org/abs/2312.04316v1","updated":"2023-12-07T14:17:17Z","published":"2023-12-07T14:17:17Z","title":"Towards Knowledge-driven Autonomous Driving","summary":" This paper explores the emerging knowledge-driven autonomous driving\ntechnologies. Our investigation highlights the limitations of current\nautonomous driving systems, in particular their sensitivity to data bias,\ndifficulty in handling long-tail scenarios, and lack of interpretability.\nConversely, knowledge-driven methods with the abilities of cognition,\ngeneralization and life-long learning emerge as a promising way to overcome\nthese challenges. This paper delves into the essence of knowledge-driven\nautonomous driving and examines its core components: dataset \\& benchmark,\nenvironment, and driver agent. By leveraging large language models, world\nmodels, neural rendering, and other advanced artificial intelligence\ntechniques, these components collectively contribute to a more holistic,\nadaptive, and intelligent autonomous driving system. The paper systematically\norganizes and reviews previous research efforts in this area, and provides\ninsights and guidance for future research and practical applications of\nautonomous driving. 
We will continually share the latest updates on\ncutting-edge developments in knowledge-driven autonomous driving along with the\nrelevant valuable open-source resources at:\n\\url{https://github.com/PJLab-ADG/awesome-knowledge-driven-AD}.\n","authors":["Xin Li","Yeqi Bai","Pinlong Cai","Licheng Wen","Daocheng Fu","Bo Zhang","Xuemeng Yang","Xinyu Cai","Tao Ma","Jianfei Guo","Xing Gao","Min Dou","Botian Shi","Yong Liu","Liang He","Yu Qiao"],"pdf_url":"https://arxiv.org/pdf/2312.04316v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04314v1","updated":"2023-12-07T14:11:00Z","published":"2023-12-07T14:11:00Z","title":"GPT4SGG: Synthesizing Scene Graphs from Holistic and Region-specific\n Narratives","summary":" Learning scene graphs from natural language descriptions has proven to be a\ncheap and promising scheme for Scene Graph Generation (SGG). However, such\nunstructured caption data and its processing hamper the learning of an\naccurate and complete scene graph. This dilemma can be summarized in three\npoints. First, traditional language parsers often fail to extract meaningful\nrelationship triplets from caption data. Second, grounding unlocalized objects\nin parsed triplets will meet ambiguity in visual-language alignment. Last,\ncaption data are typically sparse and exhibit bias toward partial observations of\nimage content. These three issues make it hard for the model to generate\ncomprehensive and accurate scene graphs. To fill this gap, we propose a simple\nyet effective framework, GPT4SGG, to synthesize scene graphs from holistic and\nregion-specific narratives. The framework discards the traditional language parser\nand localizes objects before obtaining relationship triplets. To obtain\nrelationship triplets, holistic and dense region-specific narratives are\ngenerated from the image. With such a textual representation of image data and a\ntask-specific prompt, an LLM, particularly GPT-4, directly synthesizes a scene\ngraph as \"pseudo labels\". Experimental results show that GPT4SGG significantly\nimproves the performance of SGG models trained on image-caption data. We\nbelieve this pioneering work can motivate further research into mining the\nvisual reasoning capabilities of LLMs.\n","authors":["Zuyao Chen","Jinlin Wu","Zhen Lei","Zhaoxiang Zhang","Changwen Chen"],"pdf_url":"https://arxiv.org/pdf/2312.04314v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04302v1","updated":"2023-12-07T13:53:29Z","published":"2023-12-07T13:53:29Z","title":"Prompt Highlighter: Interactive Control for Multi-Modal LLMs","summary":" This study targets a critical aspect of multi-modal LLMs' (LLMs&VLMs)\ninference: explicit controllable text generation. Multi-modal LLMs empower\nmulti-modality understanding with the capability of semantic generation yet\nbring less explainability and heavier reliance on prompt contents due to their\nautoregressive generative nature. While manipulating prompt formats could\nimprove outputs, designing specific and precise prompts per task can be\nchallenging and ineffective. To tackle this issue, we introduce a novel\ninference method, Prompt Highlighter, which enables users to highlight specific\nprompt spans to interactively control the focus during generation. 
Motivated by\nthe classifier-free diffusion guidance, we form regular and unconditional\ncontext pairs based on highlighted tokens, demonstrating that the\nautoregressive generation in models can be guided in a classifier-free way.\nNotably, we find that, during inference, guiding the models with highlighted\ntokens through the attention weights leads to more desired outputs. Our\napproach is compatible with current LLMs and VLMs, achieving impressive\ncustomized generation results without training. Experiments confirm its\neffectiveness in focusing on input contexts and generating reliable content.\nWithout tuning on LLaVA-v1.5, our method secured 69.5 in the MMBench test and\n1552.5 in MME-perception. The code is available at:\nhttps://github.com/dvlab-research/Prompt-Highlighter/\n","authors":["Yuechen Zhang","Shengju Qian","Bohao Peng","Shu Liu","Jiaya Jia"],"pdf_url":"https://arxiv.org/pdf/2312.04302v1.pdf","comment":"19 pages. Project Page:\n https://julianjuaner.github.io/projects/PromptHighlighter"},{"id":"http://arxiv.org/abs/2312.04296v1","updated":"2023-12-07T13:40:20Z","published":"2023-12-07T13:40:20Z","title":"Cross-codex Learning for Reliable Scribe Identification in Medieval\n Manuscripts","summary":" Historic scribe identification is a substantial task for obtaining\ninformation about the past. Uniform script styles, such as the Carolingian\nminuscule, make it a difficult task for classification to focus on meaningful\nfeatures. Therefore, we demonstrate in this paper the importance of cross-codex\ntraining data for CNN based text-independent off-line scribe identification, to\novercome codex dependent overfitting. We report three main findings: First, we\nfound that preprocessing with masked grayscale images instead of RGB images\nclearly increased the F1-score of the classification results. Second, we\ntrained different neural networks on our complex data, validating time and\naccuracy differences in order to define the most reliable network architecture.\nWith AlexNet, the network with the best trade-off between F1-score and time, we\nachieved for individual classes F1-scores of up to 0,96 on line level and up to\n1.0 on page level in classification. Third, we could replicate the finding that\nthe CNN output can be further improved by implementing a reject option, giving\nmore stable results. We present the results on our large scale open source\ndataset -- the Codex Claustroneoburgensis database (CCl-DB) -- containing a\nsignificant number of writings from different scribes in several codices. We\ndemonstrate for the first time on a dataset with such a variety of codices that\npaleographic decisions can be reproduced automatically and precisely with CNNs.\nThis gives manifold new and fast possibilities for paleographers to gain\ninsights into unlabeled material, but also to develop further hypotheses.\n","authors":["Julius Weißmann","Markus Seidl","Anya Dietrich","Martin Haltrich"],"pdf_url":"https://arxiv.org/pdf/2312.04296v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04293v1","updated":"2023-12-07T13:27:37Z","published":"2023-12-07T13:27:37Z","title":"GPT-4V with Emotion: A Zero-shot Benchmark for Multimodal Emotion\n Understanding","summary":" Recently, GPT-4 with Vision (GPT-4V) has shown remarkable performance across\nvarious multimodal tasks. However, its efficacy in emotion recognition remains\na question. 
This paper quantitatively evaluates GPT-4V's capabilities in\nmultimodal emotion understanding, encompassing tasks such as facial emotion\nrecognition, visual sentiment analysis, micro-expression recognition, dynamic\nfacial emotion recognition, and multimodal emotion recognition. Our experiments\nshow that GPT-4V exhibits impressive multimodal and temporal understanding\ncapabilities, even surpassing supervised systems in some tasks. Despite these\nachievements, GPT-4V is currently tailored for general domains. It performs\npoorly in micro-expression recognition that requires specialized expertise. The\nmain purpose of this paper is to present quantitative results of GPT-4V on\nemotion understanding and establish a zero-shot benchmark for future research.\nCode and evaluation results are available at:\nhttps://github.com/zeroQiaoba/gpt4v-emotion.\n","authors":["Zheng Lian","Licai Sun","Haiyang Sun","Kang Chen","Zhuofan Wen","Hao Gu","Shun Chen","Bin Liu","Jianhua Tao"],"pdf_url":"https://arxiv.org/pdf/2312.04293v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.15250v2","updated":"2023-12-07T13:00:57Z","published":"2023-07-28T01:20:12Z","title":"D2S: Representing local descriptors and global scene coordinates for\n camera relocalization","summary":" State-of-the-art visual localization methods mostly rely on complex\nprocedures to match local descriptors and 3D point clouds. However, these\nprocedures can incur significant cost in terms of inference, storage, and\nupdates over time. In this study, we propose a direct learning-based approach\nthat utilizes a simple network named D2S to represent local descriptors and\ntheir scene coordinates. Our method is characterized by its simplicity and\ncost-effectiveness. It solely leverages a single RGB image for localization\nduring the testing phase and only requires a lightweight model to encode a\ncomplex sparse scene. The proposed D2S employs a combination of a simple loss\nfunction and graph attention to selectively focus on robust descriptors while\ndisregarding areas such as clouds, trees, and several dynamic objects. This\nselective attention enables D2S to effectively perform a binary-semantic\nclassification for sparse descriptors. Additionally, we propose a new outdoor\ndataset to evaluate the capabilities of visual localization methods in terms of\nscene generalization and self-updating from unlabeled observations. Our\napproach outperforms the state-of-the-art CNN-based methods in scene coordinate\nregression in indoor and outdoor environments. It demonstrates the ability to\ngeneralize beyond training data, including scenarios involving transitions from\nday to night and adapting to domain shifts, even in the absence of the labeled\ndata sources. The source code, trained models, dataset, and demo videos are\navailable at the following link: https://thpjp.github.io/d2s\n","authors":["Bach-Thuan Bui","Dinh-Tuan Tran","Joo-Ho Lee"],"pdf_url":"https://arxiv.org/pdf/2307.15250v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03461v2","updated":"2023-12-07T12:46:07Z","published":"2023-12-06T12:36:53Z","title":"HiFi4G: High-Fidelity Human Performance Rendering via Compact Gaussian\n Splatting","summary":" We have recently seen tremendous progress in photo-real human modeling and\nrendering. Yet, efficiently rendering realistic human performance and\nintegrating it into the rasterization pipeline remains challenging. 
In this\npaper, we present HiFi4G, an explicit and compact Gaussian-based approach for\nhigh-fidelity human performance rendering from dense footage. Our core\nintuition is to marry the 3D Gaussian representation with non-rigid tracking,\nachieving a compact and compression-friendly representation. We first propose a\ndual-graph mechanism to obtain motion priors, with a coarse deformation graph\nfor effective initialization and a fine-grained Gaussian graph to enforce\nsubsequent constraints. Then, we utilize a 4D Gaussian optimization scheme with\nadaptive spatial-temporal regularizers to effectively balance the non-rigid\nprior and Gaussian updating. We also present a companion compression scheme\nwith residual compensation for immersive experiences on various platforms. It\nachieves a substantial compression rate of approximately 25 times, with less\nthan 2MB of storage per frame. Extensive experiments demonstrate the\neffectiveness of our approach, which significantly outperforms existing\napproaches in terms of optimization speed, rendering quality, and storage\noverhead.\n","authors":["Yuheng Jiang","Zhehao Shen","Penghao Wang","Zhuo Su","Yu Hong","Yingliang Zhang","Jingyi Yu","Lan Xu"],"pdf_url":"https://arxiv.org/pdf/2312.03461v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04266v1","updated":"2023-12-07T12:45:33Z","published":"2023-12-07T12:45:33Z","title":"Activity Grammars for Temporal Action Segmentation","summary":" Sequence prediction on temporal data requires the ability to understand\ncompositional structures of multi-level semantics beyond individual and\ncontextual properties. The task of temporal action segmentation, which aims at\ntranslating an untrimmed activity video into a sequence of action segments,\nremains challenging for this reason. This paper addresses the problem by\nintroducing an effective activity grammar to guide neural predictions for\ntemporal action segmentation. We propose a novel grammar induction algorithm\nthat extracts a powerful context-free grammar from action sequence data. We\nalso develop an efficient generalized parser that transforms frame-level\nprobability distributions into a reliable sequence of actions according to the\ninduced grammar with recursive rules. Our approach can be combined with any\nneural network for temporal action segmentation to enhance the sequence\nprediction and discover its compositional structure. Experimental results\ndemonstrate that our method significantly improves temporal action segmentation\nin terms of both performance and interpretability on two standard benchmarks,\nBreakfast and 50 Salads.\n","authors":["Dayoung Gong","Joonseok Lee","Deunsol Jung","Suha Kwak","Minsu Cho"],"pdf_url":"https://arxiv.org/pdf/2312.04266v1.pdf","comment":"Accepted to NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.04265v1","updated":"2023-12-07T12:43:00Z","published":"2023-12-07T12:43:00Z","title":"Stronger, Fewer, & Superior: Harnessing Vision Foundation Models for\n Domain Generalized Semantic Segmentation","summary":" In this paper, we first assess and harness various Vision Foundation Models\n(VFMs) in the context of Domain Generalized Semantic Segmentation (DGSS).\nDriven by the motivation that Leveraging Stronger pre-trained models and Fewer\ntrainable parameters for Superior generalizability, we introduce a robust\nfine-tuning approach, namely Rein, to parameter-efficiently harness VFMs for\nDGSS. 
Built upon a set of trainable tokens, each linked to distinct instances,\nRein precisely refines and forwards the feature maps from each layer to the\nnext layer within the backbone. This process produces diverse refinements for\ndifferent categories within a single image. With fewer trainable parameters,\nRein efficiently fine-tunes VFMs for DGSS tasks, surprisingly surpassing full\nparameter fine-tuning. Extensive experiments across various settings\ndemonstrate that Rein significantly outperforms state-of-the-art methods.\nRemarkably, with just an extra 1% of trainable parameters within the frozen\nbackbone, Rein achieves an mIoU of 68.1% on Cityscapes, without accessing\nany real urban-scene datasets.\n","authors":["Zhixiang Wei","Lin Chen","Yi Jin","Xiaoxiao Ma","Tianle Liu","Pengyang Lin","Ben Wang","Huaian Chen","Jinjin Zheng"],"pdf_url":"https://arxiv.org/pdf/2312.04265v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00786v2","updated":"2023-12-07T12:29:28Z","published":"2023-12-01T18:59:59Z","title":"Dense Optical Tracking: Connecting the Dots","summary":" Recent approaches to point tracking are able to recover the trajectory of any\nscene point through a large portion of a video despite the presence of\nocclusions. They are, however, too slow in practice to track every point\nobserved in a single frame in a reasonable amount of time. This paper\nintroduces DOT, a novel, simple and efficient method for solving this problem.\nIt first extracts a small set of tracks from key regions at motion boundaries\nusing an off-the-shelf point tracking algorithm. Given source and target\nframes, DOT then computes rough initial estimates of a dense flow field and\nvisibility mask through nearest-neighbor interpolation, before refining them\nusing a learnable optical flow estimator that explicitly handles occlusions and\ncan be trained on synthetic data with ground-truth correspondences. We show\nthat DOT is significantly more accurate than current optical flow techniques,\noutperforms sophisticated \"universal\" trackers like OmniMotion, and is on par\nwith, or better than, the best point tracking algorithms like CoTracker while\nbeing at least two orders of magnitude faster. Quantitative and qualitative\nexperiments with synthetic and real videos validate the promise of the proposed\napproach. Code, data, and videos showcasing the capabilities of our approach\nare available on the project webpage: https://16lemoing.github.io/dot .\n","authors":["Guillaume Le Moing","Jean Ponce","Cordelia Schmid"],"pdf_url":"https://arxiv.org/pdf/2312.00786v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.14388v2","updated":"2023-12-07T12:20:25Z","published":"2023-11-24T10:07:14Z","title":"A Parameterized Generative Adversarial Network Using Cyclic Projection\n for Explainable Medical Image Classification","summary":" Although current data augmentation methods are successful in alleviating\ndata insufficiency, conventional augmentation is primarily intra-domain, while\nimages generated by advanced generative adversarial networks (GANs) remain\nuncertain, particularly on small-scale datasets. In this paper, we propose a\nparameterized GAN (ParaGAN) that effectively controls the changes of synthetic\nsamples among domains and highlights the attention regions for downstream\nclassification. Specifically, ParaGAN incorporates projection distance\nparameters in cyclic projection and projects the source images to the decision\nboundary to obtain the class-difference maps. 
Our experiments show that ParaGAN\ncan consistently outperform the existing augmentation methods with explainable\nclassification on two small-scale medical datasets.\n","authors":["Xiangyu Xiong","Yue Sun","Xiaohong Liu","Chan-Tong Lam","Tong Tong","Hao Chen","Qinquan Gao","Wei Ke","Tao Tan"],"pdf_url":"https://arxiv.org/pdf/2311.14388v2.pdf","comment":"5 pages, 4 figures. This work has been submitted to the IEEE for\n possible publication. Copyright may be transferred without notice, after\n which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2311.17663v2","updated":"2023-12-07T12:19:27Z","published":"2023-11-29T14:25:46Z","title":"Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in\n Autonomous Driving Applications","summary":" Understanding how the surrounding environment changes is crucial for\nperforming downstream tasks safely and reliably in autonomous driving\napplications. Recent occupancy estimation techniques using only camera images\nas input can provide dense occupancy representations of large-scale scenes\nbased on the current observation. However, they are mostly limited to\nrepresenting the current 3D space and do not consider the future state of\nsurrounding objects along the time axis. To extend camera-only occupancy\nestimation into spatiotemporal prediction, we propose Cam4DOcc, a new benchmark\nfor camera-only 4D occupancy forecasting, evaluating the surrounding scene\nchanges in a near future. We build our benchmark based on multiple publicly\navailable datasets, including nuScenes, nuScenes-Occupancy, and Lyft-Level5,\nwhich provides sequential occupancy states of general movable and static\nobjects, as well as their 3D backward centripetal flow. To establish this\nbenchmark for future research with comprehensive comparisons, we introduce four\nbaseline types from diverse camera-based perception and prediction\nimplementations, including a static-world occupancy model, voxelization of\npoint cloud prediction, 2D-3D instance-based prediction, and our proposed novel\nend-to-end 4D occupancy forecasting network. Furthermore, the standardized\nevaluation protocol for preset multiple tasks is also provided to compare the\nperformance of all the proposed baselines on present and future occupancy\nestimation with respect to objects of interest in autonomous driving scenarios.\nThe dataset and our implementation of all four baselines in the proposed\nCam4DOcc benchmark will be released here: https://github.com/haomo-ai/Cam4DOcc.\n","authors":["Junyi Ma","Xieyuanli Chen","Jiawei Huang","Jingyi Xu","Zhen Luo","Jintao Xu","Weihao Gu","Rui Ai","Hesheng Wang"],"pdf_url":"https://arxiv.org/pdf/2311.17663v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.03894v3","updated":"2023-12-07T12:18:20Z","published":"2022-11-07T22:56:23Z","title":"visClust: A visual clustering algorithm based on orthogonal projections","summary":" We present a novel clustering algorithm, visClust, that is based on lower\ndimensional data representations and visual interpretation. Thereto, we design\na transformation that allows the data to be represented by a binary integer\narray enabling the use of image processing methods to select a partition.\nQualitative and quantitative analyses measured in accuracy and an adjusted\nRand-Index show that the algorithm performs well while requiring low runtime\nand RAM. We compare the results to 6 state-of-the-art algorithms with available\ncode, confirming the quality of visClust by superior performance in most\nexperiments. 
Moreover, the algorithm asks for just one obligatory input\nparameter while allowing optimization via optional parameters. The code is made\navailable on GitHub and straightforward to use.\n","authors":["Anna Breger","Clemens Karner","Martin Ehler"],"pdf_url":"https://arxiv.org/pdf/2211.03894v3.pdf","comment":"28 pages"},{"id":"http://arxiv.org/abs/2312.04248v1","updated":"2023-12-07T12:10:05Z","published":"2023-12-07T12:10:05Z","title":"TeMO: Towards Text-Driven 3D Stylization for Multi-Object Meshes","summary":" Recent progress in the text-driven 3D stylization of a single object has been\nconsiderably promoted by CLIP-based methods. However, the stylization of\nmulti-object 3D scenes is still impeded in that the image-text pairs used for\npre-training CLIP mostly consist of an object. Meanwhile, the local details of\nmultiple objects may be susceptible to omission due to the existing supervision\nmanner primarily relying on coarse-grained contrast of image-text pairs. To\novercome these challenges, we present a novel framework, dubbed TeMO, to parse\nmulti-object 3D scenes and edit their styles under the contrast supervision at\nmultiple levels. We first propose a Decoupled Graph Attention (DGA) module to\ndistinguishably reinforce the features of 3D surface points. Particularly, a\ncross-modal graph is constructed to align the object points accurately and noun\nphrases decoupled from the 3D mesh and textual description. Then, we develop a\nCross-Grained Contrast (CGC) supervision system, where a fine-grained loss\nbetween the words in the textual description and the randomly rendered images\nare constructed to complement the coarse-grained loss. Extensive experiments\nshow that our method can synthesize high-quality stylized content and\noutperform the existing methods over a wide range of multi-object 3D meshes.\nOur code and results will be made publicly available\n","authors":["Xuying Zhang","Bo-Wen Yin","Yuming Chen","Zheng Lin","Yunheng Li","Qibin Hou","Ming-Ming Cheng"],"pdf_url":"https://arxiv.org/pdf/2312.04248v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.08528v2","updated":"2023-12-07T12:03:20Z","published":"2023-10-12T17:21:41Z","title":"4D Gaussian Splatting for Real-Time Dynamic Scene Rendering","summary":" Representing and rendering dynamic scenes has been an important but\nchallenging task. Especially, to accurately model complex motions, high\nefficiency is usually hard to guarantee. To achieve real-time dynamic scene\nrendering while also enjoying high training and storage efficiency, we propose\n4D Gaussian Splatting (4D-GS) as a holistic representation for dynamic scenes\nrather than applying 3D-GS for each individual frame. In 4D-GS, a novel\nexplicit representation containing both 3D Gaussians and 4D neural voxels is\nproposed. A decomposed neural voxel encoding algorithm inspired by HexPlane is\nproposed to efficiently build Gaussian features from 4D neural voxels and then\na lightweight MLP is applied to predict Gaussian deformations at novel\ntimestamps. Our 4D-GS method achieves real-time rendering under high\nresolutions, 82 FPS at an 800$\\times$800 resolution on an RTX 3090 GPU while\nmaintaining comparable or better quality than previous state-of-the-art\nmethods. 
More demos and code are available at\nhttps://guanjunwu.github.io/4dgs/.\n","authors":["Guanjun Wu","Taoran Yi","Jiemin Fang","Lingxi Xie","Xiaopeng Zhang","Wei Wei","Wenyu Liu","Qi Tian","Xinggang Wang"],"pdf_url":"https://arxiv.org/pdf/2310.08528v2.pdf","comment":"Project page: https://guanjunwu.github.io/4dgs/"},{"id":"http://arxiv.org/abs/2312.04236v1","updated":"2023-12-07T11:41:26Z","published":"2023-12-07T11:41:26Z","title":"Detecting and Restoring Non-Standard Hands in Stable Diffusion Generated\n Images","summary":" We introduce a pipeline to address anatomical inaccuracies in Stable\nDiffusion generated hand images. The initial step involves constructing a\nspecialized dataset, focusing on hand anomalies, to train our models\neffectively. A finetuned detection model is pivotal for precise identification\nof these anomalies, ensuring targeted correction. Body pose estimation aids in\nunderstanding hand orientation and positioning, crucial for accurate anomaly\ncorrection. The integration of ControlNet and InstructPix2Pix facilitates\nsophisticated inpainting and pixel-level transformation, respectively. This\ndual approach allows for high-fidelity image adjustments. This comprehensive\napproach ensures the generation of images with anatomically accurate hands,\nclosely resembling real-world appearances. Our experimental results demonstrate\nthe pipeline's efficacy in enhancing hand image realism in Stable Diffusion\noutputs. We provide an online demo at https://fixhand.yiqun.io\n","authors":["Yiqun Zhang","Zhenyue Qin","Yang Liu","Dylan Campbell"],"pdf_url":"https://arxiv.org/pdf/2312.04236v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04233v1","updated":"2023-12-07T11:39:11Z","published":"2023-12-07T11:39:11Z","title":"Fine-tune vision foundation model for crack segmentation in civil\n infrastructures","summary":" Large-scale foundation models have become the mainstream method in the field\nof deep learning, while in civil engineering, the scale of AI models is\nstrictly limited. In this work, vision foundation model is introduced for crack\nsegmentation. Two Parameter-efficient fine-tuning methods, adapter and low-rank\nadaptation, are adopted to fine-tune the foundation model in the field of\nsemantic segmentation: Segment Anything Model (SAM). The fine-tuned model\nCrackSAM is much larger than all the existing crack segmentation models, but\nshows excellent performance. To test the zero-shot performance of the proposed\nmethod, two unique datasets related to road and exterior wall cracks are\ncollected, annotated and open-sourced, in total 810 images. Comparative\nexperiments are conducted with twelve mature semantic segmentation models. On\ndatasets with artificial noise and previously unseen datasets, the performance\nof CrackSAM far exceeds that of all state-of-the-art models. CrackSAM exhibits\nremarkable superiority, particularly in challenging conditions such as dim\nlighting, shadows, road markings, construction joints, and other interference\nfactors. 
Such cross-scenario results demonstrate the outstanding zero-shot\ncapability of foundation models, and provide new ideas for the development of\nvision models in civil engineering.\n","authors":["Kang Ge","Chen Wang","Yutao Guo"],"pdf_url":"https://arxiv.org/pdf/2312.04233v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04231v1","updated":"2023-12-07T11:31:20Z","published":"2023-12-07T11:31:20Z","title":"Adventures of Trustworthy Vision-Language Models: A Survey","summary":" Recently, transformers have become incredibly popular in computer vision and\nvision-language tasks. This notable rise in their usage can be primarily\nattributed to the capabilities offered by attention mechanisms and the\noutstanding ability of transformers to adapt and apply themselves to a variety\nof tasks and domains. Their versatility and state-of-the-art performance have\nestablished them as indispensable tools for a wide array of applications.\nHowever, in the constantly changing landscape of machine learning, the\nassurance of the trustworthiness of transformers holds utmost importance. This\npaper conducts a thorough examination of vision-language transformers,\nemploying three fundamental principles of responsible AI: Bias, Robustness, and\nInterpretability. The primary objective of this paper is to delve into the\nintricacies and complexities associated with the practical use of transformers,\nwith the overarching goal of advancing our comprehension of how to enhance\ntheir reliability and accountability.\n","authors":["Mayank Vatsa","Anubhooti Jain","Richa Singh"],"pdf_url":"https://arxiv.org/pdf/2312.04231v1.pdf","comment":"Accepted in AAAI 2024"},{"id":"http://arxiv.org/abs/2210.11407v3","updated":"2023-12-07T11:24:37Z","published":"2022-10-20T16:56:47Z","title":"Similarity of Neural Architectures using Adversarial Attack\n Transferability","summary":" In recent years, many deep neural architectures have been developed for image\nclassification. Whether they are similar or dissimilar and what factors\ncontribute to their (dis)similarities remains curious. To address this\nquestion, we aim to design a quantitative and scalable similarity measure\nbetween neural architectures. We propose Similarity by Attack Transferability\n(SAT) from the observation that adversarial attack transferability contains\ninformation related to input gradients and decision boundaries widely used to\nunderstand model behaviors. We conduct a large-scale analysis on 69\nstate-of-the-art ImageNet classifiers using our proposed similarity function to\nanswer the question. Moreover, we observe neural architecture-related phenomena\nusing model similarity that model diversity can lead to better performance on\nmodel ensembles and knowledge distillation under specific conditions. Our\nresults provide insights into why developing diverse neural architectures with\ndistinct components is necessary.\n","authors":["Jaehui Hwang","Dongyoon Han","Byeongho Heo","Song Park","Sanghyuk Chun","Jong-Seok Lee"],"pdf_url":"https://arxiv.org/pdf/2210.11407v3.pdf","comment":"20pages, 13 figures, 2.3MB"},{"id":"http://arxiv.org/abs/2312.04225v1","updated":"2023-12-07T11:16:00Z","published":"2023-12-07T11:16:00Z","title":"TLCE: Transfer-Learning Based Classifier Ensembles for Few-Shot\n Class-Incremental Learning","summary":" Few-shot class-incremental learning (FSCIL) struggles to incrementally\nrecognize novel classes from few examples without catastrophic forgetting of\nold classes or overfitting to new classes. 
We propose TLCE, which ensembles\nmultiple pre-trained models to improve separation of novel and old classes.\nTLCE minimizes interference between old and new classes by mapping old class\nimages to quasi-orthogonal prototypes using episodic training. It then\nensembles diverse pre-trained models to better adapt to novel classes despite\ndata imbalance. Extensive experiments on various datasets demonstrate that our\ntransfer learning ensemble approach outperforms state-of-the-art FSCIL methods.\n","authors":["Shuangmei Wang","Yang Cao","Tieru Wu"],"pdf_url":"https://arxiv.org/pdf/2312.04225v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04215v1","updated":"2023-12-07T11:03:42Z","published":"2023-12-07T11:03:42Z","title":"Guided Reconstruction with Conditioned Diffusion Models for Unsupervised\n Anomaly Detection in Brain MRIs","summary":" Unsupervised anomaly detection in Brain MRIs aims to identify abnormalities\nas outliers from a healthy training distribution. Reconstruction-based\napproaches that use generative models to learn to reconstruct healthy brain\nanatomy are commonly used for this task. Diffusion models are an emerging class\nof deep generative models that show great potential regarding reconstruction\nfidelity. However, they face challenges in preserving intensity characteristics\nin the reconstructed images, limiting their performance in anomaly detection.\nTo address this challenge, we propose to condition the denoising mechanism of\ndiffusion models with additional information about the image to reconstruct\ncoming from a latent representation of the noise-free input image. This\nconditioning enables high-fidelity reconstruction of healthy brain structures\nwhile aligning local intensity characteristics of input-reconstruction pairs.\nWe evaluate our method's reconstruction quality, domain adaptation features and\nfinally segmentation performance on publicly available data sets with various\npathologies. Using our proposed conditioning mechanism we can reduce the\nfalse-positive predictions and enable a more precise delineation of anomalies\nwhich significantly enhances the anomaly detection performance compared to\nestablished state-of-the-art approaches to unsupervised anomaly detection in\nbrain MRI. Furthermore, our approach shows promise in domain adaptation across\ndifferent MRI acquisitions and simulated contrasts, a crucial property of\ngeneral anomaly detection methods.\n","authors":["Finn Behrendt","Debayan Bhattacharya","Robin Mieling","Lennart Maack","Julia Krüger","Roland Opfer","Alexander Schlaefer"],"pdf_url":"https://arxiv.org/pdf/2312.04215v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2312.01623v3","updated":"2023-12-07T10:55:03Z","published":"2023-12-04T04:47:48Z","title":"Universal Segmentation at Arbitrary Granularity with Language\n Instruction","summary":" This paper aims to achieve universal segmentation of arbitrary semantic\nlevel. Despite significant progress in recent years, specialist segmentation\napproaches are limited to specific tasks and data distribution. Retraining a\nnew model for adaptation to new scenarios or settings takes expensive\ncomputation and time cost, which raises the demand for versatile and universal\nsegmentation model that can cater to various granularity. 
Although some\nattempts have been made to unify different segmentation tasks or\ngeneralize to various scenarios, limitations in the definition of paradigms\nand input-output spaces make it difficult for them to achieve accurate\nunderstanding of content at arbitrary granularity. To this end, we present\nUniLSeg, a universal segmentation model that can perform segmentation at any\nsemantic level with the guidance of language instructions. For training\nUniLSeg, we reorganize a group of tasks from original diverse distributions\ninto a unified data format, where images with texts describing segmentation\ntargets are the input and corresponding masks are the output. Combined with an automatic\nannotation engine for utilizing numerous unlabeled data, UniLSeg achieves\nexcellent performance on various tasks and settings, surpassing both specialist\nand unified segmentation models.\n","authors":["Yong Liu","Cairong Zhang","Yitong Wang","Jiahao Wang","Yujiu Yang","Yansong Tang"],"pdf_url":"https://arxiv.org/pdf/2312.01623v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04210v1","updated":"2023-12-07T10:52:16Z","published":"2023-12-07T10:52:16Z","title":"Constraint Model for the Satellite Image Mosaic Selection Problem","summary":" Satellite imagery solutions are widely used to study and monitor different\nregions of the Earth. However, a single satellite image can cover only a\nlimited area. In cases where a larger area of interest is studied, several\nimages must be stitched together to create a single larger image, called a\nmosaic, that can cover the area. Today, with the increasing number of satellite\nimages available for commercial use, selecting the images to build the mosaic\nis challenging, especially when the user wants to optimize one or more\nparameters, such as the total cost and the cloud coverage percentage in the\nmosaic. More precisely, for this problem the input is an area of interest,\nseveral satellite images intersecting the area, a list of requirements relative\nto the image and the mosaic, such as cloud coverage percentage, image\nresolution, and a list of objectives to optimize. We contribute to the\nconstraint and mixed integer linear programming formulation of this new\nproblem, which we call the \\textit{satellite image mosaic selection problem},\nwhich is a multi-objective extension of the polygon cover problem. We propose a\ndataset of realistic and challenging instances, where the images were captured\nby the satellite constellations SPOT, Pl\\'eiades and Pl\\'eiades Neo. We\nevaluate and compare the two proposed models and show their efficiency for\nlarge instances, up to 200 images.\n","authors":["Manuel Combarro Simón","Pierre Talbot","Grégoire Danoy","Jedrzej Musial","Mohammed Alswaitti","Pascal Bouvry"],"pdf_url":"https://arxiv.org/pdf/2312.04210v1.pdf","comment":"This paper contains minor corrections from the original document\n presented at the 29th International Conference on Principles and Practice of\n Constraint Programming (CP 2023). Minor corrections in Figures 5a and 5b that\n do not affect the analysis result. Minor typo corrections in Appendix A"},{"id":"http://arxiv.org/abs/2310.13355v2","updated":"2023-12-07T10:39:44Z","published":"2023-10-20T08:44:47Z","title":"SILC: Improving Vision Language Pretraining with Self-Distillation","summary":" Image-Text pretraining on web-scale image caption datasets has become the\ndefault recipe for open vocabulary classification and retrieval models thanks\nto the success of CLIP and its variants. 
Several works have also used CLIP\nfeatures for dense prediction tasks and have shown the emergence of open-set\nabilities. However, the contrastive objective used by these models only focuses\non image-text alignment and does not incentivise image feature learning for\ndense prediction tasks. In this work, we introduce SILC, a novel framework for\nvision language pretraining. SILC improves image-text contrastive learning with\nthe simple addition of local-to-global correspondence learning by\nself-distillation. We show that distilling local image features from an\nexponential moving average (EMA) teacher model significantly improves model\nperformance on dense predictions tasks like detection and segmentation, while\nalso providing improvements on image-level tasks such as classification and\nretrieval. SILC models sets a new state of the art for zero-shot\nclassification, few shot classification, image and text retrieval, zero-shot\nsegmentation, and open vocabulary segmentation. We further show that SILC\nfeatures greatly benefit open vocabulary detection, captioning and visual\nquestion answering.\n","authors":["Muhammad Ferjad Naeem","Yongqin Xian","Xiaohua Zhai","Lukas Hoyer","Luc Van Gool","Federico Tombari"],"pdf_url":"https://arxiv.org/pdf/2310.13355v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2306.12816v2","updated":"2023-12-07T10:37:09Z","published":"2023-06-22T11:31:11Z","title":"XAI-TRIS: Non-linear image benchmarks to quantify false positive\n post-hoc attribution of feature importance","summary":" The field of 'explainable' artificial intelligence (XAI) has produced highly\ncited methods that seek to make the decisions of complex machine learning (ML)\nmethods 'understandable' to humans, for example by attributing 'importance'\nscores to input features. Yet, a lack of formal underpinning leaves it unclear\nas to what conclusions can safely be drawn from the results of a given XAI\nmethod and has also so far hindered the theoretical verification and empirical\nvalidation of XAI methods. This means that challenging non-linear problems,\ntypically solved by deep neural networks, presently lack appropriate remedies.\nHere, we craft benchmark datasets for three different non-linear classification\nscenarios, in which the important class-conditional features are known by\ndesign, serving as ground truth explanations. Using novel quantitative metrics,\nwe benchmark the explanation performance of a wide set of XAI methods across\nthree deep learning model architectures. We show that popular XAI methods are\noften unable to significantly outperform random performance baselines and edge\ndetection methods. Moreover, we demonstrate that explanations derived from\ndifferent model architectures can be vastly different; thus, prone to\nmisinterpretation even under controlled conditions.\n","authors":["Benedict Clark","Rick Wilming","Stefan Haufe"],"pdf_url":"https://arxiv.org/pdf/2306.12816v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2312.04197v1","updated":"2023-12-07T10:31:05Z","published":"2023-12-07T10:31:05Z","title":"SAMBA: A Trainable Segmentation Web-App with Smart Labelling","summary":" Segmentation is the assigning of a semantic class to every pixel in an image\nand is a prerequisite for various statistical analysis tasks in materials\nscience, like phase quantification, physics simulations or morphological\ncharacterization. 
The wide range of length scales, imaging techniques and\nmaterials studied in materials science means any segmentation algorithm must\ngeneralise to unseen data and support abstract, user-defined semantic classes.\nTrainable segmentation is a popular interactive segmentation paradigm where a\nclassifier is trained to map from image features to user drawn labels. SAMBA is\na trainable segmentation tool that uses Meta's Segment Anything Model (SAM) for\nfast, high-quality label suggestions and a random forest classifier for robust,\ngeneralizable segmentations. It is accessible in the browser\n(https://www.sambasegment.com/) without the need to download any external\ndependencies. The segmentation backend is run in the cloud, so does not require\nthe user to have powerful hardware.\n","authors":["Ronan Docherty","Isaac Squires","Antonis Vamvakeros","Samuel J. Cooper"],"pdf_url":"https://arxiv.org/pdf/2312.04197v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04189v1","updated":"2023-12-07T10:16:21Z","published":"2023-12-07T10:16:21Z","title":"Joint-Individual Fusion Structure with Fusion Attention Module for\n Multi-Modal Skin Cancer Classification","summary":" Most convolutional neural network (CNN) based methods for skin cancer\nclassification obtain their results using only dermatological images. Although\ngood classification results have been shown, more accurate results can be\nachieved by considering the patient's metadata, which is valuable clinical\ninformation for dermatologists. Current methods only use the simple joint\nfusion structure (FS) and fusion modules (FMs) for the multi-modal\nclassification methods, there still is room to increase the accuracy by\nexploring more advanced FS and FM. Therefore, in this paper, we design a new\nfusion method that combines dermatological images (dermoscopy images or\nclinical images) and patient metadata for skin cancer classification from the\nperspectives of FS and FM. First, we propose a joint-individual fusion (JIF)\nstructure that learns the shared features of multi-modality data and preserves\nspecific features simultaneously. Second, we introduce a fusion attention (FA)\nmodule that enhances the most relevant image and metadata features based on\nboth the self and mutual attention mechanism to support the decision-making\npipeline. We compare the proposed JIF-MMFA method with other state-of-the-art\nfusion methods on three different public datasets. The results show that our\nJIF-MMFA method improves the classification results for all tested CNN\nbackbones and performs better than the other fusion methods on the three public\ndatasets, demonstrating our method's effectiveness and robustness\n","authors":["Peng Tang","Xintong Yan","Yang Nan","Xiaobin Hu","Xiaobin Hu","Bjoern H Menzee. Sebastian Krammer","Tobias Lasser"],"pdf_url":"https://arxiv.org/pdf/2312.04189v1.pdf","comment":"submitted to Pattern Recognition journal before 2022"},{"id":"http://arxiv.org/abs/2205.08324v3","updated":"2023-12-07T10:08:04Z","published":"2022-05-17T13:20:30Z","title":"Exploring the Interactive Guidance for Unified and Effective Image\n Matting","summary":" Recent image matting studies are developing towards proposing trimap-free or\ninteractive methods for complete complex image matting tasks. 
Although avoiding\nthe extensive labors of trimap annotation, existing methods still suffer from\ntwo limitations: (1) For the single image with multiple objects, it is\nessential to provide extra interaction information to help determining the\nmatting target; (2) For transparent objects, the accurate regression of alpha\nmatte from RGB image is much more difficult compared with the opaque ones. In\nthis work, we propose a Unified Interactive image Matting method, named UIM,\nwhich solves the limitations and achieves satisfying matting results for any\nscenario. Specifically, UIM leverages multiple types of user interaction to\navoid the ambiguity of multiple matting targets, and we compare the pros and\ncons of different annotation types in detail. To unify the matting performance\nfor transparent and opaque objects, we decouple image matting into two stages,\ni.e., foreground segmentation and transparency prediction. Moreover, we design\na multi-scale attentive fusion module to alleviate the vagueness in the\nboundary region. Experimental results demonstrate that UIM achieves\nstate-of-the-art performance on the Composition-1K test set and a synthetic\nunified dataset.\n","authors":["Dinghao Yang","Bin Wang","Weijia Li","Yiqi Lin","Conghui He"],"pdf_url":"https://arxiv.org/pdf/2205.08324v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2112.07921v2","updated":"2023-12-07T09:59:23Z","published":"2021-12-15T06:57:01Z","title":"Temporal Shuffling for Defending Deep Action Recognition Models against\n Adversarial Attacks","summary":" Recently, video-based action recognition methods using convolutional neural\nnetworks (CNNs) achieve remarkable recognition performance. However, there is\nstill lack of understanding about the generalization mechanism of action\nrecognition models. In this paper, we suggest that action recognition models\nrely on the motion information less than expected, and thus they are robust to\nrandomization of frame orders. Furthermore, we find that motion monotonicity\nremaining after randomization also contributes to such robustness. Based on\nthis observation, we develop a novel defense method using temporal shuffling of\ninput videos against adversarial attacks for action recognition models. Another\nobservation enabling our defense method is that adversarial perturbations on\nvideos are sensitive to temporal destruction. To the best of our knowledge,\nthis is the first attempt to design a defense method without additional\ntraining for 3D CNN-based video action recognition models.\n","authors":["Jaehui Hwang","Huan Zhang","Jun-Ho Choi","Cho-Jui Hsieh","Jong-Seok Lee"],"pdf_url":"https://arxiv.org/pdf/2112.07921v2.pdf","comment":"12 pages, accepted to Neural Networks"},{"id":"http://arxiv.org/abs/2312.00690v2","updated":"2023-12-07T09:41:54Z","published":"2023-12-01T16:17:16Z","title":"Open-vocabulary object 6D pose estimation","summary":" We introduce the new setting of open-vocabulary object 6D pose estimation, in\nwhich a textual prompt is used to specify the object of interest. In contrast\nto existing approaches, in our setting (i) the object of interest is specified\nsolely through the textual prompt, (ii) no object model (e.g. CAD or video\nsequence) is required at inference, (iii) the object is imaged from two\ndifferent viewpoints of two different scenes, and (iv) the object was not\nobserved during the training phase. 
To operate in this setting, we introduce a\nnovel approach that leverages a Vision-Language Model to segment the object of\ninterest from two distinct scenes and to estimate its relative 6D pose. The key\nof our approach is a carefully devised strategy to fuse object-level\ninformation provided by the prompt with local image features, resulting in a\nfeature space that can generalize to novel concepts. We validate our approach\non a new benchmark based on two popular datasets, REAL275 and Toyota-Light,\nwhich collectively encompass 39 object instances appearing in four thousand\nimage pairs. The results demonstrate that our approach outperforms both a\nwell-established hand-crafted method and a recent deep learning-based baseline\nin estimating the relative 6D pose of objects in different scenes. Project\npage: https://jcorsetti.github.io/oryon/.\n","authors":["Jaime Corsetti","Davide Boscaini","Changjae Oh","Andrea Cavallaro","Fabio Poiesi"],"pdf_url":"https://arxiv.org/pdf/2312.00690v2.pdf","comment":"Technical report. 21 pages, 15 figures, 6 tables. Updated website\n link"},{"id":"http://arxiv.org/abs/2312.04168v1","updated":"2023-12-07T09:37:28Z","published":"2023-12-07T09:37:28Z","title":"Augmentation-Free Dense Contrastive Knowledge Distillation for Efficient\n Semantic Segmentation","summary":" In recent years, knowledge distillation methods based on contrastive learning\nhave achieved promising results on image classification and object detection\ntasks. However, in this line of research, we note that less attention is paid\nto semantic segmentation. Existing methods heavily rely on data augmentation\nand memory buffer, which entail high computational resource demands when\napplying them to handle semantic segmentation that requires to preserve\nhigh-resolution feature maps for making dense pixel-wise predictions. In order\nto address this problem, we present Augmentation-free Dense Contrastive\nKnowledge Distillation (Af-DCD), a new contrastive distillation learning\nparadigm to train compact and accurate deep neural networks for semantic\nsegmentation applications. Af-DCD leverages a masked feature mimicking\nstrategy, and formulates a novel contrastive learning loss via taking advantage\nof tactful feature partitions across both channel and spatial dimensions,\nallowing to effectively transfer dense and structured local knowledge learnt by\nthe teacher model to a target student model while maintaining training\nefficiency. Extensive experiments on five mainstream benchmarks with various\nteacher-student network pairs demonstrate the effectiveness of our approach.\nFor instance, the DeepLabV3-Res18|DeepLabV3-MBV2 model trained by Af-DCD\nreaches 77.03%|76.38% mIOU on Cityscapes dataset when choosing DeepLabV3-Res101\nas the teacher, setting new performance records. Besides that, Af-DCD achieves\nan absolute mIOU improvement of 3.26%|3.04%|2.75%|2.30%|1.42% compared with\nindividually trained counterpart on Cityscapes|Pascal\nVOC|Camvid|ADE20K|COCO-Stuff-164K. Code is available at\nhttps://github.com/OSVAI/Af-DCD\n","authors":["Jiawei Fan","Chao Li","Xiaolong Liu","Meina Song","Anbang Yao"],"pdf_url":"https://arxiv.org/pdf/2312.04168v1.pdf","comment":"The paper of Af-DCD is accepted to NeurIPS 2023. 
Code and models are\n available at https://github.com/OSVAI/Af-DCD"},{"id":"http://arxiv.org/abs/2305.09132v2","updated":"2023-12-07T09:33:38Z","published":"2023-05-16T03:25:38Z","title":"DualGenerator: Information Interaction-based Generative Network for\n Point Cloud Completion","summary":" Point cloud completion estimates complete shapes from incomplete point clouds\nto obtain higher-quality point cloud data. Most existing methods only consider\nglobal object features, ignoring spatial and semantic information of adjacent\npoints. They cannot distinguish structural information well between different\nobject parts, and the robustness of models is poor. To tackle these challenges,\nwe propose an information interaction-based generative network for point cloud\ncompletion ($\\mathbf{DualGenerator}$). It contains an adversarial generation\npath and a variational generation path, which interact with each other and\nshare weights. DualGenerator introduces a local refinement module in generation\npaths, which captures general structures from partial inputs, and then refines\nshape details of the point cloud. It promotes completion in the unknown region\nand makes a distinction between different parts more obvious. Moreover, we\ndesign DGStyleGAN to improve the generation quality further. It promotes the\nrobustness of this network combined with fusion analysis of dual-path\ncompletion results. Qualitative and quantitative evaluations demonstrate that\nour method is superior on MVP and Completion3D datasets. The performance will\nnot degrade significantly after adding noise interference or sparse sampling.\n","authors":["Pengcheng Shi","Haozhe Cheng","Xu Han","Yiyang Zhou","Jihua Zhu"],"pdf_url":"https://arxiv.org/pdf/2305.09132v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01860v2","updated":"2023-12-07T09:24:35Z","published":"2023-12-04T12:48:44Z","title":"Unveiling Objects with SOLA: An Annotation-Free Image Search on the\n Object Level for Automotive Data Sets","summary":" Huge image data sets are the fundament for the development of the perception\nof automated driving systems. A large number of images is necessary to train\nrobust neural networks that can cope with diverse situations. A sufficiently\nlarge data set contains challenging situations and objects. For testing the\nresulting functions, it is necessary that these situations and objects can be\nfound and extracted from the data set. While it is relatively easy to record a\nlarge amount of unlabeled data, it is far more difficult to find demanding\nsituations and objects. However, during the development of perception systems,\nit must be possible to access challenging data without having to perform\nlengthy and time-consuming annotations. A developer must therefore be able to\nsearch dynamically for specific situations and objects in a data set. Thus, we\ndesigned a method which is based on state-of-the-art neural networks to search\nfor objects with certain properties within an image. For the ease of use, the\nquery of this search is described using natural language. 
To determine the time\nsavings and performance gains, we evaluated our method qualitatively and\nquantitatively on automotive data sets.\n","authors":["Philipp Rigoll","Jacob Langner","Eric Sax"],"pdf_url":"https://arxiv.org/pdf/2312.01860v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04160v1","updated":"2023-12-07T09:22:20Z","published":"2023-12-07T09:22:20Z","title":"Text as Image: Learning Transferable Adapter for Multi-Label\n Classification","summary":" Pre-trained vision-language models have notably accelerated progress of\nopen-world concept recognition. Their impressive zero-shot ability has recently\nbeen transferred to multi-label image classification via prompt tuning,\nenabling to discover novel labels in an open-vocabulary manner. However, this\nparadigm suffers from non-trivial training costs, and becomes computationally\nprohibitive for a large number of candidate labels. To address this issue, we\nnote that vision-language pre-training aligns images and texts in a unified\nembedding space, making it potential for an adapter network to identify labels\nin visual modality while be trained in text modality. To enhance such\ncross-modal transfer ability, a simple yet effective method termed random\nperturbation is proposed, which enables the adapter to search for potential\nvisual embeddings by perturbing text embeddings with noise during training,\nresulting in better performance in visual modality. Furthermore, we introduce\nan effective approach to employ large language models for multi-label\ninstruction-following text generation. In this way, a fully automated pipeline\nfor visual label recognition is developed without relying on any manual data.\nExtensive experiments on public benchmarks show the superiority of our method\nin various multi-label classification tasks.\n","authors":["Xuelin Zhu","Jiuxin Cao","Jian liu","Dongqi Tang","Furong Xu","Weijia Liu","Jiawei Ge","Bo Liu","Qingpei Guo","Tianyi Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.04160v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04152v1","updated":"2023-12-07T09:10:16Z","published":"2023-12-07T09:10:16Z","title":"EulerMormer: Robust Eulerian Motion Magnification via Dynamic Filtering\n within Transformer","summary":" Video Motion Magnification (VMM) aims to break the resolution limit of human\nvisual perception capability and reveal the imperceptible minor motion that\ncontains valuable information in the macroscopic domain. However, challenges\narise in this task due to photon noise inevitably introduced by photographic\ndevices and spatial inconsistency in amplification, leading to flickering\nartifacts in static fields and motion blur and distortion in dynamic fields in\nthe video. Existing methods focus on explicit motion modeling without\nemphasizing prioritized denoising during the motion magnification process. This\npaper proposes a novel dynamic filtering strategy to achieve static-dynamic\nfield adaptive denoising. Specifically, based on Eulerian theory, we separate\ntexture and shape to extract motion representation through inter-frame shape\ndifferences, expecting to leverage these subdivided features to solve this task\nfinely. Then, we introduce a novel dynamic filter that eliminates noise cues\nand preserves critical features in the motion magnification and amplification\ngeneration phases. Overall, our unified framework, EulerMormer, is a pioneering\neffort to first equip with Transformer in learning-based VMM. 
The core of the\ndynamic filter lies in a global dynamic sparse cross-covariance attention\nmechanism that explicitly removes noise while preserving vital information,\ncoupled with a multi-scale dual-path gating mechanism that selectively\nregulates the dependence on different frequency features to reduce spatial\nattenuation and complement motion boundaries. We demonstrate extensive\nexperiments that EulerMormer achieves more robust video motion magnification\nfrom the Eulerian perspective, significantly outperforming state-of-the-art\nmethods. The source code is available at\nhttps://github.com/VUT-HFUT/EulerMormer.\n","authors":["Fei Wang","Dan Guo","Kun Li","Meng Wang"],"pdf_url":"https://arxiv.org/pdf/2312.04152v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04145v1","updated":"2023-12-07T08:59:20Z","published":"2023-12-07T08:59:20Z","title":"Diffusing Colors: Image Colorization with Text Guided Diffusion","summary":" The colorization of grayscale images is a complex and subjective task with\nsignificant challenges. Despite recent progress in employing large-scale\ndatasets with deep neural networks, difficulties with controllability and\nvisual quality persist. To tackle these issues, we present a novel image\ncolorization framework that utilizes image diffusion techniques with granular\ntext prompts. This integration not only produces colorization outputs that are\nsemantically appropriate but also greatly improves the level of control users\nhave over the colorization process. Our method provides a balance between\nautomation and control, outperforming existing techniques in terms of visual\nquality and semantic coherence. We leverage a pretrained generative Diffusion\nModel, and show that we can finetune it for the colorization task without\nlosing its generative power or attention to text prompts. Moreover, we present\na novel CLIP-based ranking model that evaluates color vividness, enabling\nautomatic selection of the most suitable level of vividness based on the\nspecific scene semantics. Our approach holds potential particularly for color\nenhancement and historical image colorization.\n","authors":["Nir Zabari","Aharon Azulay","Alexey Gorkor","Tavi Halperin","Ohad Fried"],"pdf_url":"https://arxiv.org/pdf/2312.04145v1.pdf","comment":"SIGGRAPH Asia 2023"},{"id":"http://arxiv.org/abs/2312.04143v1","updated":"2023-12-07T08:58:33Z","published":"2023-12-07T08:58:33Z","title":"Towards 4D Human Video Stylization","summary":" We present a first step towards 4D (3D and time) human video stylization,\nwhich addresses style transfer, novel view synthesis and human animation within\na unified framework. While numerous video stylization methods have been\ndeveloped, they are often restricted to rendering images in specific viewpoints\nof the input video, lacking the capability to generalize to novel views and\nnovel poses in dynamic scenes. To overcome these limitations, we leverage\nNeural Radiance Fields (NeRFs) to represent videos, conducting stylization in\nthe rendered feature space. Our innovative approach involves the simultaneous\nrepresentation of both the human subject and the surrounding scene using two\nNeRFs. This dual representation facilitates the animation of human subjects\nacross various poses and novel viewpoints. Specifically, we introduce a novel\ngeometry-guided tri-plane representation, significantly enhancing feature\nrepresentation robustness compared to direct tri-plane optimization. 
Following\nthe video reconstruction, stylization is performed within the NeRFs' rendered\nfeature space. Extensive experiments demonstrate that the proposed method\nstrikes a superior balance between stylized textures and temporal coherence,\nsurpassing existing approaches. Furthermore, our framework uniquely extends its\ncapabilities to accommodate novel poses and viewpoints, making it a versatile\ntool for creative human video stylization.\n","authors":["Tiantian Wang","Xinxin Zuo","Fangzhou Mu","Jian Wang","Ming-Hsuan Yang"],"pdf_url":"https://arxiv.org/pdf/2312.04143v1.pdf","comment":"Under Review"},{"id":"http://arxiv.org/abs/2312.04140v1","updated":"2023-12-07T08:55:28Z","published":"2023-12-07T08:55:28Z","title":"Polarimetric Light Transport Analysis for Specular Inter-reflection","summary":" Polarization is well known for its ability to decompose diffuse and specular\nreflections. However, the existing decomposition methods only focus on direct\nreflection and overlook multiple reflections, especially specular\ninter-reflection. In this paper, we propose a novel decomposition method for\nhandling specular inter-reflection of metal objects by using a unique\npolarimetric feature: the rotation direction of linear polarization. This\nrotation direction serves as a discriminative factor between direct and\ninter-reflection on specular surfaces. To decompose the reflectance components,\nwe actively rotate the linear polarization of incident light and analyze the\nrotation direction of the reflected light. We evaluate our method using both\nsynthetic and real data, demonstrating its effectiveness in decomposing\nspecular inter-reflections of metal objects. Furthermore, we demonstrate that\nour method can be combined with other decomposition methods for a detailed\nanalysis of light transport. As a practical application, we show its\neffectiveness in improving the accuracy of 3D measurement against strong\nspecular inter-reflection.\n","authors":["Ryota Maeda","Shinsaku Hiura"],"pdf_url":"https://arxiv.org/pdf/2312.04140v1.pdf","comment":"This work has been submitted to the IEEE for possible publication.\n Copyright may be transferred without notice, after which this version may no\n longer be accessible"},{"id":"http://arxiv.org/abs/2204.04969v2","updated":"2023-12-07T08:54:36Z","published":"2022-04-11T09:35:24Z","title":"Assessing hierarchies by their consistent segmentations","summary":" Current approaches to generic segmentation start by creating a hierarchy of\nnested image partitions and then specifying a segmentation from it. Our first\ncontribution is to describe several ways, most of them new, for specifying\nsegmentations using the hierarchy elements. Then, we consider the best\nhierarchy-induced segmentation specified by a limited number of hierarchy\nelements. We focus on a common quality measure for binary segmentations, the\nJaccard index (also known as IoU). Optimizing the Jaccard index is highly\nnon-trivial, and yet we propose an efficient approach for doing exactly that.\nThis way we get algorithm-independent upper bounds on the quality of any\nsegmentation created from the hierarchy. We found that the obtainable\nsegmentation quality varies significantly depending on the way that the\nsegments are specified by the hierarchy elements, and that representing a\nsegmentation with only a few hierarchy elements is often possible. 
(Code is\navailable).\n","authors":["Zeev Gutman","Ritvik Vij","Laurent Najman","Michael Lindenbaum"],"pdf_url":"https://arxiv.org/pdf/2204.04969v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.02183v2","updated":"2023-12-07T08:48:37Z","published":"2023-11-03T18:27:43Z","title":"A New Fine-grained Alignment Method for Image-text Matching","summary":" Image-text retrieval is a widely studied topic in the field of computer\nvision due to the exponential growth of multimedia data, whose core concept is\nto measure the similarity between images and text. However, most existing\nretrieval methods heavily rely on cross-attention mechanisms for cross-modal\nfine-grained alignment, which takes into account excessive irrelevant regions\nand treats prominent and non-significant words equally, thereby limiting\nretrieval accuracy. This paper aims to investigate an alignment approach that\nreduces the involvement of non-significant fragments in images and text while\nenhancing the alignment of prominent segments. For this purpose, we introduce\nthe Cross-Modal Prominent Fragments Enhancement Aligning Network(CPFEAN), which\nachieves improved retrieval accuracy by diminishing the participation of\nirrelevant regions during alignment and relatively increasing the alignment\nsimilarity of prominent words. Additionally, we incorporate prior textual\ninformation into image regions to reduce misalignment occurrences. In practice,\nwe first design a novel intra-modal fragments relationship reasoning method,\nand subsequently employ our proposed alignment mechanism to compute the\nsimilarity between images and text. Extensive quantitative comparative\nexperiments on MS-COCO and Flickr30K datasets demonstrate that our approach\noutperforms state-of-the-art methods by about 5% to 10% in the rSum metric.\n","authors":["Yang Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.02183v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04125v1","updated":"2023-12-07T08:28:41Z","published":"2023-12-07T08:28:41Z","title":"Forensic Iris Image Synthesis","summary":" Post-mortem iris recognition is an emerging application of iris-based human\nidentification in a forensic setup, able to correctly identify deceased\nsubjects even three weeks post-mortem. This technique thus is considered as an\nimportant component of future forensic toolkits. The current advancements in\nthis field are seriously slowed down by exceptionally difficult data\ncollection, which can happen in mortuary conditions, at crime scenes, or in\n``body farm'' facilities. This paper makes a novel contribution to facilitate\nprogress in post-mortem iris recognition by offering a conditional\nStyleGAN-based iris synthesis model, trained on the largest-available dataset\nof post-mortem iris samples acquired from more than 350 subjects, generating --\nthrough appropriate exploration of StyleGAN latent space -- multiple\nwithin-class (same identity) and between-class (different new identities)\npost-mortem iris images, compliant with ISO/IEC 29794-6, and with decomposition\ndeformations controlled by the requested PMI (post mortem interval). Besides an\nobvious application to enhance the existing, very sparse, post-mortem iris\ndatasets to advance -- among others -- iris presentation attack endeavors, we\nanticipate it may be useful to generate samples that would expose professional\nforensic human examiners to never-seen-before deformations for various PMIs,\nincreasing their training effectiveness. 
The source codes and model weights are\nmade available with the paper.\n","authors":["Rasel Ahmed Bhuiyan","Adam Czajka"],"pdf_url":"https://arxiv.org/pdf/2312.04125v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04119v1","updated":"2023-12-07T08:20:07Z","published":"2023-12-07T08:20:07Z","title":"A Multilevel Guidance-Exploration Network and Behavior-Scene Matching\n Method for Human Behavior Anomaly Detection","summary":" Human behavior anomaly detection aims to identify unusual human actions,\nplaying a crucial role in intelligent surveillance and other areas. The current\nmainstream methods still adopt reconstruction or future frame prediction\ntechniques. However, reconstructing or predicting low-level pixel features\neasily enables the network to achieve overly strong generalization ability,\nallowing anomalies to be reconstructed or predicted as effectively as normal\ndata. Different from their methods, inspired by the Student-Teacher Network, we\npropose a novel framework called the Multilevel Guidance-Exploration\nNetwork(MGENet), which detects anomalies through the difference in high-level\nrepresentation between the Guidance and Exploration network. Specifically, we\nfirst utilize the pre-trained Normalizing Flow that takes skeletal keypoints as\ninput to guide an RGB encoder, which takes unmasked RGB frames as input, to\nexplore motion latent features. Then, the RGB encoder guides the mask encoder,\nwhich takes masked RGB frames as input, to explore the latent appearance\nfeature. Additionally, we design a Behavior-Scene Matching Module(BSMM) to\ndetect scene-related behavioral anomalies. Extensive experiments demonstrate\nthat our proposed method achieves state-of-the-art performance on ShanghaiTech\nand UBnormal datasets, with AUC of 86.9 % and 73.5 %, respectively. The code\nwill be available on https://github.com/molu-ggg/GENet.\n","authors":["Guoqing Yang","Zhiming Luo","Jianzhe Gao","Yingxin Lai","Kun Yang","Yifan He","Shaozi Li"],"pdf_url":"https://arxiv.org/pdf/2312.04119v1.pdf","comment":"8 pages,7 figures"},{"id":"http://arxiv.org/abs/2312.04118v1","updated":"2023-12-07T08:18:40Z","published":"2023-12-07T08:18:40Z","title":"Caregiver Talk Shapes Toddler Vision: A Computational Study of Dyadic\n Play","summary":" Infants' ability to recognize and categorize objects develops gradually. The\nsecond year of life is marked by both the emergence of more semantic visual\nrepresentations and a better understanding of word meaning. This suggests that\nlanguage input may play an important role in shaping visual representations.\nHowever, even in suitable contexts for word learning like dyadic play sessions,\ncaregivers utterances are sparse and ambiguous, often referring to objects that\nare different from the one to which the child attends. Here, we systematically\ninvestigate to what extent caregivers' utterances can nevertheless enhance\nvisual representations. For this we propose a computational model of visual\nrepresentation learning during dyadic play. We introduce a synthetic dataset of\nego-centric images perceived by a toddler-agent that moves and rotates toy\nobjects in different parts of its home environment while hearing caregivers'\nutterances, modeled as captions. We propose to model toddlers' learning as\nsimultaneously aligning representations for 1) close-in-time images and 2)\nco-occurring images and utterances. 
We show that utterances with statistics\nmatching those of real caregivers give rise to representations supporting\nimproved category recognition. Our analysis reveals that a small\ndecrease/increase in object-relevant naming frequencies can drastically impact\nthe learned representations. This affects the attention on object names within\nan utterance, which is required for efficient visuo-linguistic alignment.\nOverall, our results support the hypothesis that caregivers' naming utterances\ncan improve toddlers' visual representations.\n","authors":["Timothy Schaumlöffel","Arthur Aubret","Gemma Roig","Jochen Triesch"],"pdf_url":"https://arxiv.org/pdf/2312.04118v1.pdf","comment":"Proceedings of the 2023 IEEE International Conference on Development\n and Learning (ICDL)"},{"id":"http://arxiv.org/abs/2312.04117v1","updated":"2023-12-07T08:18:35Z","published":"2023-12-07T08:18:35Z","title":"Instance Tracking in 3D Scenes from Egocentric Videos","summary":" Egocentric sensors such as AR/VR devices capture human-object interactions\nand offer the potential to provide task-assistance by recalling 3D locations of\nobjects of interest in the surrounding environment. This capability requires\ninstance tracking in real-world 3D scenes from egocentric videos (IT3DEgo). We\nexplore this problem by first introducing a new benchmark dataset, consisting\nof RGB and depth videos, per-frame camera pose, and instance-level annotations\nin both 2D camera and 3D world coordinates. We present an evaluation protocol\nwhich evaluates tracking performance in 3D coordinates with two settings for\nenrolling instances to track: (1) single-view online enrollment where an\ninstance is specified on-the-fly based on the human wearer's interactions. and\n(2) multi-view pre-enrollment where images of an instance to be tracked are\nstored in memory ahead of time. To address IT3DEgo, we first re-purpose methods\nfrom relevant areas, e.g., single object tracking (SOT) -- running SOT methods\nto track instances in 2D frames and lifting them to 3D using camera pose and\ndepth. We also present a simple method that leverages pretrained segmentation\nand detection models to generate proposals from RGB frames and match proposals\nwith enrolled instance images. Perhaps surprisingly, our extensive experiments\nshow that our method (with no finetuning) significantly outperforms SOT-based\napproaches. We conclude by arguing that the problem of egocentric instance\ntracking is made easier by leveraging camera pose and using a 3D allocentric\n(world) coordinate representation.\n","authors":["Yunhan Zhao","Haoyu Ma","Shu Kong","Charless Fowlkes"],"pdf_url":"https://arxiv.org/pdf/2312.04117v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.10158v2","updated":"2023-12-07T08:04:48Z","published":"2023-08-20T04:12:50Z","title":"HODN: Disentangling Human-Object Feature for HOI Detection","summary":" The task of Human-Object Interaction (HOI) detection is to detect humans and\ntheir interactions with surrounding objects, where transformer-based methods\nshow dominant advances currently. However, these methods ignore the\nrelationship among humans, objects, and interactions: 1) human features are\nmore contributive than object ones to interaction prediction; 2) interactive\ninformation disturbs the detection of objects but helps human detection. 
In\nthis paper, we propose a Human and Object Disentangling Network (HODN) to model\nthe HOI relationships explicitly, where humans and objects are first detected\nby two disentangling decoders independently and then processed by an\ninteraction decoder. Considering that human features are more contributive to\ninteraction, we propose a Human-Guide Linking method to make sure the\ninteraction decoder focuses on the human-centric regions with human features as\nthe positional embeddings. To handle the opposite influences of interactions on\nhumans and objects, we propose a Stop-Gradient Mechanism to stop interaction\ngradients from optimizing the object detection but to allow them to optimize\nthe human detection. Our proposed method achieves competitive performance on\nboth the V-COCO and the HICO-Det datasets. It can be combined with existing\nmethods easily for state-of-the-art results.\n","authors":["Shuman Fang","Zhiwen Lin","Ke Yan","Jie Li","Xianming Lin","Rongrong Ji"],"pdf_url":"https://arxiv.org/pdf/2308.10158v2.pdf","comment":"Accepted by TMM 2023"},{"id":"http://arxiv.org/abs/2312.04113v1","updated":"2023-12-07T07:59:10Z","published":"2023-12-07T07:59:10Z","title":"Multi-strategy Collaborative Optimized YOLOv5s and its Application in\n Distance Estimation","summary":" The increasing accident rate brought about by the explosive growth of\nautomobiles has made the research on active safety systems of automobiles\nincreasingly important. The importance of improving the accuracy of vehicle\ntarget detection is self-evident. To achieve the goals of vehicle detection and\ndistance estimation and provide safety warnings, a Distance Estimation Safety\nWarning System (DESWS) based on a new neural network model (YOLOv5s-SE) by\nreplacing the IoU with DIoU, embedding SE attention module, and a distance\nestimation method through using the principle of similar triangles was\nproposed. In addition, a method that can give safety suggestions based on the\nestimated distance using nonparametric testing was presented in this work.\nThrough the simulation experiment, it was verified that the mAP was improved by\n5.5% and the purpose of giving safety suggestions based on the estimated\ndistance information can be achieved.\n","authors":["Zijian Shen","Zhenping Mu","Xiangxiang Li"],"pdf_url":"https://arxiv.org/pdf/2312.04113v1.pdf","comment":"This paper contains 5 pages, 10 figures, and was accepted at 4th\n International Conference on Advances in Electrical Engineering and Computer\n Applications (AEECA2023)"},{"id":"http://arxiv.org/abs/2305.16225v3","updated":"2023-12-07T07:56:52Z","published":"2023-05-25T16:32:01Z","title":"ProSpect: Prompt Spectrum for Attribute-Aware Personalization of\n Diffusion Models","summary":" Personalizing generative models offers a way to guide image generation with\nuser-provided references. Current personalization methods can invert an object\nor concept into the textual conditioning space and compose new natural\nsentences for text-to-image diffusion models. However, representing and editing\nspecific visual attributes such as material, style, and layout remains a\nchallenge, leading to a lack of disentanglement and editability. To address\nthis problem, we propose a novel approach that leverages the step-by-step\ngeneration process of diffusion models, which generate images from low to high\nfrequency information, providing a new perspective on representing, generating,\nand editing images. 
We develop the Prompt Spectrum Space P*, an expanded\ntextual conditioning space, and a new image representation method called\nProSpect. ProSpect represents an image as a collection of inverted textual\ntoken embeddings encoded from per-stage prompts, where each prompt corresponds\nto a specific generation stage (i.e., a group of consecutive steps) of the\ndiffusion model. Experimental results demonstrate that P* and ProSpect offer\nbetter disentanglement and controllability compared to existing methods. We\napply ProSpect in various personalized attribute-aware image generation\napplications, such as image-guided or text-driven manipulations of materials,\nstyle, and layout, achieving previously unattainable results from a single\nimage input without fine-tuning the diffusion models. Our source code is\navailable at https://github.com/zyxElsa/ProSpect.\n","authors":["Yuxin Zhang","Weiming Dong","Fan Tang","Nisha Huang","Haibin Huang","Chongyang Ma","Tong-Yee Lee","Oliver Deussen","Changsheng Xu"],"pdf_url":"https://arxiv.org/pdf/2305.16225v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2112.00428v3","updated":"2023-12-07T07:48:55Z","published":"2021-12-01T11:37:26Z","title":"Adv-4-Adv: Thwarting Changing Adversarial Perturbations via Adversarial\n Domain Adaptation","summary":" Whereas adversarial training can be useful against specific adversarial\nperturbations, it has also proven ineffective in generalizing towards\nattacks deviating from those used for training. However, we observe that this\nineffectiveness is intrinsically connected to domain adaptability, another\ncrucial issue in deep learning for which adversarial domain adaptation appears\nto be a promising solution. Consequently, we propose Adv-4-Adv as a novel\nadversarial training method that aims to retain robustness against unseen\nadversarial perturbations. Essentially, Adv-4-Adv treats attacks incurring\ndifferent perturbations as distinct domains, and by leveraging the power of\nadversarial domain adaptation, it aims to remove the domain/attack-specific\nfeatures. This forces a trained model to learn a robust domain-invariant\nrepresentation, which in turn enhances its generalization ability. Extensive\nevaluations on Fashion-MNIST, SVHN, CIFAR-10, and CIFAR-100 demonstrate that a\nmodel trained by Adv-4-Adv based on samples crafted by simple attacks (e.g.,\nFGSM) can be generalized to more advanced attacks (e.g., PGD), and the\nperformance exceeds state-of-the-art proposals on these datasets.\n","authors":["Tianyue Zheng","Zhe Chen","Shuya Ding","Chao Cai","Jun Luo"],"pdf_url":"https://arxiv.org/pdf/2112.00428v3.pdf","comment":"22 pages"},{"id":"http://arxiv.org/abs/2302.11196v2","updated":"2023-12-07T07:44:24Z","published":"2023-02-22T08:13:34Z","title":"Invariant Target Detection in Images through the Normalized 2-D\n Correlation Technique","summary":" The normalized 2-D correlation technique is a robust method for detecting\ntargets in images due to its ability to remain invariant under rotation,\ntranslation, and scaling. This paper examines the impact of translation and\nscaling on target identification in images. The results indicate a high level\nof accuracy in detecting targets, even when they exhibit variations in\nlocation and size. The results indicate that the similarity between the image\nand the two used targets improves as the resize ratio increases. All\nstatistical estimators demonstrate a strong similarity between the original and\nextracted targets. 
The elapsed time for all scenarios falls within the range\n(44.75-44.85), (37.48-37.73) seconds for bird and children targets\nrespectively, and the correlation coefficient displays stable relationships\nwith values that fall within the range of (0.90-0.98) and (0.87-0.93) for bird\nand children targets respectively.\n","authors":["Fatin E. M. Al-Obaidi","Anwar H. Al-Saleh","Shaymaa H. Kafi","Ali J. Karam","Ali A. D. Al-Zuky"],"pdf_url":"https://arxiv.org/pdf/2302.11196v2.pdf","comment":"The paper in its uploaded format contained errors in analysis as in\n Abstract-4th and 5th line. In section 4 ; results and discussion and in\n conclusion section. So, in order not to adopt the research as a source by\n other researchers I wish to withdraw it completely"},{"id":"http://arxiv.org/abs/2312.04106v1","updated":"2023-12-07T07:41:10Z","published":"2023-12-07T07:41:10Z","title":"Identity-Obscured Neural Radiance Fields: Privacy-Preserving 3D Facial\n Reconstruction","summary":" Neural radiance fields (NeRF) typically require a complete set of images\ntaken from multiple camera perspectives to accurately reconstruct geometric\ndetails. However, this approach raises significant privacy concerns in the\ncontext of facial reconstruction. The critical need for privacy protection\noften leads individuals to be reluctant to share their facial images, due\nto fears of potential misuse or security risks. Addressing these concerns, we\npropose a method that leverages privacy-preserving images for reconstructing 3D\nhead geometry within the NeRF framework. Our method stands apart from\ntraditional facial reconstruction techniques as it does not depend on RGB\ninformation from images containing sensitive facial data. Instead, it\neffectively generates plausible facial geometry using a series of\nidentity-obscured inputs, thereby protecting facial privacy.\n","authors":["Jiayi Kong","Baixin Xu","Xurui Song","Chen Qian","Jun Luo","Ying He"],"pdf_url":"https://arxiv.org/pdf/2312.04106v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04095v1","updated":"2023-12-07T07:17:24Z","published":"2023-12-07T07:17:24Z","title":"Learn to Unlearn for Deep Neural Networks: Minimizing Unlearning\n Interference with Gradient Projection","summary":" Recent data-privacy laws have sparked interest in machine unlearning, which\ninvolves removing the effect of specific training samples from a learnt model\nas if they were never present in the original training dataset. The challenge\nof machine unlearning is to discard information about the ``forget'' data in\nthe learnt model without altering the knowledge about the remaining dataset and\nto do so more efficiently than the naive retraining approach. To achieve this,\nwe adopt a projected-gradient based learning method, named\nProjected-Gradient Unlearning (PGU), in which the model takes steps in the\northogonal direction to the gradient subspaces deemed unimportant for the\nretaining dataset, so that its knowledge is preserved. By utilizing Stochastic\nGradient Descent (SGD) to update the model weights, our method can efficiently\nscale to any model and dataset size. We provide empirical evidence to\ndemonstrate that our unlearning method can produce models that behave similarly\nto models retrained from scratch across various metrics even when the training\ndataset is no longer accessible. 
Our code is available at\nhttps://github.com/hnanhtuan/projected_gradient_unlearning.\n","authors":["Tuan Hoang","Santu Rana","Sunil Gupta","Svetha Venkatesh"],"pdf_url":"https://arxiv.org/pdf/2312.04095v1.pdf","comment":"Accepted to WACV 2024"},{"id":"http://arxiv.org/abs/2207.13326v2","updated":"2023-12-07T07:05:12Z","published":"2022-07-27T07:02:36Z","title":"Point Cloud Attacks in Graph Spectral Domain: When 3D Geometry Meets\n Graph Signal Processing","summary":" With the increasing attention in various 3D safety-critical applications,\npoint cloud learning models have been shown to be vulnerable to adversarial\nattacks. Although existing 3D attack methods achieve high success rates, they\ndelve into the data space with point-wise perturbation, which may neglect the\ngeometric characteristics. Instead, we propose point cloud attacks from a new\nperspective -- the graph spectral domain attack, aiming to perturb graph\ntransform coefficients in the spectral domain that corresponds to varying\ncertain geometric structure. Specifically, leveraging on graph signal\nprocessing, we first adaptively transform the coordinates of points onto the\nspectral domain via graph Fourier transform (GFT) for compact representation.\nThen, we analyze the influence of different spectral bands on the geometric\nstructure, based on which we propose to perturb the GFT coefficients via a\nlearnable graph spectral filter. Considering the low-frequency components\nmainly contribute to the rough shape of the 3D object, we further introduce a\nlow-frequency constraint to limit perturbations within imperceptible\nhigh-frequency components. Finally, the adversarial point cloud is generated by\ntransforming the perturbed spectral representation back to the data domain via\nthe inverse GFT. Experimental results demonstrate the effectiveness of the\nproposed attack in terms of both the imperceptibility and attack success rates.\n","authors":["Daizong Liu","Wei Hu","Xin Li"],"pdf_url":"https://arxiv.org/pdf/2207.13326v2.pdf","comment":"Accepted to IEEE Transactions on Pattern Analysis and Machine\n Intelligence (TPAMI). arXiv admin note: substantial text overlap with\n arXiv:2202.07261"},{"id":"http://arxiv.org/abs/2312.04089v1","updated":"2023-12-07T07:00:09Z","published":"2023-12-07T07:00:09Z","title":"Open-Vocabulary Segmentation with Semantic-Assisted Calibration","summary":" This paper studies open-vocabulary segmentation (OVS) through calibrating\nin-vocabulary and domain-biased embedding space with generalized contextual\nprior of CLIP. As the core of open-vocabulary understanding, alignment of\nvisual content with the semantics of unbounded text has become the bottleneck\nof this field. To address this challenge, recent works propose to utilize CLIP\nas an additional classifier and aggregate model predictions with CLIP\nclassification results. Despite their remarkable progress, performance of OVS\nmethods in relevant scenarios is still unsatisfactory compared with supervised\ncounterparts. We attribute this to the in-vocabulary embedding and\ndomain-biased CLIP prediction. To this end, we present a Semantic-assisted\nCAlibration Network (SCAN). In SCAN, we incorporate generalized semantic prior\nof CLIP into proposal embedding to avoid collapsing on known categories.\nBesides, a contextual shift strategy is applied to mitigate the lack of global\ncontext and unnatural background noise. With above designs, SCAN achieves\nstate-of-the-art performance on all popular open-vocabulary segmentation\nbenchmarks. 
Furthermore, we also focus on the problem of existing evaluation\nsystem that ignores semantic duplication across categories, and propose a new\nmetric called Semantic-Guided IoU (SG-IoU).\n","authors":["Yong Liu","Sule Bai","Guanbin Li","Yitong Wang","Yansong Tang"],"pdf_url":"https://arxiv.org/pdf/2312.04089v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04087v1","updated":"2023-12-07T06:53:55Z","published":"2023-12-07T06:53:55Z","title":"VRPTEST: Evaluating Visual Referring Prompting in Large Multimodal\n Models","summary":" With recent advancements in Large Multimodal Models (LMMs) across various\ndomains, a novel prompting method called visual referring prompting has\nemerged, showing significant potential in enhancing human-computer interaction\nwithin multimodal systems. This method offers a more natural and flexible\napproach to human interaction with these systems compared to traditional text\ndescriptions or coordinates. However, the categorization of visual referring\nprompting remains undefined, and its impact on the performance of LMMs has yet\nto be formally examined. In this study, we conduct the first comprehensive\nanalysis of LMMs using a variety of visual referring prompting strategies. We\nintroduce a benchmark dataset called VRPTEST, comprising 3 different visual\ntasks and 2,275 images, spanning diverse combinations of prompt strategies.\nUsing VRPTEST, we conduct a comprehensive evaluation of eight versions of\nprominent open-source and proprietary foundation models, including two early\nversions of GPT-4V. We develop an automated assessment framework based on\nsoftware metamorphic testing techniques to evaluate the accuracy of LMMs\nwithout the need for human intervention or manual labeling. We find that the\ncurrent proprietary models generally outperform the open-source ones, showing\nan average accuracy improvement of 22.70%; however, there is still potential\nfor improvement. Moreover, our quantitative analysis shows that the choice of\nprompt strategy significantly affects the accuracy of LMMs, with variations\nranging from -17.5% to +7.3%. Further case studies indicate that an appropriate\nvisual referring prompting strategy can improve LMMs' understanding of context\nand location information, while an unsuitable one might lead to answer\nrejection. We also provide insights on minimizing the negative impact of visual\nreferring prompting on LMMs.\n","authors":["Zongjie Li","Chaozheng Wang","Chaowei Liu","Pingchuan Ma","Daoyuan Wu","Shuai Wang","Cuiyun Gao"],"pdf_url":"https://arxiv.org/pdf/2312.04087v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2312.04086v1","updated":"2023-12-07T06:53:25Z","published":"2023-12-07T06:53:25Z","title":"MTVG : Multi-text Video Generation with Text-to-Video Models","summary":" Recently, video generation has attracted massive attention and yielded\nnoticeable outcomes. Concerning the characteristics of video, multi-text\nconditioning incorporating sequential events is necessary for next-step video\ngeneration. In this work, we propose a novel multi-text video generation~(MTVG)\nby directly utilizing a pre-trained diffusion-based text-to-video~(T2V)\ngeneration model without additional fine-tuning. To generate consecutive video\nsegments, visual consistency generated by distinct prompts is necessary with\ndiverse variations, such as motion and content-related transitions. 
Our\nproposed MTVG includes Dynamic Noise and Last Frame Aware Inversion which\nreinitialize the noise latent to preserve visual coherence between videos of\ndifferent prompts and prevent repetitive motion or contents. Furthermore, we\npresent Structure Guiding Sampling to maintain the global appearance across the\nframes in a single video clip, where we leverage iterative latent updates\nacross the preceding frame. Additionally, our Prompt Generator allows for\narbitrary format of text conditions consisting of diverse events. As a result,\nour extensive experiments, including diverse transitions of descriptions,\ndemonstrate that our proposed methods show superior generated outputs in terms\nof semantically coherent and temporally seamless video.Video examples are\navailable in our project page: https://kuai-lab.github.io/mtvg-page.\n","authors":["Gyeongrok Oh","Jaehwan Jeong","Sieun Kim","Wonmin Byeon","Jinkyu Kim","Sungwoong Kim","Hyeokmin Kwon","Sangpil Kim"],"pdf_url":"https://arxiv.org/pdf/2312.04086v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.05588v2","updated":"2023-12-07T06:52:21Z","published":"2022-11-10T13:58:38Z","title":"Watching the News: Towards VideoQA Models that can Read","summary":" Video Question Answering methods focus on commonsense reasoning and visual\ncognition of objects or persons and their interactions over time. Current\nVideoQA approaches ignore the textual information present in the video.\nInstead, we argue that textual information is complementary to the action and\nprovides essential contextualisation cues to the reasoning process. To this\nend, we propose a novel VideoQA task that requires reading and understanding\nthe text in the video. To explore this direction, we focus on news videos and\nrequire QA systems to comprehend and answer questions about the topics\npresented by combining visual and textual cues in the video. We introduce the\n``NewsVideoQA'' dataset that comprises more than $8,600$ QA pairs on $3,000+$\nnews videos obtained from diverse news channels from around the world. We\ndemonstrate the limitations of current Scene Text VQA and VideoQA methods and\npropose ways to incorporate scene text information into VideoQA methods.\n","authors":["Soumya Jahagirdar","Minesh Mathew","Dimosthenis Karatzas","C. V. Jawahar"],"pdf_url":"https://arxiv.org/pdf/2211.05588v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04076v1","updated":"2023-12-07T06:43:34Z","published":"2023-12-07T06:43:34Z","title":"Large Language Models are Good Prompt Learners for Low-Shot Image\n Classification","summary":" Low-shot image classification, where training images are limited or\ninaccessible, has benefited from recent progress on pre-trained vision-language\n(VL) models with strong generalizability, e.g. CLIP. Prompt learning methods\nbuilt with VL models generate text features from the class names that only have\nconfined class-specific information. Large Language Models (LLMs), with their\nvast encyclopedic knowledge, emerge as the complement. Thus, in this paper, we\ndiscuss the integration of LLMs to enhance pre-trained VL models, specifically\non low-shot classification. However, the domain gap between language and vision\nblocks the direct application of LLMs. Thus, we propose LLaMP, Large Language\nModels as Prompt learners, that produces adaptive prompts for the CLIP text\nencoder, establishing it as the connecting bridge. 
Experiments show that,\ncompared with other state-of-the-art prompt learning methods, LLaMP yields\nbetter performance on both zero-shot generalization and few-shot image\nclassification, over a spectrum of 11 datasets.\n","authors":["Zhaoheng Zheng","Jingmin Wei","Xuefeng Hu","Haidong Zhu","Ram Nevatia"],"pdf_url":"https://arxiv.org/pdf/2312.04076v1.pdf","comment":"Technical Report"},{"id":"http://arxiv.org/abs/2304.14484v2","updated":"2023-12-07T06:17:29Z","published":"2023-04-27T19:52:47Z","title":"OriCon3D: Effective 3D Object Detection using Orientation and Confidence","summary":" We introduce a technique for detecting 3D objects and estimating their\nposition from a single image. Our method is built on top of a similar\nstate-of-the-art technique [1], but with improved accuracy. The approach\nfollowed in this research first estimates common 3D properties of an object\nusing a Deep Convolutional Neural Network (DCNN), contrary to other frameworks\nthat only leverage centre-point predictions. We then combine these estimates\nwith geometric constraints provided by a 2D bounding box to produce a complete\n3D bounding box. The first output of our network estimates the 3D object\norientation using a discrete-continuous loss [1]. The second output predicts\nthe 3D object dimensions with minimal variance. Here we also present our\nextensions by augmenting light-weight feature extractors and a customized\nmultibin architecture. By combining these estimates with the geometric\nconstraints of the 2D bounding box, we can accurately (or comparatively)\ndetermine the 3D object pose better than our baseline [1] on the KITTI 3D\ndetection benchmark [2].\n","authors":["Dhyey Manish Rajani","Surya Pratap Singh","Rahul Kashyap Swayampakula"],"pdf_url":"https://arxiv.org/pdf/2304.14484v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04066v1","updated":"2023-12-07T06:16:39Z","published":"2023-12-07T06:16:39Z","title":"Combining inherent knowledge of vision-language models with unsupervised\n domain adaptation through self-knowledge distillation","summary":" Unsupervised domain adaptation (UDA) tries to overcome the tedious work of\nlabeling data by leveraging a labeled source dataset and transferring its\nknowledge to a similar but different target dataset. On the other hand, current\nvision-language models exhibit astonishing zero-shot prediction capabilities.\nIn this work, we combine knowledge gained through UDA with the inherent\nknowledge of vision-language models. In a first step, we generate the zero-shot\npredictions of the source and target dataset using the vision-language model.\nSince zero-shot predictions usually exhibit a large entropy, meaning that the\nclass probabilities are rather evenly distributed, we first adjust the\ndistribution to accentuate the winning probabilities. This is done using both\nsource and target data to keep the relative confidence between source and\ntarget data. We then employ a conventional DA method, to gain the knowledge\nfrom the source dataset, in combination with self-knowledge distillation, to\nmaintain the inherent knowledge of the vision-language model. We further\ncombine our method with a gradual source domain expansion strategy (GSDE) and\nshow that this strategy can also benefit by including zero-shot predictions. We\nconduct experiments and ablation studies on three benchmarks (OfficeHome,\nVisDA, and DomainNet) and outperform state-of-the-art methods. 
We further show\nin ablation studies the contributions of different parts of our algorithm.\n","authors":["Thomas Westfechtel","Dexuan Zhang","Tatsuya Harada"],"pdf_url":"https://arxiv.org/pdf/2312.04066v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11086v2","updated":"2023-12-07T06:04:58Z","published":"2023-07-20T17:59:33Z","title":"PAPR: Proximity Attention Point Rendering","summary":" Learning accurate and parsimonious point cloud representations of scene\nsurfaces from scratch remains a challenge in 3D representation learning.\nExisting point-based methods often suffer from the vanishing gradient problem\nor require a large number of points to accurately model scene geometry and\ntexture. To address these limitations, we propose Proximity Attention Point\nRendering (PAPR), a novel method that consists of a point-based scene\nrepresentation and a differentiable renderer. Our scene representation uses a\npoint cloud where each point is characterized by its spatial position,\ninfluence score, and view-independent feature vector. The renderer selects the\nrelevant points for each ray and produces accurate colours using their\nassociated features. PAPR effectively learns point cloud positions to represent\nthe correct scene geometry, even when the initialization drastically differs\nfrom the target geometry. Notably, our method captures fine texture details\nwhile using only a parsimonious set of points. We also demonstrate four\npractical applications of our method: zero-shot geometry editing, object\nmanipulation, texture transfer, and exposure control. More results and code are\navailable on our project website at https://zvict.github.io/papr/.\n","authors":["Yanshu Zhang","Shichong Peng","Alireza Moazeni","Ke Li"],"pdf_url":"https://arxiv.org/pdf/2307.11086v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04063v1","updated":"2023-12-07T06:03:07Z","published":"2023-12-07T06:03:07Z","title":"An unsupervised approach towards promptable defect segmentation in\n laser-based additive manufacturing by Segment Anything","summary":" Foundation models are currently driving a paradigm shift in computer vision\ntasks for various fields including biology, astronomy, and robotics among\nothers, leveraging user-generated prompts to enhance their performance. In the\nmanufacturing domain, accurate image-based defect segmentation is imperative to\nensure product quality and facilitate real-time process control. However, such\ntasks are often characterized by multiple challenges including the absence of\nlabels and the requirement for low latency inference among others. To address\nthese issues, we construct a framework for image segmentation using a\nstate-of-the-art Vision Transformer (ViT) based Foundation model (Segment\nAnything Model) with a novel multi-point prompt generation scheme using\nunsupervised clustering. We apply our framework to perform real-time porosity\nsegmentation in a case study of laser base powder bed fusion (L-PBF) and obtain\nhigh Dice Similarity Coefficients (DSC) without the necessity for any\nsupervised fine-tuning in the model. 
Using such lightweight foundation model\ninference in conjunction with unsupervised prompt generation, we envision the\nconstruction of a real-time anomaly detection pipeline that has the potential\nto revolutionize the current laser-based additive manufacturing processes,\nthereby facilitating the shift towards Industry 4.0 and promoting defect-free\nproduction along with operational efficiency.\n","authors":["Israt Zarin Era","Imtiaz Ahmed","Zhichao Liu","Srinjoy Das"],"pdf_url":"https://arxiv.org/pdf/2312.04063v1.pdf","comment":"18 pages, 9 figures"},{"id":"http://arxiv.org/abs/2311.11642v2","updated":"2023-12-07T05:47:50Z","published":"2023-11-20T10:01:13Z","title":"Video Face Re-Aging: Toward Temporally Consistent Face Re-Aging","summary":" Video face re-aging deals with altering the apparent age of a person to the\ntarget age in videos. This problem is challenging due to the lack of paired\nvideo datasets maintaining temporal consistency in identity and age. Most\nre-aging methods process each image individually without considering the\ntemporal consistency of videos. While some existing works address the issue of\ntemporal coherence through video facial attribute manipulation in latent space,\nthey often fail to deliver satisfactory performance in age transformation. To\ntackle the issues, we propose (1) a novel synthetic video dataset that features\nsubjects across a diverse range of age groups; (2) a baseline architecture\ndesigned to validate the effectiveness of our proposed dataset, and (3) the\ndevelopment of three novel metrics tailored explicitly for evaluating the\ntemporal consistency of video re-aging techniques. Our comprehensive\nexperiments on public datasets, such as VFHQ and CelebV-HQ, show that our\nmethod outperforms the existing approaches in terms of both age transformation\nand temporal consistency.\n","authors":["Abdul Muqeet","Kyuchul Lee","Bumsoo Kim","Yohan Hong","Hyungrae Lee","Woonggon Kim","KwangHee Lee"],"pdf_url":"https://arxiv.org/pdf/2311.11642v2.pdf","comment":"8 pages, 6 figures, 4 tables, Project page:\n https://video-reaging.github.io/"},{"id":"http://arxiv.org/abs/2312.04060v1","updated":"2023-12-07T05:46:10Z","published":"2023-12-07T05:46:10Z","title":"Differentiable Registration of Images and LiDAR Point Clouds with\n VoxelPoint-to-Pixel Matching","summary":" Cross-modality registration between 2D images from cameras and 3D point\nclouds from LiDARs is a crucial task in computer vision and robotic. Previous\nmethods estimate 2D-3D correspondences by matching point and pixel patterns\nlearned by neural networks, and use Perspective-n-Points (PnP) to estimate\nrigid transformation during post-processing. However, these methods struggle to\nmap points and pixels to a shared latent space robustly since points and pixels\nhave very different characteristics with patterns learned in different manners\n(MLP and CNN), and they also fail to construct supervision directly on the\ntransformation since the PnP is non-differentiable, which leads to unstable\nregistration results. To address these problems, we propose to learn a\nstructured cross-modality latent space to represent pixel features and 3D\nfeatures via a differentiable probabilistic PnP solver. Specifically, we design\na triplet network to learn VoxelPoint-to-Pixel matching, where we represent 3D\nelements using both voxels and points to learn the cross-modality latent space\nwith pixels. 
We design both the voxel and pixel branch based on CNNs to operate\nconvolutions on voxels/pixels represented in grids, and integrate an additional\npoint branch to regain the information lost during voxelization. We train our\nframework end-to-end by imposing supervisions directly on the predicted pose\ndistribution with a probabilistic PnP solver. To explore distinctive patterns\nof cross-modality features, we design a novel loss with adaptive-weighted\noptimization for cross-modality feature description. The experimental results\non KITTI and nuScenes datasets show significant improvements over the\nstate-of-the-art methods. The code and models are available at\nhttps://github.com/junshengzhou/VP2P-Match.\n","authors":["Junsheng Zhou","Baorui Ma","Wenyuan Zhang","Yi Fang","Yu-Shen Liu","Zhizhong Han"],"pdf_url":"https://arxiv.org/pdf/2312.04060v1.pdf","comment":"To appear at NeurIPS2023 (Spotlight). Code is available at\n https://github.com/junshengzhou/VP2P-Match"},{"id":"http://arxiv.org/abs/2310.13255v2","updated":"2023-12-07T05:36:26Z","published":"2023-10-20T03:22:05Z","title":"Steve-Eye: Equipping LLM-based Embodied Agents with Visual Perception in\n Open Worlds","summary":" Recent studies have presented compelling evidence that large language models\n(LLMs) can equip embodied agents with the self-driven capability to interact\nwith the world, which marks an initial step toward versatile robotics. However,\nthese efforts tend to overlook the visual richness of open worlds, rendering\nthe entire interactive process akin to \"a blindfolded text-based game.\"\nConsequently, LLM-based agents frequently encounter challenges in intuitively\ncomprehending their surroundings and producing responses that are easy to\nunderstand. In this paper, we propose Steve-Eye, an end-to-end trained large\nmultimodal model designed to address this limitation. Steve-Eye integrates the\nLLM with a visual encoder which enables it to process visual-text inputs and\ngenerate multimodal feedback. In addition, we use a semi-automatic strategy to\ncollect an extensive dataset comprising 850K open-world instruction pairs,\nempowering our model to encompass three essential functions for an agent:\nmultimodal perception, foundational knowledge base, and skill prediction and\nplanning. Lastly, we develop three open-world evaluation benchmarks, then carry\nout extensive experiments from a wide range of perspectives to validate our\nmodel's capability to strategically act and plan. Codes and datasets will be\nreleased.\n","authors":["Sipeng Zheng","Jiazheng Liu","Yicheng Feng","Zongqing Lu"],"pdf_url":"https://arxiv.org/pdf/2310.13255v2.pdf","comment":"19 pages, 19 figures"},{"id":"http://arxiv.org/abs/2312.04044v1","updated":"2023-12-07T05:04:41Z","published":"2023-12-07T05:04:41Z","title":"Residual Graph Convolutional Network for Bird's-Eye-View Semantic\n Segmentation","summary":" Retrieving spatial information and understanding the semantic information of\nthe surroundings are important for Bird's-Eye-View (BEV) semantic segmentation.\nIn the application of autonomous driving, autonomous vehicles need to be aware\nof their surroundings to drive safely. However, current BEV semantic\nsegmentation techniques, deep Convolutional Neural Networks (CNNs) and\ntransformers, have difficulties in obtaining the global semantic relationships\nof the surroundings at the early layers of the network. 
In this paper, we\npropose to incorporate a novel Residual Graph Convolutional (RGC) module in\ndeep CNNs to acquire both the global information and the region-level semantic\nrelationship in the multi-view image domain. Specifically, the RGC module\nemploys a non-overlapping graph space projection to efficiently project the\ncomplete BEV information into graph space. It then builds interconnected\nspatial and channel graphs to extract spatial information between each node and\nchannel information within each node (i.e., extract contextual relationships of\nthe global features). Furthermore, it uses a downsample residual process to\nenhance the coordinate feature reuse to maintain the global information. The\nsegmentation data augmentation and alignment module helps to simultaneously\naugment and align BEV features and ground truth to geometrically preserve their\nalignment to achieve better segmentation results. Our experimental results on\nthe nuScenes benchmark dataset demonstrate that the RGC network outperforms\nfour state-of-the-art networks and its four variants in terms of IoU and mIoU.\nThe proposed RGC network achieves a higher mIoU of 3.1% than the best\nstate-of-the-art network, BEVFusion. Code and models will be released.\n","authors":["Qiuxiao Chen","Xiaojun Qi"],"pdf_url":"https://arxiv.org/pdf/2312.04044v1.pdf","comment":"8 pages, 5 figures, this paper has been accepted by and will be\n presented at the WACV 2024"},{"id":"http://arxiv.org/abs/2312.04043v1","updated":"2023-12-07T05:04:33Z","published":"2023-12-07T05:04:33Z","title":"Doodle Your 3D: From Abstract Freehand Sketches to Precise 3D Shapes","summary":" In this paper, we democratise 3D content creation, enabling precise\ngeneration of 3D shapes from abstract sketches while overcoming limitations\ntied to drawing skills. We introduce a novel part-level modelling and alignment\nframework that facilitates abstraction modelling and cross-modal\ncorrespondence. Leveraging the same part-level decoder, our approach seamlessly\nextends to sketch modelling by establishing correspondence between CLIPasso\nedgemaps and projected 3D part regions, eliminating the need for a dataset\npairing human sketches and 3D shapes. Additionally, our method introduces a\nseamless in-position editing process as a byproduct of cross-modal part-aligned\nmodelling. Operating in a low-dimensional implicit space, our approach\nsignificantly reduces computational demands and processing time.\n","authors":["Hmrishav Bandyopadhyay","Subhadeep Koley","Ayan Das","Aneeshan Sain","Pinaki Nath Chowdhury","Tao Xiang","Ayan Kumar Bhunia","Yi-Zhe Song"],"pdf_url":"https://arxiv.org/pdf/2312.04043v1.pdf","comment":"Project Page: https://hmrishavbandy.github.io/doodle23d/"},{"id":"http://arxiv.org/abs/2310.14702v2","updated":"2023-12-07T04:42:07Z","published":"2023-10-23T08:45:12Z","title":"BM2CP: Efficient Collaborative Perception with LiDAR-Camera Modalities","summary":" Collaborative perception enables agents to share complementary perceptual\ninformation with nearby agents. This would improve the perception performance\nand alleviate the issues of single-view perception, such as occlusion and\nsparsity. Most existing approaches mainly focus on single modality (especially\nLiDAR), and not fully exploit the superiority of multi-modal perception. We\npropose a collaborative perception paradigm, BM2CP, which employs LiDAR and\ncamera to achieve efficient multi-modal perception. 
It utilizes LiDAR-guided\nmodal fusion, cooperative depth generation and modality-guided intermediate\nfusion to acquire deep interactions among modalities of different agents.\nMoreover, it is able to cope with the special case where one of the sensors,\nof the same or a different type, of any agent is missing. Extensive experiments validate\nthat our approach outperforms the state-of-the-art methods with 50X lower\ncommunication volumes in both simulated and real-world autonomous driving\nscenarios. Our code is available at https://github.com/byzhaoAI/BM2CP.\n","authors":["Binyu Zhao","Wei Zhang","Zhaonian Zou"],"pdf_url":"https://arxiv.org/pdf/2310.14702v2.pdf","comment":"14 pages, 8 figures. Accepted by CoRL 2023"},{"id":"http://arxiv.org/abs/2312.04036v1","updated":"2023-12-07T04:39:22Z","published":"2023-12-07T04:39:22Z","title":"DiffusionPhase: Motion Diffusion in Frequency Domain","summary":" In this study, we introduce a learning-based method for generating\nhigh-quality human motion sequences from text descriptions (e.g., ``A person\nwalks forward\"). Existing techniques struggle with motion diversity and smooth\ntransitions in generating arbitrary-length motion sequences, due to limited\ntext-to-motion datasets and the pose representations used that often lack\nexpressiveness or compactness. To address these issues, we propose the first\nmethod for text-conditioned human motion generation in the frequency domain of\nmotions. We develop a network encoder that converts the motion space into a\ncompact yet expressive parameterized phase space with high-frequency details\nencoded, capturing the local periodicity of motions in time and space with high\naccuracy. We also introduce a conditional diffusion model for predicting\nperiodic motion parameters based on text descriptions and a start pose,\nefficiently achieving smooth transitions between motion sequences associated\nwith different text descriptions. Experiments demonstrate that our approach\noutperforms current methods in generating a broader variety of high-quality\nmotions, and synthesizing long sequences with natural transitions.\n","authors":["Weilin Wan","Yiming Huang","Shutong Wu","Taku Komura","Wenping Wang","Dinesh Jayaraman","Lingjie Liu"],"pdf_url":"https://arxiv.org/pdf/2312.04036v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.04464v3","updated":"2023-12-07T04:35:24Z","published":"2023-11-08T05:18:57Z","title":"Enhancing Few-shot CLIP with Semantic-Aware Fine-Tuning","summary":" Learning generalized representations from limited training samples is crucial\nfor applying deep neural networks in low-resource scenarios. Recently, methods\nbased on Contrastive Language-Image Pre-training (CLIP) have exhibited\npromising performance in few-shot adaptation tasks. To avoid catastrophic\nforgetting and overfitting caused by few-shot fine-tuning, existing works\nusually freeze the parameters of CLIP pre-trained on large-scale datasets,\noverlooking the possibility that some parameters might not be suitable for\ndownstream tasks. To this end, we revisit CLIP's visual encoder with a specific\nfocus on its distinctive attention pooling layer, which performs a spatial\nweighted-sum of the dense feature maps. 
Given that dense feature maps contain\nmeaningful semantic information, and different semantics hold varying\nimportance for diverse downstream tasks (such as prioritizing semantics like\nears and eyes in pet classification tasks rather than side mirrors), using the\nsame weighted-sum operation for dense features across different few-shot tasks\nmight not be appropriate. Hence, we propose fine-tuning the parameters of the\nattention pooling layer during the training process to encourage the model to\nfocus on task-specific semantics. In the inference process, we perform residual\nblending between the features pooled by the fine-tuned and the original\nattention pooling layers to incorporate both the few-shot knowledge and the\npre-trained CLIP's prior knowledge. We term this method as Semantic-Aware\nFinE-tuning (SAFE). SAFE is effective in enhancing the conventional few-shot\nCLIP and is compatible with the existing adapter approach (termed SAFE-A).\n","authors":["Yao Zhu","Yuefeng Chen","Wei Wang","Xiaofeng Mao","Xiu Yan","Yue Wang","Zhigang Li","Wang lu","Jindong Wang","Xiangyang Ji"],"pdf_url":"https://arxiv.org/pdf/2311.04464v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03289v2","updated":"2023-12-07T04:21:33Z","published":"2023-12-06T04:38:02Z","title":"Class Incremental Learning for Adversarial Robustness","summary":" Adversarial training integrates adversarial examples during model training to\nenhance robustness. However, its application in fixed dataset settings differs\nfrom real-world dynamics, where data accumulates incrementally. In this study,\nwe investigate Adversarially Robust Class Incremental Learning (ARCIL), a\nmethod that combines adversarial robustness with incremental learning. We\nobserve that combining incremental learning with naive adversarial training\neasily leads to a loss of robustness. We discover that this is attributed to\nthe disappearance of the flatness of the loss function, a characteristic of\nadversarial training. To address this issue, we propose the Flatness Preserving\nDistillation (FPD) loss that leverages the output difference between\nadversarial and clean examples. Additionally, we introduce the Logit Adjustment\nDistillation (LAD) loss, which adapts the model's knowledge to perform well on\nnew tasks. Experimental results demonstrate the superiority of our method over\napproaches that apply adversarial training to existing incremental learning\nmethods, which provides a strong baseline for incremental learning on\nadversarial robustness in the future. Our method achieves AutoAttack accuracy\nthat is 5.99\\%p, 5.27\\%p, and 3.90\\%p higher on average than the baseline on\nsplit CIFAR-10, CIFAR-100, and Tiny ImageNet, respectively. The code will be\nmade available.\n","authors":["Seungju Cho","Hongsin Lee","Changick Kim"],"pdf_url":"https://arxiv.org/pdf/2312.03289v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2303.06458v2","updated":"2023-12-07T04:04:31Z","published":"2023-03-11T17:14:33Z","title":"ZeroNLG: Aligning and Autoencoding Domains for Zero-Shot Multimodal and\n Multilingual Natural Language Generation","summary":" Natural Language Generation (NLG) accepts input data in the form of images,\nvideos, or text and generates corresponding natural language text as output.\nExisting NLG methods mainly adopt a supervised approach and rely heavily on\ncoupled data-to-text pairs. However, for many targeted scenarios and for\nnon-English languages, sufficient quantities of labeled data are often not\navailable. 
To relax the dependency on labeled data of downstream tasks, we\npropose an intuitive and effective zero-shot learning framework, ZeroNLG, which\ncan deal with multiple NLG tasks, including image-to-text (image captioning),\nvideo-to-text (video captioning), and text-to-text (neural machine\ntranslation), across English, Chinese, German, and French within a unified\nframework. ZeroNLG does not require any labeled downstream pairs for training.\nDuring training, ZeroNLG (i) projects different domains (across modalities and\nlanguages) to corresponding coordinates in a shared common latent space; (ii)\nbridges different domains by aligning their corresponding coordinates in this\nspace; and (iii) builds an unsupervised multilingual auto-encoder to learn to\ngenerate text by reconstructing the input text given its coordinate in shared\nlatent space. Consequently, during inference, based on the data-to-text\npipeline, ZeroNLG can generate target sentences across different languages\ngiven the coordinate of input data in the common space. Within this unified\nframework, given visual (imaging or video) data as input, ZeroNLG can perform\nzero-shot visual captioning; given textual sentences as input, ZeroNLG can\nperform zero-shot machine translation. We present the results of extensive\nexperiments on twelve NLG tasks, showing that, without using any labeled\ndownstream pairs for training, ZeroNLG generates high-quality and believable\noutputs and significantly outperforms existing zero-shot methods.\n","authors":["Bang Yang","Fenglin Liu","Yuexian Zou","Xian Wu","Yaowei Wang","David A. Clifton"],"pdf_url":"https://arxiv.org/pdf/2303.06458v2.pdf","comment":"Our code and data are available at\n https://github.com/yangbang18/ZeroNLG"},{"id":"http://arxiv.org/abs/2311.09257v5","updated":"2023-12-07T03:56:56Z","published":"2023-11-14T23:07:50Z","title":"UFOGen: You Forward Once Large Scale Text-to-Image Generation via\n Diffusion GANs","summary":" Text-to-image diffusion models have demonstrated remarkable capabilities in\ntransforming textual prompts into coherent images, yet the computational cost\nof their inference remains a persistent challenge. To address this issue, we\npresent UFOGen, a novel generative model designed for ultra-fast, one-step\ntext-to-image synthesis. In contrast to conventional approaches that focus on\nimproving samplers or employing distillation techniques for diffusion models,\nUFOGen adopts a hybrid methodology, integrating diffusion models with a GAN\nobjective. Leveraging a newly introduced diffusion-GAN objective and\ninitialization with pre-trained diffusion models, UFOGen excels in efficiently\ngenerating high-quality images conditioned on textual descriptions in a single\nstep. Beyond traditional text-to-image generation, UFOGen showcases versatility\nin applications. 
Notably, UFOGen stands among the pioneering models enabling\none-step text-to-image generation and diverse downstream tasks, presenting a\nsignificant advancement in the landscape of efficient generative models.\n","authors":["Yanwu Xu","Yang Zhao","Zhisheng Xiao","Tingbo Hou"],"pdf_url":"https://arxiv.org/pdf/2311.09257v5.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04029v1","updated":"2023-12-07T03:55:20Z","published":"2023-12-07T03:55:20Z","title":"Improved Face Representation via Joint Label Classification and\n Supervised Contrastive Clustering","summary":" Face clustering tasks can learn hierarchical semantic information from\nlarge-scale data, which has the potential to help facilitate face recognition.\nHowever, there are few works on this problem. This paper explores it by\nproposing a joint optimization task of label classification and supervised\ncontrastive clustering to introduce the cluster knowledge to the traditional\nface recognition task in two ways. We first extend ArcFace with a\ncluster-guided angular margin to adjust the within-class feature distribution\naccording to the hard level of face clustering. Secondly, we propose a\nsupervised contrastive clustering approach to pull the features to the cluster\ncenter and propose the cluster-aligning procedure to align the cluster center\nand the learnable class center in the classifier for joint training. Finally,\nextensive qualitative and quantitative experiments on popular facial benchmarks\ndemonstrate the effectiveness of our paradigm and its superiority over the\nexisting approaches to face recognition.\n","authors":["Zhenduo Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.04029v1.pdf","comment":"9 pages"},{"id":"http://arxiv.org/abs/2312.04028v1","updated":"2023-12-07T03:53:53Z","published":"2023-12-07T03:53:53Z","title":"ImFace++: A Sophisticated Nonlinear 3D Morphable Face Model with\n Implicit Neural Representations","summary":" Accurate representations of 3D faces are of paramount importance in various\ncomputer vision and graphics applications. However, the challenges persist due\nto the limitations imposed by data discretization and model linearity, which\nhinder the precise capture of identity and expression clues in current studies.\nThis paper presents a novel 3D morphable face model, named ImFace++, to learn a\nsophisticated and continuous space with implicit neural representations.\nImFace++ first constructs two explicitly disentangled deformation fields to\nmodel complex shapes associated with identities and expressions, respectively,\nwhich simultaneously facilitate the automatic learning of correspondences\nacross diverse facial shapes. To capture more sophisticated facial details, a\nrefinement displacement field within the template space is further\nincorporated, enabling a fine-grained learning of individual-specific facial\ndetails. Furthermore, a Neural Blend-Field is designed to reinforce the\nrepresentation capabilities through adaptive blending of an array of local\nfields. In addition to ImFace++, we have devised an improved learning strategy\nto extend expression embeddings, allowing for a broader range of expression\nvariations. 
Comprehensive qualitative and quantitative evaluations demonstrate\nthat ImFace++ significantly advances the state-of-the-art in terms of both face\nreconstruction fidelity and correspondence accuracy.\n","authors":["Mingwu Zheng","Haiyu Zhang","Hongyu Yang","Liming Chen","Di Huang"],"pdf_url":"https://arxiv.org/pdf/2312.04028v1.pdf","comment":"17 pages, 14 figures. arXiv admin note: text overlap with\n arXiv:2203.14510"},{"id":"http://arxiv.org/abs/2312.04024v1","updated":"2023-12-07T03:42:48Z","published":"2023-12-07T03:42:48Z","title":"k* Distribution: Evaluating the Latent Space of Deep Neural Networks\n using Local Neighborhood Analysis","summary":" Most examinations of neural networks' learned latent spaces typically employ\ndimensionality reduction techniques such as t-SNE or UMAP. While these methods\neffectively capture the overall sample distribution in the entire learned\nlatent space, they tend to distort the structure of sample distributions within\nspecific classes in the subset of the latent space. This distortion complicates\nthe task of easily distinguishing classes identifiable by neural networks. In\nresponse to this challenge, we introduce the k* Distribution methodology. This\napproach focuses on capturing the characteristics and structure of sample\ndistributions for individual classes within the subset of the learned latent\nspace using local neighborhood analysis. The key concept is to facilitate easy\ncomparison of different k* distributions, enabling analysis of how various\nclasses are processed by the same neural network. This provides a more profound\nunderstanding of existing contemporary visualizations. Our study reveals three\ndistinct distributions of samples within the learned latent space subset: a)\nFractured, b) Overlapped, and c) Clustered. We note and demonstrate that the\ndistribution of samples within the network's learned latent space significantly\nvaries depending on the class. Furthermore, we illustrate that our analysis can\nbe applied to explore the latent space of diverse neural network architectures,\nvarious layers within neural networks, transformations applied to input\nsamples, and the distribution of training and testing data for neural networks.\nWe anticipate that our approach will facilitate more targeted investigations\ninto neural networks by collectively examining the distribution of different\nsamples within the learned latent space.\n","authors":["Shashank Kotyan","Ueda Tatsuya","Danilo Vasconcellos Vargas"],"pdf_url":"https://arxiv.org/pdf/2312.04024v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.01841v2","updated":"2023-12-07T03:14:22Z","published":"2023-12-04T12:25:37Z","title":"VividTalk: One-Shot Audio-Driven Talking Head Generation Based on 3D\n Hybrid Prior","summary":" Audio-driven talking head generation has drawn much attention in recent\nyears, and many efforts have been made in lip-sync, expressive facial\nexpressions, natural head pose generation, and high video quality. However, no\nmodel has yet led or tied on all these metrics due to the one-to-many mapping\nbetween audio and motion. In this paper, we propose VividTalk, a two-stage\ngeneric framework that supports generating high-visual quality talking head\nvideos with all the above properties. Specifically, in the first stage, we map\nthe audio to mesh by learning two motions, including non-rigid expression\nmotion and rigid head motion. 
For expression motion, both blendshape and vertex\nare adopted as the intermediate representation to maximize the representation\nability of the model. For natural head motion, a novel learnable head pose\ncodebook with a two-phase training mechanism is proposed. In the second stage,\nwe propose a dual-branch motion-VAE and a generator to transform the meshes\ninto dense motion and synthesize high-quality video frame-by-frame. Extensive\nexperiments show that the proposed VividTalk can generate high-visual quality\ntalking head videos with lip-sync and realism enhanced by a large margin, and\noutperforms previous state-of-the-art works in objective and subjective\ncomparisons.\n","authors":["Xusen Sun","Longhao Zhang","Hao Zhu","Peng Zhang","Bang Zhang","Xinya Ji","Kangneng Zhou","Daiheng Gao","Liefeng Bo","Xun Cao"],"pdf_url":"https://arxiv.org/pdf/2312.01841v2.pdf","comment":"10 pages, 8 figures"},{"id":"http://arxiv.org/abs/2312.03594v2","updated":"2023-12-07T03:13:33Z","published":"2023-12-06T16:34:46Z","title":"A Task is Worth One Word: Learning with Task Prompts for High-Quality\n Versatile Image Inpainting","summary":" Achieving high-quality versatile image inpainting, where user-specified\nregions are filled with plausible content according to user intent, presents a\nsignificant challenge. Existing methods face difficulties in simultaneously\naddressing context-aware image inpainting and text-guided object inpainting due\nto the distinct optimal training strategies required. To overcome this\nchallenge, we introduce PowerPaint, the first high-quality and versatile\ninpainting model that excels in both tasks. First, we introduce learnable task\nprompts along with tailored fine-tuning strategies to guide the model's focus\non different inpainting targets explicitly. This enables PowerPaint to\naccomplish various inpainting tasks by utilizing different task prompts,\nresulting in state-of-the-art performance. Second, we demonstrate the\nversatility of the task prompt in PowerPaint by showcasing its effectiveness as\na negative prompt for object removal. Additionally, we leverage prompt\ninterpolation techniques to enable controllable shape-guided object inpainting.\nFinally, we extensively evaluate PowerPaint on various inpainting benchmarks to\ndemonstrate its superior performance for versatile image inpainting. We release\nour codes and models on our project page: https://powerpaint.github.io/.\n","authors":["Junhao Zhuang","Yanhong Zeng","Wenran Liu","Chun Yuan","Kai Chen"],"pdf_url":"https://arxiv.org/pdf/2312.03594v2.pdf","comment":"Project page with code: https://powerpaint.github.io/"},{"id":"http://arxiv.org/abs/2312.04016v1","updated":"2023-12-07T03:10:03Z","published":"2023-12-07T03:10:03Z","title":"PartDistill: 3D Shape Part Segmentation by Vision-Language Model\n Distillation","summary":" This paper proposes a cross-modal distillation framework, PartDistill, which\ntransfers 2D knowledge from vision-language models (VLMs) to facilitate 3D\nshape part segmentation. PartDistill addresses three major challenges in this\ntask: the lack of 3D segmentation in invisible or undetected regions in the 2D\nprojections, inaccurate and inconsistent 2D predictions by VLMs, and the lack\nof knowledge accumulation across different 3D shapes. PartDistill consists of a\nteacher network that uses a VLM to make 2D predictions and a student network\nthat learns from the 2D predictions while extracting geometrical features from\nmultiple 3D shapes to carry out 3D part segmentation. 
A bi-directional\ndistillation, including forward and backward distillations, is carried out\nwithin the framework, where the former forward distills the 2D predictions to\nthe student network, and the latter improves the quality of the 2D predictions,\nwhich subsequently enhances the final 3D part segmentation. Moreover,\nPartDistill can exploit generative models that facilitate effortless 3D shape\ncreation for generating knowledge sources to be distilled. Through extensive\nexperiments, PartDistill boosts the existing methods with substantial margins\non widely used ShapeNetPart and PartE datasets, by more than 15% and 12% higher\nmIoU scores, respectively.\n","authors":["Ardian Umam","Cheng-Kun Yang","Min-Hung Chen","Jen-Hui Chuang","Yen-Yu Lin"],"pdf_url":"https://arxiv.org/pdf/2312.04016v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04008v1","updated":"2023-12-07T02:55:46Z","published":"2023-12-07T02:55:46Z","title":"Natural-language-driven Simulation Benchmark and Copilot for Efficient\n Production of Object Interactions in Virtual Road Scenes","summary":" We advocate the idea of the natural-language-driven(NLD) simulation to\nefficiently produce the object interactions between multiple objects in the\nvirtual road scenes, for teaching and testing the autonomous driving systems\nthat should take quick action to avoid collision with obstacles with\nunpredictable motions. The NLD simulation allows the brief natural-language\ndescription to control the object interactions, significantly reducing the\nhuman efforts for creating a large amount of interaction data. To facilitate\nthe research of NLD simulation, we collect the Language-to-Interaction(L2I)\nbenchmark dataset with 120,000 natural-language descriptions of object\ninteractions in 6 common types of road topologies. Each description is\nassociated with the programming code, which the graphic render can use to\nvisually reconstruct the object interactions in the virtual scenes. As a\nmethodology contribution, we design SimCopilot to translate the interaction\ndescriptions to the renderable code. We use the L2I dataset to evaluate\nSimCopilot's abilities to control the object motions, generate complex\ninteractions, and generalize interactions across road topologies. The L2I\ndataset and the evaluation results motivate the relevant research of the NLD\nsimulation.\n","authors":["Kairui Yang","Zihao Guo","Gengjie Lin","Haotian Dong","Die Zuo","Jibin Peng","Zhao Huang","Zhecheng Xu","Fupeng Li","Ziyun Bai","Di Lin"],"pdf_url":"https://arxiv.org/pdf/2312.04008v1.pdf","comment":"10 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.04005v1","updated":"2023-12-07T02:46:18Z","published":"2023-12-07T02:46:18Z","title":"KOALA: Self-Attention Matters in Knowledge Distillation of Latent\n Diffusion Models for Memory-Efficient and Fast Image Synthesis","summary":" Stable diffusion is the mainstay of the text-to-image (T2I) synthesis in the\ncommunity due to its generation performance and open-source nature. Recently,\nStable Diffusion XL (SDXL), the successor of stable diffusion, has received a\nlot of attention due to its significant performance improvements with a higher\nresolution of 1024x1024 and a larger model. However, its increased computation\ncost and model size require higher-end hardware(e.g., bigger VRAM GPU) for\nend-users, incurring higher costs of operation. 
To address this problem, in\nthis work, we propose an efficient latent diffusion model for text-to-image\nsynthesis obtained by distilling the knowledge of SDXL. To this end, we first\nperform an in-depth analysis of the denoising U-Net in SDXL, which is the main\nbottleneck of the model, and then design a more efficient U-Net based on the\nanalysis. Secondly, we explore how to effectively distill the generation\ncapability of SDXL into an efficient U-Net and eventually identify four\nessential factors, the core of which is that self-attention is the most\nimportant part. With our efficient U-Net and self-attention-based knowledge\ndistillation strategy, we build our efficient T2I models, called KOALA-1B &\n-700M, while reducing the model size up to 54% and 69% of the original SDXL\nmodel. In particular, the KOALA-700M is more than twice as fast as SDXL while\nstill retaining a decent generation quality. We hope that due to its balanced\nspeed-performance tradeoff, our KOALA models can serve as a cost-effective\nalternative to SDXL in resource-constrained environments.\n","authors":["Youngwan Lee","Kwanyong Park","Yoorhim Cho","Yong-Ju Lee","Sung Ju Hwang"],"pdf_url":"https://arxiv.org/pdf/2312.04005v1.pdf","comment":"Project page: https://youngwanlee.github.io/KOALA/"},{"id":"http://arxiv.org/abs/2304.10306v2","updated":"2023-12-07T02:43:03Z","published":"2023-04-20T13:40:49Z","title":"FIANCEE: Faster Inference of Adversarial Networks via Conditional Early\n Exits","summary":" Generative DNNs are a powerful tool for image synthesis, but they are limited\nby their computational load. On the other hand, given a trained model and a\ntask, e.g. faces generation within a range of characteristics, the output image\nquality will be unevenly distributed among images with different\ncharacteristics. It follows, that we might restrain the models complexity on\nsome instances, maintaining a high quality. We propose a method for diminishing\ncomputations by adding so-called early exit branches to the original\narchitecture, and dynamically switching the computational path depending on how\ndifficult it will be to render the output. We apply our method on two different\nSOTA models performing generative tasks: generation from a semantic map, and\ncross-reenactment of face expressions; showing it is able to output images with\ncustom lower-quality thresholds. For a threshold of LPIPS <=0.1, we diminish\ntheir computations by up to a half. This is especially relevant for real-time\napplications such as synthesis of faces, when quality loss needs to be\ncontained, but most of the inputs need fewer computations than the complex\ninstances.\n","authors":["Polina Karpikova","Radionova Ekaterina","Anastasia Yaschenko","Andrei Spiridonov","Leonid Kostyushko","Riccardo Fabbricatore","Aleksei Ivakhnenko"],"pdf_url":"https://arxiv.org/pdf/2304.10306v2.pdf","comment":"12 pages, 22 figures"},{"id":"http://arxiv.org/abs/2312.04000v1","updated":"2023-12-07T02:31:28Z","published":"2023-12-07T02:31:28Z","title":"LiDAR: Sensing Linear Probing Performance in Joint Embedding SSL\n Architectures","summary":" Joint embedding (JE) architectures have emerged as a promising avenue for\nacquiring transferable data representations. A key obstacle to using JE\nmethods, however, is the inherent challenge of evaluating learned\nrepresentations without access to a downstream task, and an annotated dataset.\nWithout efficient and reliable evaluation, it is difficult to iterate on\narchitectural and training choices for JE methods. 
In this paper, we introduce\nLiDAR (Linear Discriminant Analysis Rank), a metric designed to measure the\nquality of representations within JE architectures. Our metric addresses\nseveral shortcomings of recent approaches based on feature covariance rank by\ndiscriminating between informative and uninformative features. In essence,\nLiDAR quantifies the rank of the Linear Discriminant Analysis (LDA) matrix\nassociated with the surrogate SSL task -- a measure that intuitively captures\nthe information content as it pertains to solving the SSL task. We empirically\ndemonstrate that LiDAR significantly surpasses naive rank based approaches in\nits predictive power of optimal hyperparameters. Our proposed criterion\npresents a more robust and intuitive means of assessing the quality of\nrepresentations within JE architectures, which we hope facilitates broader\nadoption of these powerful techniques in various domains.\n","authors":["Vimal Thilak","Chen Huang","Omid Saremi","Laurent Dinh","Hanlin Goh","Preetum Nakkiran","Joshua M. Susskind","Etai Littwin"],"pdf_url":"https://arxiv.org/pdf/2312.04000v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2312.03996v1","updated":"2023-12-07T02:23:32Z","published":"2023-12-07T02:23:32Z","title":"Stable diffusion for Data Augmentation in COCO and Weed Datasets","summary":" Generative models have increasingly impacted relative tasks ranging from\nimage revision and object detection in computer vision to interior design and\nidea illustration in more general fields. Stable diffusion is an outstanding\nmodel series that paves the way for producing high-resolution images with\nthorough details from text prompts or reference images. It will be an\ninteresting topic about how to leverage the capability of stable diffusion to\nelevate the image variations of certain categories (e.g., vehicles, humans, and\ndaily objects); particularly, it has the potential to gain improvements for\nsmall datasets with image-sparse categories. This study utilized seven\ncategories in the popular COCO dataset and three widespread weed species in\nMichigan to evaluate the efficiency of a recent version of stable diffusion. In\ndetail, Stable diffusion was used to generate synthetic images belonging to\nthese classes; then, YOLOv8 models were trained based on these synthetic\nimages, whose performance was compared to the models trained on original\nimages. In addition, several techniques (e.g., Image-to-image translation,\nDreambooth, ControlNet) of Stable diffusion were leveraged for image generation\nwith different focuses. In spite of the overall results being disappointing,\npromising results have been achieved in some classes, illustrating the\npotential of stable diffusion models to improve the performance of detection\nmodels, which represent more helpful information being conveyed into the models\nby the generated images. This seminal study may expedite the adaption of stable\ndiffusion models to classification and detection tasks in different fields.\n","authors":["Boyang Deng","Yuzhen Lu"],"pdf_url":"https://arxiv.org/pdf/2312.03996v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03993v1","updated":"2023-12-07T02:21:31Z","published":"2023-12-07T02:21:31Z","title":"Style Transfer to Calvin and Hobbes comics using Stable Diffusion","summary":" This project report summarizes our journey to perform stable diffusion\nfine-tuning on a dataset containing Calvin and Hobbes comics. 
The purpose is to\nconvert any given input image into the comic style of Calvin and Hobbes,\nessentially performing style transfer. We train stable-diffusion-v1.5 using Low\nRank Adaptation (LoRA) to efficiently speed up the fine-tuning process. The\ndiffusion itself is handled by a Variational Autoencoder (VAE), which is a\nU-net. Our results were visually appealing for the amount of training time and\nthe quality of input data that went into training.\n","authors":["Sloke Shrestha","Sundar Sripada V. S.","Asvin Venkataramanan"],"pdf_url":"https://arxiv.org/pdf/2312.03993v1.pdf","comment":"Project report for ECE 371Q Digital Image Processing at UT Austin"},{"id":"http://arxiv.org/abs/2312.03698v2","updated":"2023-12-07T02:19:27Z","published":"2023-12-06T18:59:03Z","title":"Intrinsic Harmonization for Illumination-Aware Compositing","summary":" Despite significant advancements in network-based image harmonization\ntechniques, there still exists a domain disparity between typical training\npairs and real-world composites encountered during inference. Most existing\nmethods are trained to reverse global edits made on segmented image regions,\nwhich fail to accurately capture the lighting inconsistencies between the\nforeground and background found in composited images. In this work, we\nintroduce a self-supervised illumination harmonization approach formulated in\nthe intrinsic image domain. First, we estimate a simple global lighting model\nfrom mid-level vision representations to generate a rough shading for the\nforeground region. A network then refines this inferred shading to generate a\nharmonious re-shading that aligns with the background scene. In order to match\nthe color appearance of the foreground and background, we utilize ideas from\nprior harmonization approaches to perform parameterized image edits in the\nalbedo domain. To validate the effectiveness of our approach, we present\nresults from challenging real-world composites and conduct a user study to\nobjectively measure the enhanced realism achieved compared to state-of-the-art\nharmonization methods.\n","authors":["Chris Careaga","S. Mahdi H. Miangoleh","Yağız Aksoy"],"pdf_url":"https://arxiv.org/pdf/2312.03698v2.pdf","comment":"10 pages, 8 figures. Accepted to SIGGRAPH Asia 2023 (Conference\n Track). Project page: https://yaksoy.github.io/intrinsicCompositing/"},{"id":"http://arxiv.org/abs/2206.04979v4","updated":"2023-12-07T02:03:25Z","published":"2022-06-10T10:28:55Z","title":"Convolutional layers are equivariant to discrete shifts but not\n continuous translations","summary":" The purpose of this short and simple note is to clarify a common\nmisconception about convolutional neural networks (CNNs). CNNs are made up of\nconvolutional layers which are shift equivariant due to weight sharing.\nHowever, convolutional layers are not translation equivariant, even when\nboundary effects are ignored and when pooling and subsampling are absent. This\nis because shift equivariance is a discrete symmetry while translation\nequivariance is a continuous symmetry. This fact is well known among\nresearchers in equivariant machine learning, but is usually overlooked among\nnon-experts. 
To minimize confusion, we suggest using the term `shift\nequivariance' to refer to discrete shifts in pixels and `translation\nequivariance' to refer to continuous translations.\n","authors":["Nick McGreivy","Ammar Hakim"],"pdf_url":"https://arxiv.org/pdf/2206.04979v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03406v2","updated":"2023-12-07T01:24:54Z","published":"2023-12-06T10:42:40Z","title":"SVQ: Sparse Vector Quantization for Spatiotemporal Forecasting","summary":" Spatiotemporal forecasting tasks, such as weather forecasting and traffic\nprediction, offer significant societal benefits. These tasks can be effectively\napproached as image forecasting problems using computer vision models. Vector\nquantization (VQ) is a well-known method for discrete representation that\nimproves the latent space, leading to enhanced generalization and transfer\nlearning capabilities. One of the main challenges in using VQ for\nspatiotemporal forecasting is how to balance between keeping enough details and\nremoving noises from the original patterns for better generalization. We\naddress this challenge by developing sparse vector quantization, or {\\bf SVQ}\nfor short, that leverages sparse regression to make better trade-off between\nthe two objectives. The main innovation of this work is to approximate sparse\nregression by a two-layer MLP and a randomly fixed or learnable matrix,\ndramatically improving its computational efficiency. Through experiments\nconducted on diverse datasets in multiple fields including weather forecasting,\ntraffic flow prediction, and video forecasting, we unequivocally demonstrate\nthat our proposed method consistently enhances the performance of base models\nand achieves state-of-the-art results across all benchmarks.\n","authors":["Chao Chen","Tian Zhou","Yanjun Zhao","Hui Liu","Liang Sun","Rong Jin"],"pdf_url":"https://arxiv.org/pdf/2312.03406v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03970v1","updated":"2023-12-07T01:01:45Z","published":"2023-12-07T01:01:45Z","title":"Improving Medical Report Generation with Adapter Tuning and Knowledge\n Enhancement in Vision-Language Foundation Models","summary":" Medical report generation demands automatic creation of coherent and precise\ndescriptions for medical images. However, the scarcity of labelled medical\nimage-report pairs poses formidable challenges in developing large-scale neural\nnetworks capable of harnessing the potential of artificial intelligence,\nexemplified by large language models. This study builds upon the\nstate-of-the-art vision-language pre-training and fine-tuning approach, BLIP-2,\nto customize general large-scale foundation models. Integrating adapter tuning\nand a medical knowledge enhancement loss, our model significantly improves\naccuracy and coherence. Validation on the dataset of ImageCLEFmedical 2023\ndemonstrates our model's prowess, achieving the best-averaged results against\nseveral state-of-the-art methods. 
Significant improvements in ROUGE and CIDEr\nunderscore our method's efficacy, highlighting promising outcomes for the rapid\nmedical-domain adaptation of the vision-language foundation models in\naddressing challenges posed by data scarcity.\n","authors":["Shibin Wu","Bang Yang","Zhiyu Ye","Haoqian Wang","Hairong Zheng","Tong Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.03970v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.03518v2","updated":"2023-12-07T00:45:21Z","published":"2023-11-06T20:51:37Z","title":"High-resolution power equipment recognition based on improved\n self-attention","summary":" The current trend of automating inspections at substations has sparked a\nsurge in interest in the field of transformer image recognition. However, due\nto restrictions in the number of parameters in existing models, high-resolution\nimages can't be directly applied, leaving significant room for enhancing\nrecognition accuracy. Addressing this challenge, the paper introduces a novel\nimprovement on deep self-attention networks tailored for this issue. The\nproposed model comprises four key components: a foundational network, a region\nproposal network, a module for extracting and segmenting target areas, and a\nfinal prediction network. The innovative approach of this paper differentiates\nitself by decoupling the processes of part localization and recognition,\ninitially using low-resolution images for localization followed by\nhigh-resolution images for recognition. Moreover, the deep self-attention\nnetwork's prediction mechanism uniquely incorporates the semantic context of\nimages, resulting in substantially improved recognition performance.\nComparative experiments validate that this method outperforms the two other\nprevalent target recognition models, offering a groundbreaking perspective for\nautomating electrical equipment inspections.\n","authors":["Siyi Zhang","Cheng Liu","Xiang Li","Xin Zhai","Zhen Wei","Sizhe Li","Xun Ma"],"pdf_url":"https://arxiv.org/pdf/2311.03518v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.07956v2","updated":"2023-12-07T00:17:41Z","published":"2023-11-14T07:20:46Z","title":"Robust Learning Based Condition Diagnosis Method for Distribution\n Network Switchgear","summary":" This paper introduces a robust, learning-based method for diagnosing the\nstate of distribution network switchgear, which is crucial for maintaining the\npower quality for end users. Traditional diagnostic models often rely heavily\non expert knowledge and lack robustness. To address this, our method\nincorporates an expanded feature vector that includes environmental data,\ntemperature readings, switch position, motor operation, insulation conditions,\nand local discharge information. We tackle the issue of high dimensionality\nthrough feature mapping. The method introduces a decision radius to categorize\nunlabeled samples and updates the model parameters using a combination of\nsupervised and unsupervised loss, along with a consistency regularization\nfunction. This approach ensures robust learning even with a limited number of\nlabeled samples. 
Comparative analysis demonstrates that this method\nsignificantly outperforms existing models in both accuracy and robustness.\n","authors":["Wenxi Zhang","Zhe Li","Weixi Li","Weisi Ma","Xinyi Chen","Sizhe Li"],"pdf_url":"https://arxiv.org/pdf/2311.07956v2.pdf","comment":null}],"Information Retrieval":[{"id":"http://arxiv.org/abs/2312.04264v1","updated":"2023-12-07T12:42:40Z","published":"2023-12-07T12:42:40Z","title":"Multi-agricultural Machinery Collaborative Task Assignment Based on\n Improved Genetic Hybrid Optimization Algorithm","summary":" To address the challenges of delayed scheduling information, heavy reliance\non manual labour, and low operational efficiency in traditional large-scale\nagricultural machinery operations, this study proposes a method for\nmulti-agricultural machinery collaborative task assignment based on an improved\ngenetic hybrid optimisation algorithm. The proposed method establishes a\nmulti-agricultural machinery task allocation model by combining the path\npre-planning of a simulated annealing algorithm and the static task allocation\nof a genetic algorithm. By sequentially fusing these two algorithms, their\nrespective shortcomings can be overcome, and their advantages in global and\nlocal search can be utilised. Consequently, the search capability of the\npopulation is enhanced, leading to the discovery of more optimal solutions.\nThen, an adaptive crossover operator is constructed according to the task\nassignment model, considering the capacity, path cost, and time of agricultural\nmachinery; two-segment coding and multi-population adaptive mutation are used\nto assign tasks to improve the diversity of the population and enhance the\nexploration ability of the population; and to improve the global optimisation\nability of the hybrid algorithm, a 2-Opt local optimisation operator and an\nCircle modification algorithm are introduced. Finally, simulation experiments\nwere conducted in MATLAB to evaluate the performance of the multi-agricultural\nmachinery collaborative task assignment based on the improved genetic hybrid\nalgorithm. The algorithm's capabilities were assessed through comparative\nanalysis in the simulation trials. The results demonstrate that the developed\nhybrid algorithm can effectively reduce path costs, and the efficiency of the\nassignment outcomes surpasses that of the classical genetic algorithm. This\napproach proves particularly suitable for addressing large-scale task\nallocation problems.\n","authors":["Haohao Du"],"pdf_url":"https://arxiv.org/pdf/2312.04264v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04071v1","updated":"2023-12-07T06:29:26Z","published":"2023-12-07T06:29:26Z","title":"Synergistic Signals: Exploiting Co-Engagement and Semantic Links via\n Graph Neural Networks","summary":" Given a set of candidate entities (e.g. movie titles), the ability to\nidentify similar entities is a core capability of many recommender systems.\nMost often this is achieved by collaborative filtering approaches, i.e. if\nusers co-engage with a pair of entities frequently enough, the embeddings\nshould be similar. However, relying on co-engagement data alone can result in\nlower-quality embeddings for new and unpopular entities. We study this problem\nin the context recommender systems at Netflix. We observe that there is\nabundant semantic information such as genre, content maturity level, themes,\netc. that complements co-engagement signals and provides interpretability in\nsimilarity models. 
To learn entity similarities from both data sources\nholistically, we propose a novel graph-based approach called SemanticGNN.\nSemanticGNN models entities, semantic concepts, collaborative edges, and\nsemantic edges within a large-scale knowledge graph and conducts representation\nlearning over it. Our key technical contributions are twofold: (1) we develop a\nnovel relation-aware attention graph neural network (GNN) to handle the\nimbalanced distribution of relation types in our graph; (2) to handle web-scale\ngraph data that has millions of nodes and billions of edges, we develop a novel\ndistributed graph training paradigm. The proposed model is successfully\ndeployed within Netflix and empirical experiments indicate it yields up to 35%\nimprovement in performance on similarity judgment tasks.\n","authors":["Zijie Huang","Baolin Li","Hafez Asgharzadeh","Anne Cocos","Lingyi Liu","Evan Cox","Colby Wise","Sudarshan Lamkhede"],"pdf_url":"https://arxiv.org/pdf/2312.04071v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2308.02205v2","updated":"2023-12-07T02:15:27Z","published":"2023-08-04T08:45:02Z","title":"GEMRec: Towards Generative Model Recommendation","summary":" Recommender Systems are built to retrieve relevant items to satisfy users'\ninformation needs. The candidate corpus usually consists of a finite set of\nitems that are ready to be served, such as videos, products, or articles. With\nrecent advances in Generative AI such as GPT and Diffusion models, a new form\nof recommendation task is yet to be explored where items are to be created by\ngenerative models with personalized prompts. Taking image generation as an\nexample, with a single prompt from the user and access to a generative model,\nit is possible to generate hundreds of new images in a few minutes. How shall\nwe attain personalization in the presence of \"infinite\" items? In this\npreliminary study, we propose a two-stage framework, namely Prompt-Model\nRetrieval and Generated Item Ranking, to approach this new task formulation. We\nrelease GEMRec-18K, a prompt-model interaction dataset with 18K images\ngenerated by 200 publicly-available generative models paired with a diverse set\nof 90 textual prompts. Our findings demonstrate the promise of generative model\nrecommendation as a novel personalization problem and the limitations of\nexisting evaluation metrics. We highlight future directions for the RecSys\ncommunity to advance towards generative recommender systems. Our code and\ndataset are available at https://github.com/MAPS-research/GEMRec.\n","authors":["Yuanhe Guo","Haoming Liu","Hongyi Wen"],"pdf_url":"https://arxiv.org/pdf/2308.02205v2.pdf","comment":"Accepted by WSDM 2024 (Demo Track)"},{"id":"http://arxiv.org/abs/2202.09517v2","updated":"2023-12-07T01:07:13Z","published":"2022-02-19T03:48:20Z","title":"Deep Learning for Hate Speech Detection: A Comparative Study","summary":" Automated hate speech detection is an important tool in combating the spread\nof hate speech, particularly in social media. Numerous methods have been\ndeveloped for the task, including a recent proliferation of deep-learning based\napproaches. A variety of datasets have also been developed, exemplifying\nvarious manifestations of the hate-speech detection problem. We present here a\nlarge-scale empirical comparison of deep and shallow hate-speech detection\nmethods, mediated through the three most commonly used datasets. 
Our goal is to\nilluminate progress in the area, and identify strengths and weaknesses in the\ncurrent state-of-the-art. We particularly focus our analysis on measures of\npractical performance, including detection accuracy, computational efficiency,\ncapability in using pre-trained models, and domain generalization. In doing so\nwe aim to provide guidance as to the use of hate-speech detection in practice,\nquantify the state-of-the-art, and identify future research directions. Code\nand dataset are available at\nhttps://github.com/jmjmalik22/Hate-Speech-Detection.\n","authors":["Jitendra Singh Malik","Hezhe Qiao","Guansong Pang","Anton van den Hengel"],"pdf_url":"https://arxiv.org/pdf/2202.09517v2.pdf","comment":"18 pages, 4 figures, and 6 tables"}],"Machine Learning":[{"id":"http://arxiv.org/abs/2312.04556v1","updated":"2023-12-07T18:59:29Z","published":"2023-12-07T18:59:29Z","title":"Large Language Models for Mathematicians","summary":" Large language models (LLMs) such as ChatGPT have received immense interest\nfor their general-purpose language understanding and, in particular, their\nability to generate high-quality text or computer code. For many professions,\nLLMs represent an invaluable tool that can speed up and improve the quality of\nwork. In this note, we discuss to what extent they can aid professional\nmathematicians. We first provide a mathematical description of the transformer\nmodel used in all modern language models. Based on recent studies, we then\noutline best practices and potential issues and report on the mathematical\nabilities of language models. Finally, we shed light on the potential of LMMs\nto change how mathematicians work.\n","authors":["Simon Frieder","Julius Berner","Philipp Petersen","Thomas Lukasiewicz"],"pdf_url":"https://arxiv.org/pdf/2312.04556v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04554v1","updated":"2023-12-07T18:59:22Z","published":"2023-12-07T18:59:22Z","title":"Improved Visual Grounding through Self-Consistent Explanations","summary":" Vision-and-language models trained to match images with text can be combined\nwith visual explanation methods to point to the locations of specific objects\nin an image. Our work shows that the localization --\"grounding\"-- abilities of\nthese models can be further improved by finetuning for self-consistent visual\nexplanations. We propose a strategy for augmenting existing text-image datasets\nwith paraphrases using a large language model, and SelfEQ, a weakly-supervised\nstrategy on visual explanation maps for paraphrases that encourages\nself-consistency. Specifically, for an input textual phrase, we attempt to\ngenerate a paraphrase and finetune the model so that the phrase and paraphrase\nmap to the same region in the image. We posit that this both expands the\nvocabulary that the model is able to handle, and improves the quality of the\nobject locations highlighted by gradient-based visual explanation methods (e.g.\nGradCAM). We demonstrate that SelfEQ improves performance on Flickr30k,\nReferIt, and RefCOCO+ over a strong baseline method and several prior works.\nParticularly, comparing to other methods that do not use any type of box\nannotations, we obtain 84.07% on Flickr30k (an absolute improvement of 4.69%),\n67.40% on ReferIt (an absolute improvement of 7.68%), and 75.10%, 55.49% on\nRefCOCO+ test sets A and B respectively (an absolute improvement of 3.74% on\naverage).\n","authors":["Ruozhen He","Paola Cascante-Bonilla","Ziyan Yang","Alexander C. 
Berg","Vicente Ordonez"],"pdf_url":"https://arxiv.org/pdf/2312.04554v1.pdf","comment":"Project Page: https://catherine-r-he.github.io/SelfEQ/"},{"id":"http://arxiv.org/abs/2312.04552v1","updated":"2023-12-07T18:59:20Z","published":"2023-12-07T18:59:20Z","title":"Generating Illustrated Instructions","summary":" We introduce the new task of generating Illustrated Instructions, i.e.,\nvisual instructions customized to a user's needs. We identify desiderata unique\nto this task, and formalize it through a suite of automatic and human\nevaluation metrics, designed to measure the validity, consistency, and efficacy\nof the generations. We combine the power of large language models (LLMs)\ntogether with strong text-to-image generation diffusion models to propose a\nsimple approach called StackedDiffusion, which generates such illustrated\ninstructions given text as input. The resulting model strongly outperforms\nbaseline approaches and state-of-the-art multimodal LLMs; and in 30% of cases,\nusers even prefer it to human-generated articles. Most notably, it enables\nvarious new and exciting applications far beyond what static articles on the\nweb can provide, such as personalized instructions complete with intermediate\nsteps and pictures in response to a user's individual situation.\n","authors":["Sachit Menon","Ishan Misra","Rohit Girdhar"],"pdf_url":"https://arxiv.org/pdf/2312.04552v1.pdf","comment":"Project website:\n http://facebookresearch.github.io/IllustratedInstructions"},{"id":"http://arxiv.org/abs/2312.04548v1","updated":"2023-12-07T18:59:14Z","published":"2023-12-07T18:59:14Z","title":"Multiview Aerial Visual Recognition (MAVREC): Can Multi-view Improve\n Aerial Visual Perception?","summary":" Despite the commercial abundance of UAVs, aerial data acquisition remains\nchallenging, and the existing Asia and North America-centric open-source UAV\ndatasets are small-scale or low-resolution and lack diversity in scene\ncontextuality. Additionally, the color content of the scenes, solar-zenith\nangle, and population density of different geographies influence the data\ndiversity. These two factors conjointly render suboptimal aerial-visual\nperception of the deep neural network (DNN) models trained primarily on the\nground-view data, including the open-world foundational models.\n To pave the way for a transformative era of aerial detection, we present\nMultiview Aerial Visual RECognition or MAVREC, a video dataset where we record\nsynchronized scenes from different perspectives -- ground camera and\ndrone-mounted camera. MAVREC consists of around 2.5 hours of industry-standard\n2.7K resolution video sequences, more than 0.5 million frames, and 1.1 million\nannotated bounding boxes. This makes MAVREC the largest ground and aerial-view\ndataset, and the fourth largest among all drone-based datasets across all\nmodalities and tasks. Through our extensive benchmarking on MAVREC, we\nrecognize that augmenting object detectors with ground-view images from the\ncorresponding geographical location is a superior pre-training strategy for\naerial detection. Building on this strategy, we benchmark MAVREC with a\ncurriculum-based semi-supervised object detection approach that leverages\nlabeled (ground and aerial) and unlabeled (only aerial) images to enhance the\naerial detection. 
We publicly release the MAVREC dataset:\nhttps://mavrec.github.io.\n","authors":["Aritra Dutta","Srijan Das","Jacob Nielsen","Rajatsubhra Chakraborty","Mubarak Shah"],"pdf_url":"https://arxiv.org/pdf/2312.04548v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04549v1","updated":"2023-12-07T18:59:14Z","published":"2023-12-07T18:59:14Z","title":"PlayFusion: Skill Acquisition via Diffusion from Language-Annotated Play","summary":" Learning from unstructured and uncurated data has become the dominant\nparadigm for generative approaches in language and vision. Such unstructured\nand unguided behavior data, commonly known as play, is also easier to collect\nin robotics but much more difficult to learn from due to its inherently\nmultimodal, noisy, and suboptimal nature. In this paper, we study this problem\nof learning goal-directed skill policies from unstructured play data which is\nlabeled with language in hindsight. Specifically, we leverage advances in\ndiffusion models to learn a multi-task diffusion model to extract robotic\nskills from play data. Using a conditional denoising diffusion process in the\nspace of states and actions, we can gracefully handle the complexity and\nmultimodality of play data and generate diverse and interesting robot\nbehaviors. To make diffusion models more useful for skill learning, we\nencourage robotic agents to acquire a vocabulary of skills by introducing\ndiscrete bottlenecks into the conditional behavior generation process. In our\nexperiments, we demonstrate the effectiveness of our approach across a wide\nvariety of environments in both simulation and the real world. Results\nvisualizations and videos at https://play-fusion.github.io\n","authors":["Lili Chen","Shikhar Bahl","Deepak Pathak"],"pdf_url":"https://arxiv.org/pdf/2312.04549v1.pdf","comment":"In CoRL 2023. Website at https://play-fusion.github.io"},{"id":"http://arxiv.org/abs/2312.04546v1","updated":"2023-12-07T18:58:40Z","published":"2023-12-07T18:58:40Z","title":"Adversarial Learning for Feature Shift Detection and Correction","summary":" Data shift is a phenomenon present in many real-world applications, and while\nthere are multiple methods attempting to detect shifts, the task of localizing\nand correcting the features originating such shifts has not been studied in\ndepth. Feature shifts can occur in many datasets, including in multi-sensor\ndata, where some sensors are malfunctioning, or in tabular and structured data,\nincluding biomedical, financial, and survey data, where faulty standardization\nand data processing pipelines can lead to erroneous features. In this work, we\nexplore using the principles of adversarial learning, where the information\nfrom several discriminators trained to distinguish between two distributions is\nused to both detect the corrupted features and fix them in order to remove the\ndistribution shift between datasets. We show that mainstream supervised\nclassifiers, such as random forest or gradient boosting trees, combined with\nsimple iterative heuristics, can localize and correct feature shifts,\noutperforming current statistical and neural network-based techniques. The code\nis available at https://github.com/AI-sandbox/DataFix.\n","authors":["Miriam Barrabes","Daniel Mas Montserrat","Margarita Geleta","Xavier Giro-i-Nieto","Alexander G. 
Ioannidis"],"pdf_url":"https://arxiv.org/pdf/2312.04546v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04542v1","updated":"2023-12-07T18:57:36Z","published":"2023-12-07T18:57:36Z","title":"SoK: Unintended Interactions among Machine Learning Defenses and Risks","summary":" Machine learning (ML) models cannot neglect risks to security, privacy, and\nfairness. Several defenses have been proposed to mitigate such risks. When a\ndefense is effective in mitigating one risk, it may correspond to increased or\ndecreased susceptibility to other risks. Existing research lacks an effective\nframework to recognize and explain these unintended interactions. We present\nsuch a framework, based on the conjecture that overfitting and memorization\nunderlie unintended interactions. We survey existing literature on unintended\ninteractions, accommodating them within our framework. We use our framework to\nconjecture on two previously unexplored interactions, and empirically validate\nour conjectures.\n","authors":["Vasisht Duddu","Sebastian Szyller","N. Asokan"],"pdf_url":"https://arxiv.org/pdf/2312.04542v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04540v1","updated":"2023-12-07T18:57:03Z","published":"2023-12-07T18:57:03Z","title":"Sim-to-Real Causal Transfer: A Metric Learning Approach to\n Causally-Aware Interaction Representations","summary":" Modeling spatial-temporal interactions among neighboring agents is at the\nheart of multi-agent problems such as motion forecasting and crowd navigation.\nDespite notable progress, it remains unclear to which extent modern\nrepresentations can capture the causal relationships behind agent interactions.\nIn this work, we take an in-depth look at the causal awareness of these\nrepresentations, from computational formalism to real-world practice. First, we\ncast doubt on the notion of non-causal robustness studied in the recent\nCausalAgents benchmark. We show that recent representations are already\npartially resilient to perturbations of non-causal agents, and yet modeling\nindirect causal effects involving mediator agents remains challenging. To\naddress this challenge, we introduce a metric learning approach that\nregularizes latent representations with causal annotations. Our controlled\nexperiments show that this approach not only leads to higher degrees of causal\nawareness but also yields stronger out-of-distribution robustness. To further\noperationalize it in practice, we propose a sim-to-real causal transfer method\nvia cross-domain multi-task learning. Experiments on pedestrian datasets show\nthat our method can substantially boost generalization, even in the absence of\nreal-world causal annotations. We hope our work provides a new perspective on\nthe challenges and potential pathways towards causally-aware representations of\nmulti-agent interactions. Our code is available at\nhttps://github.com/socialcausality.\n","authors":["Yuejiang Liu","Ahmad Rahimi","Po-Chien Luan","Frano Rajič","Alexandre Alahi"],"pdf_url":"https://arxiv.org/pdf/2312.04540v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2312.04535v1","updated":"2023-12-07T18:53:27Z","published":"2023-12-07T18:53:27Z","title":"Trajeglish: Learning the Language of Driving Scenarios","summary":" A longstanding challenge for self-driving development is simulating dynamic\ndriving scenarios seeded from recorded driving logs. 
In pursuit of this\nfunctionality, we apply tools from discrete sequence modeling to model how\nvehicles, pedestrians and cyclists interact in driving scenarios. Using a\nsimple data-driven tokenization scheme, we discretize trajectories to\ncentimeter-level resolution using a small vocabulary. We then model the\nmulti-agent sequence of motion tokens with a GPT-like encoder-decoder that is\nautoregressive in time and takes into account intra-timestep interaction\nbetween agents. Scenarios sampled from our model exhibit state-of-the-art\nrealism; our model tops the Waymo Sim Agents Benchmark, surpassing prior work\nalong the realism meta metric by 3.3% and along the interaction metric by 9.9%.\nWe ablate our modeling choices in full autonomy and partial autonomy settings,\nand show that the representations learned by our model can quickly be adapted\nto improve performance on nuScenes. We additionally evaluate the scalability of\nour model with respect to parameter count and dataset size, and use density\nestimates from our model to quantify the saliency of context length and\nintra-timestep interaction for the traffic modeling task.\n","authors":["Jonah Philion","Xue Bin Peng","Sanja Fidler"],"pdf_url":"https://arxiv.org/pdf/2312.04535v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2312.04533v1","updated":"2023-12-07T18:51:19Z","published":"2023-12-07T18:51:19Z","title":"Dream2Real: Zero-Shot 3D Object Rearrangement with Vision-Language\n Models","summary":" We introduce Dream2Real, a robotics framework which integrates\nvision-language models (VLMs) trained on 2D data into a 3D object rearrangement\npipeline. This is achieved by the robot autonomously constructing a 3D\nrepresentation of the scene, where objects can be rearranged virtually and an\nimage of the resulting arrangement rendered. These renders are evaluated by a\nVLM, so that the arrangement which best satisfies the user instruction is\nselected and recreated in the real world with pick-and-place. This enables\nlanguage-conditioned rearrangement to be performed zero-shot, without needing\nto collect a training dataset of example arrangements. Results on a series of\nreal-world tasks show that this framework is robust to distractors,\ncontrollable by language, capable of understanding complex multi-object\nrelations, and readily applicable to both tabletop and 6-DoF rearrangement\ntasks.\n","authors":["Ivan Kapelyukh","Yifei Ren","Ignacio Alzugaray","Edward Johns"],"pdf_url":"https://arxiv.org/pdf/2312.04533v1.pdf","comment":"Project webpage with videos: https://www.robot-learning.uk/dream2real"},{"id":"http://arxiv.org/abs/2312.01530v2","updated":"2023-12-07T18:47:53Z","published":"2023-12-03T23:08:29Z","title":"Evaluation of Active Feature Acquisition Methods for Time-varying\n Feature Settings","summary":" Machine learning methods often assume input features are available at no\ncost. However, in domains like healthcare, where acquiring features could be\nexpensive or harmful, it is necessary to balance a feature's acquisition cost\nagainst its predictive value. The task of training an AI agent to decide which\nfeatures to acquire is called active feature acquisition (AFA). By deploying an\nAFA agent, we effectively alter the acquisition strategy and trigger a\ndistribution shift. To safely deploy AFA agents under this distribution shift,\nwe present the problem of active feature acquisition performance evaluation\n(AFAPE). 
We examine AFAPE under i) a no direct effect (NDE) assumption, stating\nthat acquisitions don't affect the underlying feature values; and ii) a no\nunobserved confounding (NUC) assumption, stating that retrospective feature\nacquisition decisions were only based on observed features. We show that one\ncan apply offline reinforcement learning under the NUC assumption and missing\ndata methods under the NDE assumption. When NUC and NDE hold, we propose a\nnovel semi-offline reinforcement learning framework, which requires a weaker\npositivity assumption and yields more data-efficient estimators. We introduce\nthree novel estimators: a direct method (DM), an inverse probability weighting\n(IPW), and a double reinforcement learning (DRL) estimator.\n","authors":["Henrik von Kleist","Alireza Zamanian","Ilya Shpitser","Narges Ahmidi"],"pdf_url":"https://arxiv.org/pdf/2312.01530v2.pdf","comment":"43 pages, 3 tables, 8 Figures"},{"id":"http://arxiv.org/abs/2312.04528v1","updated":"2023-12-07T18:46:50Z","published":"2023-12-07T18:46:50Z","title":"Using Large Language Models for Hyperparameter Optimization","summary":" This paper studies using foundational large language models (LLMs) to make\ndecisions during hyperparameter optimization (HPO). Empirical evaluations\ndemonstrate that in settings with constrained search budgets, LLMs can perform\ncomparably or better than traditional HPO methods like random search and\nBayesian optimization on standard benchmarks. Furthermore, we propose to treat\nthe code specifying our model as a hyperparameter, which the LLM outputs, going\nbeyond the capabilities of existing HPO approaches. Our findings suggest that\nLLMs are a promising tool for improving efficiency in the traditional\ndecision-making problem of hyperparameter optimization.\n","authors":["Michael R. Zhang","Nishkrit Desai","Juhan Bae","Jonathan Lorraine","Jimmy Ba"],"pdf_url":"https://arxiv.org/pdf/2312.04528v1.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2312.03619v2","updated":"2023-12-07T18:45:10Z","published":"2023-12-06T17:07:42Z","title":"Evaluation of Active Feature Acquisition Methods for Static Feature\n Settings","summary":" Active feature acquisition (AFA) agents, crucial in domains like healthcare\nwhere acquiring features is often costly or harmful, determine the optimal set\nof features for a subsequent classification task. As deploying an AFA agent\nintroduces a shift in missingness distribution, it's vital to assess its\nexpected performance at deployment using retrospective data. In a companion\npaper, we introduce a semi-offline reinforcement learning (RL) framework for\nactive feature acquisition performance evaluation (AFAPE) where features are\nassumed to be time-dependent. Here, we study and extend the AFAPE problem to\ncover static feature settings, where features are time-invariant, and hence\nprovide more flexibility to the AFA agents in deciding the order of the\nacquisitions. In this static feature setting, we derive and adapt new inverse\nprobability weighting (IPW), direct method (DM), and double reinforcement\nlearning (DRL) estimators within the semi-offline RL framework. These\nestimators can be applied when the missingness in the retrospective dataset\nfollows a missing-at-random (MAR) pattern. They also can be applied to\nmissing-not-at-random (MNAR) patterns in conjunction with appropriate existing\nmissing data techniques. 
We illustrate the improved data efficiency offered by\nthe semi-offline RL estimators in synthetic and real-world data experiments\nunder synthetic MAR and MNAR missingness.\n","authors":["Henrik von Kleist","Alireza Zamanian","Ilya Shpitser","Narges Ahmidi"],"pdf_url":"https://arxiv.org/pdf/2312.03619v2.pdf","comment":"38 pages, 7 figures, 5 tables"},{"id":"http://arxiv.org/abs/2305.03514v2","updated":"2023-12-07T18:37:55Z","published":"2023-04-12T17:33:28Z","title":"Can Large Language Models Transform Computational Social Science?","summary":" Large Language Models (LLMs) are capable of successfully performing many\nlanguage processing tasks zero-shot (without training data). If zero-shot LLMs\ncan also reliably classify and explain social phenomena like persuasiveness and\npolitical ideology, then LLMs could augment the Computational Social Science\n(CSS) pipeline in important ways. This work provides a road map for using LLMs\nas CSS tools. Towards this end, we contribute a set of prompting best practices\nand an extensive evaluation pipeline to measure the zero-shot performance of 13\nlanguage models on 25 representative English CSS benchmarks. On taxonomic\nlabeling tasks (classification), LLMs fail to outperform the best fine-tuned\nmodels but still achieve fair levels of agreement with humans. On free-form\ncoding tasks (generation), LLMs produce explanations that often exceed the\nquality of crowdworkers' gold references. We conclude that the performance of\ntoday's LLMs can augment the CSS research pipeline in two ways: (1) serving as\nzero-shot data annotators on human annotation teams, and (2) bootstrapping\nchallenging creative generation tasks (e.g., explaining the underlying\nattributes of a text). In summary, LLMs are posed to meaningfully participate\nin} social science analysis in partnership with humans.\n","authors":["Caleb Ziems","William Held","Omar Shaikh","Jiaao Chen","Zhehao Zhang","Diyi Yang"],"pdf_url":"https://arxiv.org/pdf/2305.03514v2.pdf","comment":"To appear in \"Computational Linguistics\" (CL)"},{"id":"http://arxiv.org/abs/2306.06503v2","updated":"2023-12-07T18:36:31Z","published":"2023-06-10T18:41:50Z","title":"Preserving privacy in domain transfer of medical AI models comes at no\n performance costs: The integral role of differential privacy","summary":" Developing robust and effective artificial intelligence (AI) models in\nmedicine requires access to large amounts of patient data. The use of AI models\nsolely trained on large multi-institutional datasets can help with this, yet\nthe imperative to ensure data privacy remains, particularly as membership\ninference risks breaching patient confidentiality. As a proposed remedy, we\nadvocate for the integration of differential privacy (DP). We specifically\ninvestigate the performance of models trained with DP as compared to models\ntrained without DP on data from institutions that the model had not seen during\nits training (i.e., external validation) - the situation that is reflective of\nthe clinical use of AI models. By leveraging more than 590,000 chest\nradiographs from five institutions, we evaluated the efficacy of DP-enhanced\ndomain transfer (DP-DT) in diagnosing cardiomegaly, pleural effusion,\npneumonia, atelectasis, and in identifying healthy subjects. We juxtaposed\nDP-DT with non-DP-DT and examined diagnostic accuracy and demographic fairness\nusing the area under the receiver operating characteristic curve (AUC) as the\nmain metric, as well as accuracy, sensitivity, and specificity. 
Our results\nshow that DP-DT, even with exceptionally high privacy levels (epsilon around\n1), performs comparably to non-DP-DT (P>0.119 across all domains). Furthermore,\nDP-DT led to marginal AUC differences - less than 1% - for nearly all\nsubgroups, relative to non-DP-DT. Despite consistent evidence suggesting that\nDP models induce significant performance degradation for on-domain\napplications, we show that off-domain performance is almost not affected.\nTherefore, we ardently advocate for the adoption of DP in training diagnostic\nmedical AI models, given its minimal impact on performance.\n","authors":["Soroosh Tayebi Arasteh","Mahshad Lotfinia","Teresa Nolte","Marwin Saehn","Peter Isfort","Christiane Kuhl","Sven Nebelung","Georgios Kaissis","Daniel Truhn"],"pdf_url":"https://arxiv.org/pdf/2306.06503v2.pdf","comment":"Published in Radiology: Artificial Intelligence. RSNA"},{"id":"http://arxiv.org/abs/2312.04510v1","updated":"2023-12-07T18:30:15Z","published":"2023-12-07T18:30:15Z","title":"A Block Metropolis-Hastings Sampler for Controllable Energy-based Text\n Generation","summary":" Recent work has shown that energy-based language modeling is an effective\nframework for controllable text generation because it enables flexible\nintegration of arbitrary discriminators. However, because energy-based LMs are\nglobally normalized, approximate techniques like Metropolis-Hastings (MH) are\nrequired for inference. Past work has largely explored simple proposal\ndistributions that modify a single token at a time, like in Gibbs sampling. In\nthis paper, we develop a novel MH sampler that, in contrast, proposes re-writes\nof the entire sequence in each step via iterative prompting of a large language\nmodel. Our new sampler (a) allows for more efficient and accurate sampling from\na target distribution and (b) allows generation length to be determined through\nthe sampling procedure rather than fixed in advance, as past work has required.\nWe perform experiments on two controlled generation tasks, showing both\ndownstream performance gains and more accurate target distribution sampling in\ncomparison with single-token proposal techniques.\n","authors":["Jarad Forristal","Niloofar Mireshghallah","Greg Durrett","Taylor Berg-Kirkpatrick"],"pdf_url":"https://arxiv.org/pdf/2312.04510v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.00054v2","updated":"2023-12-07T18:27:33Z","published":"2023-04-28T19:05:16Z","title":"LAVA: Data Valuation without Pre-Specified Learning Algorithms","summary":" Traditionally, data valuation (DV) is posed as a problem of equitably\nsplitting the validation performance of a learning algorithm among the training\ndata. As a result, the calculated data values depend on many design choices of\nthe underlying learning algorithm. However, this dependence is undesirable for\nmany DV use cases, such as setting priorities over different data sources in a\ndata acquisition process and informing pricing mechanisms in a data\nmarketplace. In these scenarios, data needs to be valued before the actual\nanalysis and the choice of the learning algorithm is still undetermined then.\nAnother side-effect of the dependence is that to assess the value of individual\npoints, one needs to re-run the learning algorithm with and without a point,\nwhich incurs a large computation burden. 
This work leapfrogs over the current\nlimits of data valuation methods by introducing a new framework that can value\ntraining data in a way that is oblivious to the downstream learning algorithm.\nOur main results are as follows. (1) We develop a proxy for the validation\nperformance associated with a training set based on a non-conventional\nclass-wise Wasserstein distance between training and validation sets. We show\nthat the distance characterizes the upper bound of the validation performance\nfor any given model under certain Lipschitz conditions. (2) We develop a novel\nmethod to value individual data based on the sensitivity analysis of the\nclass-wise Wasserstein distance. Importantly, these values can be directly\nobtained for free from the output of off-the-shelf optimization solvers when\ncomputing the distance. (3) We evaluate our new data valuation framework over\nvarious use cases related to detecting low-quality data and show that,\nsurprisingly, the learning-agnostic feature of our framework enables a\nsignificant improvement over SOTA performance while being orders of magnitude\nfaster.\n","authors":["Hoang Anh Just","Feiyang Kang","Jiachen T. Wang","Yi Zeng","Myeongseob Ko","Ming Jin","Ruoxi Jia"],"pdf_url":"https://arxiv.org/pdf/2305.00054v2.pdf","comment":"ICLR 2023 Spotlight"},{"id":"http://arxiv.org/abs/2312.04504v1","updated":"2023-12-07T18:24:19Z","published":"2023-12-07T18:24:19Z","title":"Coordination-free Decentralised Federated Learning on Complex Networks:\n Overcoming Heterogeneity","summary":" Federated Learning (FL) is a well-known framework for successfully performing\na learning task in an edge computing scenario where the devices involved have\nlimited resources and incomplete data representation. The basic assumption of\nFL is that the devices communicate directly or indirectly with a parameter\nserver that centrally coordinates the whole process, overcoming several\nchallenges associated with it. However, in highly pervasive edge scenarios, the\npresence of a central controller that oversees the process cannot always be\nguaranteed, and the interactions (i.e., the connectivity graph) between devices\nmight not be predetermined, resulting in a complex network structure. Moreover,\nthe heterogeneity of data and devices further complicates the learning process.\nThis poses new challenges from a learning standpoint that we address by\nproposing a communication-efficient Decentralised Federated Learning (DFL)\nalgorithm able to cope with them. Our solution allows devices communicating\nonly with their direct neighbours to train an accurate model, overcoming the\nheterogeneity induced by data and different training histories. Our results\nshow that the resulting local models generalise better than those trained with\ncompeting approaches, and do so in a more communication-efficient way.\n","authors":["Lorenzo Valerio","Chiara Boldrini","Andrea Passarella","János Kertész","Márton Karsai","Gerardo Iñiguez"],"pdf_url":"https://arxiv.org/pdf/2312.04504v1.pdf","comment":"Supported by the H2020 HumaneAI Net (#952026), H2020\n INFRAIA-01-2018-2019 SoBigData++ (#871042), and by the CHIST-ERA-19-XAI010\n SAI projects, FWF (grant No. I 5205). 
Also funded by PNRR MUR Partenariato\n Esteso PE00000013 FAIR, PNRR MUR Partenariato Esteso PE00000001 - \"RESTART\""},{"id":"http://arxiv.org/abs/2312.04501v1","updated":"2023-12-07T18:21:52Z","published":"2023-12-07T18:21:52Z","title":"Graph Metanetworks for Processing Diverse Neural Architectures","summary":" Neural networks efficiently encode learned information within their\nparameters. Consequently, many tasks can be unified by treating neural networks\nthemselves as input data. When doing so, recent studies demonstrated the\nimportance of accounting for the symmetries and geometry of parameter spaces.\nHowever, those works developed architectures tailored to specific networks such\nas MLPs and CNNs without normalization layers, and generalizing such\narchitectures to other types of networks can be challenging. In this work, we\novercome these challenges by building new metanetworks - neural networks that\ntake weights from other neural networks as input. Put simply, we carefully\nbuild graphs representing the input neural networks and process the graphs\nusing graph neural networks. Our approach, Graph Metanetworks (GMNs),\ngeneralizes to neural architectures where competing methods struggle, such as\nmulti-head attention layers, normalization layers, convolutional layers, ResNet\nblocks, and group-equivariant linear layers. We prove that GMNs are expressive\nand equivariant to parameter permutation symmetries that leave the input neural\nnetwork functions unchanged. We validate the effectiveness of our method on\nseveral metanetwork tasks over diverse neural network architectures.\n","authors":["Derek Lim","Haggai Maron","Marc T. Law","Jonathan Lorraine","James Lucas"],"pdf_url":"https://arxiv.org/pdf/2312.04501v1.pdf","comment":"29 pages"},{"id":"http://arxiv.org/abs/2301.09820v2","updated":"2023-12-07T18:08:34Z","published":"2023-01-24T05:11:17Z","title":"A Stability Analysis of Fine-Tuning a Pre-Trained Model","summary":" Fine-tuning a pre-trained model (such as BERT, ALBERT, RoBERTa, T5, GPT,\netc.) has proven to be one of the most promising paradigms in recent NLP\nresearch. However, numerous recent works indicate that fine-tuning suffers from\nthe instability problem, i.e., tuning the same model under the same setting\nresults in significantly different performance. Many recent works have proposed\ndifferent methods to solve this problem, but there is no theoretical\nunderstanding of why and how these methods work. In this paper, we propose a\nnovel theoretical stability analysis of fine-tuning that focuses on two\ncommonly used settings, namely, full fine-tuning and head tuning. We define the\nstability under each setting and prove the corresponding stability bounds. The\ntheoretical bounds explain why and how several existing methods can stabilize\nthe fine-tuning procedure. In addition to being able to explain most of the\nobserved empirical discoveries, our proposed theoretical analysis framework can\nalso help in the design of effective and provable methods. Based on our theory,\nwe propose three novel strategies to stabilize the fine-tuning procedure,\nnamely, Maximal Margin Regularizer (MMR), Multi-Head Loss (MHLoss), and Self\nUnsupervised Re-Training (SURT). We extensively evaluate our proposed\napproaches on 11 widely used real-world benchmark datasets, as well as hundreds\nof synthetic classification datasets. 
The experiment results show that our\nproposed methods significantly stabilize the fine-tuning procedure and also\ncorroborate our theoretical analysis.\n","authors":["Zihao Fu","Anthony Man-Cho So","Nigel Collier"],"pdf_url":"https://arxiv.org/pdf/2301.09820v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2104.12928v4","updated":"2023-12-07T17:58:04Z","published":"2021-04-27T01:02:15Z","title":"If your data distribution shifts, use self-learning","summary":" We demonstrate that self-learning techniques like entropy minimization and\npseudo-labeling are simple and effective at improving performance of a deployed\ncomputer vision model under systematic domain shifts. We conduct a wide range\nof large-scale experiments and show consistent improvements irrespective of the\nmodel architecture, the pre-training technique or the type of distribution\nshift. At the same time, self-learning is simple to use in practice because it\ndoes not require knowledge or access to the original training data or scheme,\nis robust to hyperparameter choices, is straight-forward to implement and\nrequires only a few adaptation epochs. This makes self-learning techniques\nhighly attractive for any practitioner who applies machine learning algorithms\nin the real world. We present state-of-the-art adaptation results on CIFAR10-C\n(8.5% error), ImageNet-C (22.0% mCE), ImageNet-R (17.4% error) and ImageNet-A\n(14.8% error), theoretically study the dynamics of self-supervised adaptation\nmethods and propose a new classification dataset (ImageNet-D) which is\nchallenging even with adaptation.\n","authors":["Evgenia Rusak","Steffen Schneider","George Pachitariu","Luisa Eck","Peter Gehler","Oliver Bringmann","Wieland Brendel","Matthias Bethge"],"pdf_url":"https://arxiv.org/pdf/2104.12928v4.pdf","comment":"Web: https://domainadaptation.org/selflearning"},{"id":"http://arxiv.org/abs/2312.01227v2","updated":"2023-12-07T17:55:02Z","published":"2023-12-02T21:10:06Z","title":"Distributed Bayesian Estimation in Sensor Networks: Consensus on\n Marginal Densities","summary":" In this paper, we aim to design and analyze distributed Bayesian estimation\nalgorithms for sensor networks. The challenges we address are to (i) derive a\ndistributed provably-correct algorithm in the functional space of probability\ndistributions over continuous variables, and (ii) leverage these results to\nobtain new distributed estimators restricted to subsets of variables observed\nby individual agents. This relates to applications such as cooperative\nlocalization and federated learning, where the data collected at any agent\ndepends on a subset of all variables of interest. We present Bayesian density\nestimation algorithms using data from non-linear likelihoods at agents in\ncentralized, distributed, and marginal distributed settings. After setting up a\ndistributed estimation objective, we prove almost-sure convergence to the\noptimal set of pdfs at each agent. 
Then, we prove the same for a storage-aware\nalgorithm estimating densities only over relevant variables at each agent.\nFinally, we present a Gaussian version of these algorithms and implement it in\na mapping problem using variational inference to handle non-linear likelihood\nmodels associated with LiDAR sensing.\n","authors":["Parth Paritosh","Nikolay Atanasov","Sonia Martinez"],"pdf_url":"https://arxiv.org/pdf/2312.01227v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04474v1","updated":"2023-12-07T17:51:43Z","published":"2023-12-07T17:51:43Z","title":"Chain of Code: Reasoning with a Language Model-Augmented Code Emulator","summary":" Code provides a general syntactic structure to build complex programs and\nperform precise computations when paired with a code interpreter -- we\nhypothesize that language models (LMs) can leverage code-writing to improve\nChain of Thought reasoning not only for logic and arithmetic tasks, but also\nfor linguistic ones (and in particular, those that are a mix of both). For\nexample, consider prompting an LM to write code that counts the number of times\nit detects sarcasm in an essay: the LM may struggle to write an implementation\nfor \"detect_sarcasm(string)\" that can be executed by the interpreter (handling\nthe edge cases would be insurmountable). However, LMs may still produce a valid\nsolution if they are used not only to write the code, but also to selectively\n\"emulate\" the interpreter by generating the expected output of\n\"detect_sarcasm(string)\" and other lines of code (e.g., that the interpreter\ncould not compile). In this work, we propose Chain of Code (CoT), a simple yet\nsurprisingly effective extension that improves LM code-driven reasoning. The\nkey idea is to encourage LMs to format linguistic sub-tasks in a program as\nflexible pseudocode that the compiler can explicitly catch undefined behaviors\nand hand off to simulate with an LM (as an \"LMulator\"). Experiments demonstrate\nthat Chain of Code outperforms Chain of Thought and other baselines across a\nvariety of benchmarks; on BIG-Bench Hard, Chain of Code achieves 84%, a gain of\n12% over Chain of Thought. CoT scales well with large and small models alike,\nand broadens the scope of reasoning questions that LMs can correctly answer by\n\"thinking in code\". Project webpage: https://chain-of-code.github.io/.\n","authors":["Chengshu Li","Jacky Liang","Andy Zeng","Xinyun Chen","Karol Hausman","Dorsa Sadigh","Sergey Levine","Li Fei-Fei","Fei Xia","Brian Ichter"],"pdf_url":"https://arxiv.org/pdf/2312.04474v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04469v1","updated":"2023-12-07T17:41:44Z","published":"2023-12-07T17:41:44Z","title":"On the Learnability of Watermarks for Language Models","summary":" Watermarking of language model outputs enables statistical detection of\nmodel-generated text, which has many applications in the responsible deployment\nof language models. Existing watermarking strategies operate by altering the\ndecoder of an existing language model, and the ability for a language model to\ndirectly learn to generate the watermark would have significant implications\nfor the real-world deployment of watermarks. First, learned watermarks could be\nused to build open models that naturally generate watermarked text, allowing\nfor open models to benefit from watermarking. 
Second, if watermarking is used\nto determine the provenance of generated text, an adversary can hurt the\nreputation of a victim model by spoofing its watermark and generating damaging\nwatermarked text. To investigate the learnability of watermarks, we propose\nwatermark distillation, which trains a student model to behave like a teacher\nmodel that uses decoding-based watermarking. We test our approach on three\ndistinct decoding-based watermarking strategies and various hyperparameter\nsettings, finding that models can learn to generate watermarked text with high\ndetectability. We also find limitations to learnability, including the loss of\nwatermarking capabilities under fine-tuning on normal text and high sample\ncomplexity when learning low-distortion watermarks.\n","authors":["Chenchen Gu","Xiang Lisa Li","Percy Liang","Tatsunori Hashimoto"],"pdf_url":"https://arxiv.org/pdf/2312.04469v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04464v1","updated":"2023-12-07T17:35:34Z","published":"2023-12-07T17:35:34Z","title":"Horizon-Free and Instance-Dependent Regret Bounds for Reinforcement\n Learning with General Function Approximation","summary":" To tackle long planning horizon problems in reinforcement learning with\ngeneral function approximation, we propose the first algorithm, termed as\nUCRL-WVTR, that achieves both \\emph{horizon-free} and\n\\emph{instance-dependent}, since it eliminates the polynomial dependency on the\nplanning horizon. The derived regret bound is deemed \\emph{sharp}, as it\nmatches the minimax lower bound when specialized to linear mixture MDPs up to\nlogarithmic factors. Furthermore, UCRL-WVTR is \\emph{computationally efficient}\nwith access to a regression oracle. The achievement of such a horizon-free,\ninstance-dependent, and sharp regret bound hinges upon (i) novel algorithm\ndesigns: weighted value-targeted regression and a high-order moment estimator\nin the context of general function approximation; and (ii) fine-grained\nanalyses: a novel concentration bound of weighted non-linear least squares and\na refined analysis which leads to the tight instance-dependent bound. We also\nconduct comprehensive experiments to corroborate our theoretical findings.\n","authors":["Jiayi Huang","Han Zhong","Liwei Wang","Lin F. Yang"],"pdf_url":"https://arxiv.org/pdf/2312.04464v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04461v1","updated":"2023-12-07T17:32:29Z","published":"2023-12-07T17:32:29Z","title":"PhotoMaker: Customizing Realistic Human Photos via Stacked ID Embedding","summary":" Recent advances in text-to-image generation have made remarkable progress in\nsynthesizing realistic human photos conditioned on given text prompts. However,\nexisting personalized generation methods cannot simultaneously satisfy the\nrequirements of high efficiency, promising identity (ID) fidelity, and flexible\ntext controllability. In this work, we introduce PhotoMaker, an efficient\npersonalized text-to-image generation method, which mainly encodes an arbitrary\nnumber of input ID images into a stack ID embedding for preserving ID\ninformation. Such an embedding, serving as a unified ID representation, can not\nonly encapsulate the characteristics of the same input ID comprehensively, but\nalso accommodate the characteristics of different IDs for subsequent\nintegration. This paves the way for more intriguing and practically valuable\napplications. 
Besides, to drive the training of our PhotoMaker, we propose an\nID-oriented data construction pipeline to assemble the training data. Under the\nnourishment of the dataset constructed through the proposed pipeline, our\nPhotoMaker demonstrates better ID preservation ability than test-time\nfine-tuning based methods, yet provides significant speed improvements,\nhigh-quality generation results, strong generalization capabilities, and a wide\nrange of applications. Our project page is available at\nhttps://photo-maker.github.io/\n","authors":["Zhen Li","Mingdeng Cao","Xintao Wang","Zhongang Qi","Ming-Ming Cheng","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2312.04461v1.pdf","comment":"Tech report; Project page: https://photo-maker.github.io/"},{"id":"http://arxiv.org/abs/2312.04455v1","updated":"2023-12-07T17:24:51Z","published":"2023-12-07T17:24:51Z","title":"Fortify the Shortest Stave in Attention: Enhancing Context Awareness of\n Large Language Models for Effective Tool Use","summary":" Recent advancements in large language models (LLMs) have significantly\nexpanded their functionality and skills as tool agents. In this paper, we argue\nthat a waveform pattern in the model's attention allocation has an impact on\nthe tool use performance, which degrades when the position of essential\ninformation hits the trough zone. To address this issue, we propose a novel\ninference method named Attention Buckets. This approach enables LLMs to handle\ncontext by conducting parallel processes, each featuring a unique RoPE angle\nbase that shapes the attention waveform. Attention Buckets ensures that an\nattention trough of a particular process can be compensated with an attention\npeak of another run, reducing the risk of the LLM missing essential information\nresiding within the attention trough. Our extensive experiments on the widely\nrecognized tool use benchmark demonstrate the efficacy of our approach, where a\n7B-parameter open-source model enhanced by Attention Buckets achieves SOTA\nperformance on par with GPT-4.\n","authors":["Yuhan Chen","Ang Lv","Ting-En Lin","Changyu Chen","Yuchuan Wu","Fei Huang","Yongbin Li","Rui Yan"],"pdf_url":"https://arxiv.org/pdf/2312.04455v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04447v1","updated":"2023-12-07T17:16:30Z","published":"2023-12-07T17:16:30Z","title":"Privacy-preserving quantum federated learning via gradient hiding","summary":" Distributed quantum computing, particularly distributed quantum machine\nlearning, has gained substantial prominence for its capacity to harness the\ncollective power of distributed quantum resources, transcending the limitations\nof individual quantum nodes. Meanwhile, the critical concern of privacy within\ndistributed computing protocols remains a significant challenge, particularly\nin standard classical federated learning (FL) scenarios where data of\nparticipating clients is susceptible to leakage via gradient inversion attacks\nby the server. This paper presents innovative quantum protocols with quantum\ncommunication designed to address the FL problem, strengthen privacy measures,\nand optimize communication efficiency. In contrast to previous works that\nleverage expressive variational quantum circuits or differential privacy\ntechniques, we consider gradient information concealment using quantum states\nand propose two distinct FL protocols, one based on private inner-product\nestimation and the other on incremental learning. 
These protocols offer\nsubstantial advancements in privacy preservation with low communication\nresources, forging a path toward efficient quantum communication-assisted FL\nprotocols and contributing to the development of secure distributed quantum\nmachine learning, thus addressing critical privacy concerns in the quantum\ncomputing era.\n","authors":["Changhao Li","Niraj Kumar","Zhixin Song","Shouvanik Chakrabarti","Marco Pistoia"],"pdf_url":"https://arxiv.org/pdf/2312.04447v1.pdf","comment":"12 pages, 2 figures, 1 table"},{"id":"http://arxiv.org/abs/2311.13060v2","updated":"2023-12-07T17:11:58Z","published":"2023-11-21T23:49:51Z","title":"Training Deep 3D Convolutional Neural Networks to Extract BSM Physics\n Parameters Directly from HEP Data: a Proof-of-Concept Study Using Monte Carlo\n Simulations","summary":" We report on a novel application of computer vision techniques to extract\nbeyond the Standard Model (BSM) parameters directly from high energy physics\n(HEP) flavor data. We develop a method of transforming angular and kinematic\ndistributions into \"quasi-images\" that can be used to train a convolutional\nneural network to perform regression tasks, similar to fitting. This contrasts\nwith the usual classification functions performed using ML/AI in HEP. As a\nproof-of-concept, we train a 34-layer Residual Neural Network to regress on\nthese images and determine the Wilson Coefficient $C_{9}$ in MC (Monte Carlo)\nsimulations of $B \\rightarrow K^{*}\\mu^{+}\\mu^{-}$ decays. The technique\ndescribed here can be generalized and may find applicability across various HEP\nexperiments and elsewhere.\n","authors":["S. Dubey","T. E. Browder","S. Kohani","R. Mandal","A. Sibidanov","R. Sinha"],"pdf_url":"https://arxiv.org/pdf/2311.13060v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04432v1","updated":"2023-12-07T16:56:24Z","published":"2023-12-07T16:56:24Z","title":"FreqFed: A Frequency Analysis-Based Approach for Mitigating Poisoning\n Attacks in Federated Learning","summary":" Federated learning (FL) is a collaborative learning paradigm allowing\nmultiple clients to jointly train a model without sharing their training data.\nHowever, FL is susceptible to poisoning attacks, in which the adversary injects\nmanipulated model updates into the federated model aggregation process to\ncorrupt or destroy predictions (untargeted poisoning) or implant hidden\nfunctionalities (targeted poisoning or backdoors). Existing defenses against\npoisoning attacks in FL have several limitations, such as relying on specific\nassumptions about attack types and strategies or data distributions or not\nsufficiently robust against advanced injection techniques and strategies and\nsimultaneously maintaining the utility of the aggregated model. To address the\ndeficiencies of existing defenses, we take a generic and completely different\napproach to detect poisoning (targeted and untargeted) attacks. We present\nFreqFed, a novel aggregation mechanism that transforms the model updates (i.e.,\nweights) into the frequency domain, where we can identify the core frequency\ncomponents that inherit sufficient information about weights. This allows us to\neffectively filter out malicious updates during local training on the clients,\nregardless of attack types, strategies, and clients' data distributions. We\nextensively evaluate the efficiency and effectiveness of FreqFed in different\napplication domains, including image classification, word prediction, IoT\nintrusion detection, and speech recognition. 
We demonstrate that FreqFed can\nmitigate poisoning attacks effectively with a negligible impact on the utility\nof the aggregated model.\n","authors":["Hossein Fereidooni","Alessandro Pegoraro","Phillip Rieger","Alexandra Dmitrienko","Ahmad-Reza Sadeghi"],"pdf_url":"https://arxiv.org/pdf/2312.04432v1.pdf","comment":"To appear in the Network and Distributed System Security (NDSS)\n Symposium 2024. 16 pages, 8 figures, 12 tables, 1 algorithm, 3 equations"},{"id":"http://arxiv.org/abs/2301.00752v4","updated":"2023-12-07T16:42:32Z","published":"2023-01-02T16:51:40Z","title":"Point Cloud-based Proactive Link Quality Prediction for Millimeter-wave\n Communications","summary":" This study demonstrates the feasibility of point cloud-based proactive link\nquality prediction for millimeter-wave (mmWave) communications. Previous\nstudies have proposed machine learning-based methods to predict received signal\nstrength for future time periods using time series of depth images to mitigate\nthe line-of-sight (LOS) path blockage by pedestrians in mmWave communication.\nHowever, these image-based methods have limited applicability due to privacy\nconcerns as camera images may contain sensitive information. This study\nproposes a point cloud-based method for mmWave link quality prediction and\ndemonstrates its feasibility through experiments. Point clouds represent\nthree-dimensional (3D) spaces as a set of points and are sparser and less\nlikely to contain sensitive information than camera images. Additionally, point\nclouds provide 3D position and motion information, which is necessary for\nunderstanding the radio propagation environment involving pedestrians. This\nstudy designs the mmWave link quality prediction method and conducts realistic\nindoor experiments, where the link quality fluctuates significantly due to\nhuman blockage, using commercially available IEEE 802.11ad-based 60 GHz\nwireless LAN devices and Kinect v2 RGB-D camera and Velodyne VLP-16 light\ndetection and ranging (LiDAR) for point cloud acquisition. The experimental\nresults showed that our proposed method can predict future large attenuation of\nmmWave received signal strength and throughput induced by the LOS path blockage\nby pedestrians with comparable or superior accuracy to image-based prediction\nmethods. Hence, our point cloud-based method can serve as a viable alternative\nto image-based methods.\n","authors":["Shoki Ohta","Takayuki Nishio","Riichi Kudo","Kahoko Takahashi","Hisashi Nagata"],"pdf_url":"https://arxiv.org/pdf/2301.00752v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04416v1","updated":"2023-12-07T16:38:20Z","published":"2023-12-07T16:38:20Z","title":"Monitoring Sustainable Global Development Along Shared Socioeconomic\n Pathways","summary":" Sustainable global development is one of the most prevalent challenges facing\nthe world today, hinging on the equilibrium between socioeconomic growth and\nenvironmental sustainability. We propose approaches to monitor and quantify\nsustainable development along the Shared Socioeconomic Pathways (SSPs),\nincluding mathematically derived scoring algorithms, and machine learning\nmethods. These integrate socioeconomic and environmental datasets, to produce\nan interpretable metric for SSP alignment. An initial study demonstrates\npromising results, laying the groundwork for the application of different\nmethods to the monitoring of sustainable global development.\n","authors":["Michelle W. L. Wan","Jeffrey N. Clark","Edward A. 
Small","Elena Fillola Mayoral","Raúl Santos-Rodríguez"],"pdf_url":"https://arxiv.org/pdf/2312.04416v1.pdf","comment":"5 pages, 1 figure. Presented at NeurIPS 2023 Workshop: Tackling\n Climate Change with Machine Learning"},{"id":"http://arxiv.org/abs/2306.03088v2","updated":"2023-12-07T16:32:57Z","published":"2023-06-05T17:58:49Z","title":"DeepGraphDMD: Interpretable Spatio-Temporal Decomposition of Non-linear\n Functional Brain Network Dynamics","summary":" Functional brain dynamics is supported by parallel and overlapping functional\nnetwork modes that are associated with specific neural circuits. Decomposing\nthese network modes from fMRI data and finding their temporal characteristics\nis challenging due to their time-varying nature and the non-linearity of the\nfunctional dynamics. Dynamic Mode Decomposition (DMD) algorithms have been\nquite popular for solving this decomposition problem in recent years. In this\nwork, we apply GraphDMD -- an extension of the DMD for network data -- to\nextract the dynamic network modes and their temporal characteristics from the\nfMRI time series in an interpretable manner. GraphDMD, however, regards the\nunderlying system as a linear dynamical system that is sub-optimal for\nextracting the network modes from non-linear functional data. In this work, we\ndevelop a generalized version of the GraphDMD algorithm -- DeepGraphDMD --\napplicable to arbitrary non-linear graph dynamical systems. DeepGraphDMD is an\nautoencoder-based deep learning model that learns Koopman eigenfunctions for\ngraph data and embeds the non-linear graph dynamics into a latent linear space.\nWe show the effectiveness of our method in both simulated data and the HCP\nresting-state fMRI data. In the HCP data, DeepGraphDMD provides novel insights\ninto cognitive brain functions by discovering two major network modes related\nto fluid and crystallized intelligence.\n","authors":["Md Asadullah Turja","Martin Styner","Guorong Wu"],"pdf_url":"https://arxiv.org/pdf/2306.03088v2.pdf","comment":"Published in MICCAI 2023"},{"id":"http://arxiv.org/abs/2312.04404v1","updated":"2023-12-07T16:17:34Z","published":"2023-12-07T16:17:34Z","title":"On the Impact of Multi-dimensional Local Differential Privacy on\n Fairness","summary":" Automated decision systems are increasingly used to make consequential\ndecisions in people's lives. Due to the sensitivity of the manipulated data as\nwell as the resulting decisions, several ethical concerns need to be addressed\nfor the appropriate use of such technologies, in particular, fairness and\nprivacy. Unlike previous work, which focused on centralized differential\nprivacy (DP) or local DP (LDP) for a single sensitive attribute, in this paper,\nwe examine the impact of LDP in the presence of several sensitive attributes\n(i.e., multi-dimensional data) on fairness. Detailed empirical analysis on\nsynthetic and benchmark datasets revealed very relevant observations. In\nparticular, (1) multi-dimensional LDP is an efficient approach to reduce\ndisparity, (2) the multi-dimensional approach of LDP (independent vs. combined)\nmatters only at low privacy guarantees, and (3) the outcome Y distribution has\nan important effect on which group is more sensitive to the obfuscation. Last,\nwe summarize our findings in the form of recommendations to guide practitioners\nin adopting effective privacy-preserving practices while maintaining fairness\nand utility in ML applications.\n","authors":["karima Makhlouf","Heber H. 
Arcolezi","Sami Zhioua","Ghassen Ben Brahim","Catuscia Palamidessi"],"pdf_url":"https://arxiv.org/pdf/2312.04404v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04402v1","updated":"2023-12-07T16:16:47Z","published":"2023-12-07T16:16:47Z","title":"Semi-Supervised Active Learning for Semantic Segmentation in Unknown\n Environments Using Informative Path Planning","summary":" Semantic segmentation enables robots to perceive and reason about their\nenvironments beyond geometry. Most of such systems build upon deep learning\napproaches. As autonomous robots are commonly deployed in initially unknown\nenvironments, pre-training on static datasets cannot always capture the variety\nof domains and limits the robot's perception performance during missions.\nRecently, self-supervised and fully supervised active learning methods emerged\nto improve a robot's vision. These approaches rely on large in-domain\npre-training datasets or require substantial human labelling effort. We propose\na planning method for semi-supervised active learning of semantic segmentation\nthat substantially reduces human labelling requirements compared to fully\nsupervised approaches. We leverage an adaptive map-based planner guided towards\nthe frontiers of unexplored space with high model uncertainty collecting\ntraining data for human labelling. A key aspect of our approach is to combine\nthe sparse high-quality human labels with pseudo labels automatically extracted\nfrom highly certain environment map areas. Experimental results show that our\nmethod reaches segmentation performance close to fully supervised approaches\nwith drastically reduced human labelling effort while outperforming\nself-supervised approaches.\n","authors":["Julius Rückin","Federico Magistri","Cyrill Stachniss","Marija Popović"],"pdf_url":"https://arxiv.org/pdf/2312.04402v1.pdf","comment":"8 pages, 9 figures"},{"id":"http://arxiv.org/abs/2210.12583v3","updated":"2023-12-07T16:14:29Z","published":"2022-10-23T00:45:05Z","title":"Active Learning of Discrete-Time Dynamics for Uncertainty-Aware Model\n Predictive Control","summary":" Model-based control requires an accurate model of the system dynamics for\nprecisely and safely controlling the robot in complex and dynamic environments.\nMoreover, in the presence of variations in the operating conditions, the model\nshould be continuously refined to compensate for dynamics changes. In this\npaper, we present a self-supervised learning approach that actively models the\ndynamics of nonlinear robotic systems. We combine offline learning from past\nexperience and online learning from current robot interaction with the unknown\nenvironment. These two ingredients enable a highly sample-efficient and\nadaptive learning process, capable of accurately inferring model dynamics in\nreal-time even in operating regimes that greatly differ from the training\ndistribution. Moreover, we design an uncertainty-aware model predictive\ncontroller that is heuristically conditioned to the aleatoric (data)\nuncertainty of the learned dynamics. This controller actively chooses the\noptimal control actions that (i) optimize the control performance and (ii)\nimprove the efficiency of online learning sample collection. We demonstrate the\neffectiveness of our method through a series of challenging real-world\nexperiments using a quadrotor system. 
Our approach showcases high resilience\nand generalization capabilities by consistently adapting to unseen flight\nconditions, while it significantly outperforms classical and adaptive control\nbaselines.\n","authors":["Alessandro Saviolo","Jonathan Frey","Abhishek Rathod","Moritz Diehl","Giuseppe Loianno"],"pdf_url":"https://arxiv.org/pdf/2210.12583v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2301.05601v2","updated":"2023-12-07T16:13:24Z","published":"2023-01-13T15:06:47Z","title":"Sem@$K$: Is my knowledge graph embedding model semantic-aware?","summary":" Using knowledge graph embedding models (KGEMs) is a popular approach for\npredicting links in knowledge graphs (KGs). Traditionally, the performance of\nKGEMs for link prediction is assessed using rank-based metrics, which evaluate\ntheir ability to give high scores to ground-truth entities. However, the\nliterature claims that the KGEM evaluation procedure would benefit from adding\nsupplementary dimensions to assess. That is why, in this paper, we extend our\npreviously introduced metric Sem@K that measures the capability of models to\npredict valid entities w.r.t. domain and range constraints. In particular, we\nconsider a broad range of KGs and take their respective characteristics into\naccount to propose different versions of Sem@K. We also perform an extensive\nstudy to qualify the abilities of KGEMs as measured by our metric. Our\nexperiments show that Sem@K provides a new perspective on KGEM quality. Its\njoint analysis with rank-based metrics offers different conclusions on the\npredictive power of models. Regarding Sem@K, some KGEMs are inherently better\nthan others, but this semantic superiority is not indicative of their\nperformance w.r.t. rank-based metrics. In this work, we generalize conclusions\nabout the relative performance of KGEMs w.r.t. rank-based and semantic-oriented\nmetrics at the level of families of models. The joint analysis of the\naforementioned metrics gives more insight into the peculiarities of each model.\nThis work paves the way for a more comprehensive evaluation of KGEM adequacy\nfor specific downstream tasks.\n","authors":["Nicolas Hubert","Pierre Monnin","Armelle Brun","Davy Monticolo"],"pdf_url":"https://arxiv.org/pdf/2301.05601v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04398v1","updated":"2023-12-07T16:10:10Z","published":"2023-12-07T16:10:10Z","title":"Intelligent Anomaly Detection for Lane Rendering Using Transformer with\n Self-Supervised Pre-Training and Customized Fine-Tuning","summary":" The burgeoning navigation services using digital maps provide great\nconvenience to drivers. Nevertheless, the presence of anomalies in lane\nrendering map images occasionally introduces potential hazards, as such\nanomalies can be misleading to human drivers and consequently contribute to\nunsafe driving conditions. In response to this concern and to accurately and\neffectively detect the anomalies, this paper transforms lane rendering image\nanomaly detection into a classification problem and proposes a four-phase\npipeline consisting of data pre-processing, self-supervised pre-training with\nthe masked image modeling (MiM) method, customized fine-tuning using\ncross-entropy based loss with label smoothing, and post-processing to tackle it\nleveraging state-of-the-art deep learning techniques, especially those\ninvolving Transformer models. Various experiments verify the effectiveness of\nthe proposed pipeline. 
Results indicate that the proposed pipeline exhibits\nsuperior performance in lane rendering image anomaly detection, and notably,\nthe self-supervised pre-training with MiM can greatly enhance the detection\naccuracy while significantly reducing the total training time. For instance,\nemploying the Swin Transformer with Uniform Masking as self-supervised\npretraining (Swin-Trans-UM) yielded a heightened accuracy at 94.77% and an\nimproved Area Under The Curve (AUC) score of 0.9743 compared with the pure Swin\nTransformer without pre-training (Swin-Trans) with an accuracy of 94.01% and an\nAUC of 0.9498. The fine-tuning epochs were dramatically reduced to 41 from the\noriginal 280. In conclusion, the proposed pipeline, with its incorporation of\nself-supervised pre-training using MiM and other advanced deep learning\ntechniques, emerges as a robust solution for enhancing the accuracy and\nefficiency of lane rendering image anomaly detection in digital navigation\nsystems.\n","authors":["Yongqi Dong","Xingmin Lu","Ruohan Li","Wei Song","Bart van Arem","Haneen Farah"],"pdf_url":"https://arxiv.org/pdf/2312.04398v1.pdf","comment":"20 pages, 6 figures, accepted by the 103rd Transportation Research\n Board (TRB) Annual Meeting, under review by Transportation Research Record:\n Journal of the Transportation Research Board"},{"id":"http://arxiv.org/abs/2306.00857v2","updated":"2023-12-07T16:00:25Z","published":"2023-06-01T16:14:29Z","title":"Loss-Optimal Classification Trees: A Generalized Framework and the\n Logistic Case","summary":" The Classification Tree (CT) is one of the most common models in\ninterpretable machine learning. Although such models are usually built with\ngreedy strategies, in recent years, thanks to remarkable advances in\nMixed-Integer Programming (MIP) solvers, several exact formulations of the\nlearning problem have been developed. In this paper, we argue that some of the\nmost relevant ones among these training models can be encapsulated within a\ngeneral framework, whose instances are shaped by the specification of loss\nfunctions and regularizers. Next, we introduce a novel realization of this\nframework: specifically, we consider the logistic loss, handled in the MIP\nsetting by a linear piece-wise approximation, and couple it with\n$\\ell_1$-regularization terms. The resulting Optimal Logistic Tree model\nnumerically proves to be able to induce trees with enhanced interpretability\nfeatures and competitive generalization capabilities, compared to the\nstate-of-the-art MIP-based approaches.\n","authors":["Tommaso Aldinucci","Matteo Lapucci"],"pdf_url":"https://arxiv.org/pdf/2306.00857v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04386v1","updated":"2023-12-07T15:55:58Z","published":"2023-12-07T15:55:58Z","title":"Model-Based Epistemic Variance of Values for Risk-Aware Policy\n Optimization","summary":" We consider the problem of quantifying uncertainty over expected cumulative\nrewards in model-based reinforcement learning. In particular, we focus on\ncharacterizing the variance over values induced by a distribution over MDPs.\nPrevious work upper bounds the posterior variance over values by solving a\nso-called uncertainty Bellman equation (UBE), but the over-approximation may\nresult in inefficient exploration. We propose a new UBE whose solution\nconverges to the true posterior variance over values and leads to lower regret\nin tabular exploration problems. 
We identify challenges to apply the UBE theory\nbeyond tabular problems and propose a suitable approximation. Based on this\napproximation, we introduce a general-purpose policy optimization algorithm,\nQ-Uncertainty Soft Actor-Critic (QU-SAC), that can be applied for either\nrisk-seeking or risk-averse policy optimization with minimal changes.\nExperiments in both online and offline RL demonstrate improved performance\ncompared to other uncertainty estimation methods.\n","authors":["Carlos E. Luis","Alessandro G. Bottero","Julia Vinogradska","Felix Berkenkamp","Jan Peters"],"pdf_url":"https://arxiv.org/pdf/2312.04386v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2302.12526"},{"id":"http://arxiv.org/abs/2312.04374v1","updated":"2023-12-07T15:44:56Z","published":"2023-12-07T15:44:56Z","title":"Deep Dynamics: Vehicle Dynamics Modeling with a Physics-Informed Neural\n Network for Autonomous Racing","summary":" Autonomous racing is a critical research area for autonomous driving,\npresenting significant challenges in vehicle dynamics modeling, such as\nbalancing model precision and computational efficiency at high speeds\n(>280kmph), where minor errors in modeling have severe consequences. Existing\nphysics-based models for vehicle dynamics require elaborate testing setups and\ntuning, which are hard to implement, time-intensive, and cost-prohibitive.\nConversely, purely data-driven approaches do not generalize well and cannot\nadequately ensure physical constraints on predictions. This paper introduces\nDeep Dynamics, a physics-informed neural network (PINN) for vehicle dynamics\nmodeling of an autonomous racecar. It combines physics coefficient estimation\nand dynamical equations to accurately predict vehicle states at high speeds and\nincludes a unique Physics Guard layer to ensure internal coefficient estimates\nremain within their nominal physical ranges. Open-loop and closed-loop\nperformance assessments, using a physics-based simulator and full-scale\nautonomous Indy racecar data, highlight Deep Dynamics as a promising approach\nfor modeling racecar vehicle dynamics.\n","authors":["John Chrosniak","Jingyun Ning","Madhur Behl"],"pdf_url":"https://arxiv.org/pdf/2312.04374v1.pdf","comment":"This work has been submitted to the IEEE RA-L for possible\n publication"},{"id":"http://arxiv.org/abs/2312.04371v1","updated":"2023-12-07T15:42:53Z","published":"2023-12-07T15:42:53Z","title":"A Scalable Network-Aware Multi-Agent Reinforcement Learning Framework\n for Decentralized Inverter-based Voltage Control","summary":" This paper addresses the challenges associated with decentralized voltage\ncontrol in power grids due to an increase in distributed generations (DGs).\nTraditional model-based voltage control methods struggle with the rapid energy\nfluctuations and uncertainties of these DGs. While multi-agent reinforcement\nlearning (MARL) has shown potential for decentralized secondary control,\nscalability issues arise when dealing with a large number of DGs. This problem\nlies in the dominant centralized training and decentralized execution (CTDE)\nframework, where the critics take global observations and actions. 
To overcome\nthese challenges, we propose a scalable network-aware (SNA) framework that\nleverages network structure to truncate the input to the critic's Q-function,\nthereby improving scalability and reducing communication costs during training.\nFurther, the SNA framework is theoretically grounded with provable\napproximation guarantee, and it can seamlessly integrate with multiple\nmulti-agent actor-critic algorithms. The proposed SNA framework is successfully\ndemonstrated in a system with 114 DGs, providing a promising solution for\ndecentralized voltage control in increasingly complex power grid systems.\n","authors":["Han Xu","Jialin Zheng","Guannan Qu"],"pdf_url":"https://arxiv.org/pdf/2312.04371v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04370v1","updated":"2023-12-07T15:40:55Z","published":"2023-12-07T15:40:55Z","title":"Investigating the Design Space of Diffusion Models for Speech\n Enhancement","summary":" Diffusion models are a new class of generative models that have shown\noutstanding performance in image generation literature. As a consequence,\nstudies have attempted to apply diffusion models to other tasks, such as speech\nenhancement. A popular approach in adapting diffusion models to speech\nenhancement consists in modelling a progressive transformation between the\nclean and noisy speech signals. However, one popular diffusion model framework\npreviously laid in image generation literature did not account for such a\ntransformation towards the system input, which prevents from relating the\nexisting diffusion-based speech enhancement systems with the aforementioned\ndiffusion model framework. To address this, we extend this framework to account\nfor the progressive transformation between the clean and noisy speech signals.\nThis allows us to apply recent developments from image generation literature,\nand to systematically investigate design aspects of diffusion models that\nremain largely unexplored for speech enhancement, such as the neural network\npreconditioning, the training loss weighting, the stochastic differential\nequation (SDE), or the amount of stochasticity injected in the reverse process.\nWe show that the performance of previous diffusion-based speech enhancement\nsystems cannot be attributed to the progressive transformation between the\nclean and noisy speech signals. Moreover, we show that a proper choice of\npreconditioning, training loss weighting, SDE and sampler allows to outperform\na popular diffusion-based speech enhancement system in terms of perceptual\nmetrics while using fewer sampling steps, thus reducing the computational cost\nby a factor of four.\n","authors":["Philippe Gonzalez","Zheng-Hua Tan","Jan Østergaard","Jesper Jensen","Tommy Sonne Alstrøm","Tobias May"],"pdf_url":"https://arxiv.org/pdf/2312.04370v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.12409v2","updated":"2023-12-07T15:25:37Z","published":"2023-07-23T19:23:06Z","title":"A Machine Learning Approach to Two-Stage Adaptive Robust Optimization","summary":" We propose an approach based on machine learning to solve two-stage linear\nadaptive robust optimization (ARO) problems with binary here-and-now variables\nand polyhedral uncertainty sets. We encode the optimal here-and-now decisions,\nthe worst-case scenarios associated with the optimal here-and-now decisions,\nand the optimal wait-and-see decisions into what we denote as the strategy. 
We\nsolve multiple similar ARO instances in advance using the column and constraint\ngeneration algorithm and extract the optimal strategies to generate a training\nset. We train a machine learning model that predicts high-quality strategies\nfor the here-and-now decisions, the worst-case scenarios associated with the\noptimal here-and-now decisions, and the wait-and-see decisions. We also\nintroduce an algorithm to reduce the number of different target classes the\nmachine learning algorithm needs to be trained on. We apply the proposed\napproach to the facility location, the multi-item inventory control and the\nunit commitment problems. Our approach solves ARO problems drastically faster\nthan the state-of-the-art algorithms with high accuracy.\n","authors":["Dimitris Bertsimas","Cheol Woo Kim"],"pdf_url":"https://arxiv.org/pdf/2307.12409v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04356v1","updated":"2023-12-07T15:23:07Z","published":"2023-12-07T15:23:07Z","title":"NeuJeans: Private Neural Network Inference with Joint Optimization of\n Convolution and Bootstrapping","summary":" Fully homomorphic encryption (FHE) is a promising cryptographic primitive for\nrealizing private neural network inference (PI) services by allowing a client\nto fully offload the inference task to a cloud server while keeping the client\ndata oblivious to the server. This work proposes NeuJeans, an FHE-based\nsolution for the PI of deep convolutional neural networks (CNNs). NeuJeans\ntackles the critical problem of the enormous computational cost for the FHE\nevaluation of convolutional layers (conv2d), mainly due to the high cost of\ndata reordering and bootstrapping. We first propose an encoding method\nintroducing nested structures inside encoded vectors for FHE, which enables us\nto develop efficient conv2d algorithms with reduced data reordering costs.\nHowever, the new encoding method also introduces additional computations for\nconversion between encoding methods, which could negate its advantages. We\ndiscover that fusing conv2d with bootstrapping eliminates such computations\nwhile reducing the cost of bootstrapping. Then, we devise optimized execution\nflows for various types of conv2d and apply them to end-to-end implementation\nof CNNs. NeuJeans accelerates the performance of conv2d by up to 5.68 times\ncompared to state-of-the-art FHE-based PI work and performs the PI of a CNN at\nthe scale of ImageNet (ResNet18) within a mere few seconds\n","authors":["Jae Hyung Ju","Jaiyoung Park","Jongmin Kim","Donghwan Kim","Jung Ho Ahn"],"pdf_url":"https://arxiv.org/pdf/2312.04356v1.pdf","comment":"16 pages, 9 figures"},{"id":"http://arxiv.org/abs/2311.00519v2","updated":"2023-12-07T15:20:18Z","published":"2023-11-01T13:44:45Z","title":"Retrieval-Based Reconstruction For Time-series Contrastive Learning","summary":" The success of self-supervised contrastive learning hinges on identifying\npositive data pairs that, when pushed together in embedding space, encode\nuseful information for subsequent downstream tasks. However, in time-series,\nthis is challenging because creating positive pairs via augmentations may break\nthe original semantic meaning. We hypothesize that if we can retrieve\ninformation from one subsequence to successfully reconstruct another\nsubsequence, then they should form a positive pair. Harnessing this intuition,\nwe introduce our novel approach: REtrieval-BAsed Reconstruction (REBAR)\ncontrastive learning. 
First, we utilize a convolutional cross-attention\narchitecture to calculate the REBAR error between two different time-series.\nThen, through validation experiments, we show that the REBAR error is a\npredictor of mutual class membership, justifying its usage as a\npositive/negative labeler. Finally, once integrated into a contrastive learning\nframework, our REBAR method can learn an embedding that achieves\nstate-of-the-art performance on downstream tasks across various modalities.\n","authors":["Maxwell A. Xu","Alexander Moreno","Hui Wei","Benjamin M. Marlin","James M. Rehg"],"pdf_url":"https://arxiv.org/pdf/2311.00519v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04350v1","updated":"2023-12-07T15:12:12Z","published":"2023-12-07T15:12:12Z","title":"CLadder: A Benchmark to Assess Causal Reasoning Capabilities of Language\n Models","summary":" The ability to perform causal reasoning is widely considered a core feature\nof intelligence. In this work, we investigate whether large language models\n(LLMs) can coherently reason about causality. Much of the existing work in\nnatural language processing (NLP) focuses on evaluating commonsense causal\nreasoning in LLMs, thus failing to assess whether a model can perform causal\ninference in accordance with a set of well-defined formal rules. To address\nthis, we propose a new NLP task, causal inference in natural language, inspired\nby the \"causal inference engine\" postulated by Judea Pearl et al. We compose a\nlarge dataset, CLadder, with 10K samples: based on a collection of causal\ngraphs and queries (associational, interventional, and counterfactual), we\nobtain symbolic questions and ground-truth answers, through an oracle causal\ninference engine. These are then translated into natural language. We evaluate\nmultiple LLMs on our dataset, and we introduce and evaluate a bespoke\nchain-of-thought prompting strategy, CausalCoT. We show that our task is highly\nchallenging for LLMs, and we conduct an in-depth analysis to gain deeper\ninsight into the causal reasoning abilities of LLMs. Our data is open-sourced\nat https://huggingface.co/datasets/causalNLP/cladder, and our code can be found\nat https://github.com/causalNLP/cladder.\n","authors":["Zhijing Jin","Yuen Chen","Felix Leeb","Luigi Gresele","Ojasv Kamal","Zhiheng Lyu","Kevin Blin","Fernando Gonzalez Adauto","Max Kleiman-Weiner","Mrinmaya Sachan","Bernhard Schölkopf"],"pdf_url":"https://arxiv.org/pdf/2312.04350v1.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.04346v1","updated":"2023-12-07T15:06:06Z","published":"2023-12-07T15:06:06Z","title":"Improved Efficient Two-Stage Denoising Diffusion Power System\n Measurement Recovery Against False Data Injection Attacks and Data Losses","summary":" Measurement uncertainties, represented by cyber-attacks and data losses,\nseriously degrade the quality of power system measurements. Fortunately, the\npowerful generation ability of the denoising diffusion models can enable more\nprecise measurement generation for power system data recovery. However, the\ncontrollable data generation and efficient computing methods of denoising\ndiffusion models for deterministic trajectory still need further investigation.\nTo this end, this paper proposes an improved two-stage denoising diffusion\nmodel (TSDM) to identify and reconstruct the measurements with various\nmeasurement uncertainties. 
The first stage of the model comprises a\nclassifier-guided conditional anomaly detection component, while the second\nstage involves diffusion-based measurement imputation component. Moreover, the\nproposed TSDM adopts precise means and optimal variances to accelerate the\ndiffusion generation process with subsequence sampling. Extensive numerical\ncase studies demonstrate that the proposed TSDM can accurately recover power\nsystem measurements despite strong randomness under renewable energy\nintegration and highly nonlinear dynamics under complex cyber-physical\ncontingencies. Additionally, the proposed TSDM has stronger robustness compared\nto existing reconstruction networks and exhibits lower computational complexity\nthan general denoising diffusion models.\n","authors":["Jianhua Pei","Jingyu Wang","Dongyuan Shi","Ping Wang"],"pdf_url":"https://arxiv.org/pdf/2312.04346v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04344v1","updated":"2023-12-07T15:05:59Z","published":"2023-12-07T15:05:59Z","title":"Enhancing Medical Task Performance in GPT-4V: A Comprehensive Study on\n Prompt Engineering Strategies","summary":" OpenAI's latest large vision-language model (LVLM), GPT-4V(ision), has piqued\nconsiderable interest for its potential in medical applications. Despite its\npromise, recent studies and internal reviews highlight its underperformance in\nspecialized medical tasks. This paper explores the boundary of GPT-4V's\ncapabilities in medicine, particularly in processing complex imaging data from\nendoscopies, CT scans, and MRIs etc. Leveraging open-source datasets, we\nassessed its foundational competencies, identifying substantial areas for\nenhancement. Our research emphasizes prompt engineering, an often-underutilized\nstrategy for improving AI responsiveness. Through iterative testing, we refined\nthe model's prompts, significantly improving its interpretative accuracy and\nrelevance in medical imaging. From our comprehensive evaluations, we distilled\n10 effective prompt engineering techniques, each fortifying GPT-4V's medical\nacumen. These methodical enhancements facilitate more reliable, precise, and\nclinically valuable insights from GPT-4V, advancing its operability in critical\nhealthcare environments. Our findings are pivotal for those employing AI in\nmedicine, providing clear, actionable guidance on harnessing GPT-4V's full\ndiagnostic potential.\n","authors":["Pengcheng Chen","Ziyan Huang","Zhongying Deng","Tianbin Li","Yanzhou Su","Haoyu Wang","Jin Ye","Yu Qiao","Junjun He"],"pdf_url":"https://arxiv.org/pdf/2312.04344v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04343v1","updated":"2023-12-07T15:05:26Z","published":"2023-12-07T15:05:26Z","title":"Causality and Explainability for Trustworthy Integrated Pest Management","summary":" Pesticides serve as a common tool in agricultural pest control but\nsignificantly contribute to the climate crisis. To combat this, Integrated Pest\nManagement (IPM) stands as a climate-smart alternative. Despite its potential,\nIPM faces low adoption rates due to farmers' skepticism about its\neffectiveness. To address this challenge, we introduce an advanced data\nanalysis framework tailored to enhance IPM adoption. 
Our framework provides i)\nrobust pest population predictions across diverse environments with invariant\nand causal learning, ii) interpretable pest presence predictions using\ntransparent models, iii) actionable advice through counterfactual explanations\nfor in-season IPM interventions, iv) field-specific treatment effect\nestimations, and v) assessments of the effectiveness of our advice using causal\ninference. By incorporating these features, our framework aims to alleviate\nskepticism and encourage wider adoption of IPM practices among farmers.\n","authors":["Ilias Tsoumas","Vasileios Sitokonstantinou","Georgios Giannarakis","Evagelia Lampiri","Christos Athanassiou","Gustau Camps-Valls","Charalampos Kontoes","Ioannis Athanasiadis"],"pdf_url":"https://arxiv.org/pdf/2312.04343v1.pdf","comment":"Accepted at NeurIPS 2023 Workshop on Tackling Climate Change with\n Machine Learning: Blending New and Existing Knowledge Systems"},{"id":"http://arxiv.org/abs/2312.04339v1","updated":"2023-12-07T14:59:15Z","published":"2023-12-07T14:59:15Z","title":"Merging by Matching Models in Task Subspaces","summary":" Model merging aims to cheaply combine individual task-specific models into a\nsingle multitask model. In this work, we view past merging methods as\nleveraging different notions of a ''task subspace'' in which models are matched\nbefore being merged. We connect the task subspace of a given model to its loss\nlandscape and formalize how this approach to model merging can be seen as\nsolving a linear system of equations. While past work has generally been\nlimited to linear systems that have a closed-form solution, we consider using\nthe conjugate gradient method to find a solution. We show that using the\nconjugate gradient method can outperform closed-form solutions, enables merging\nvia linear systems that are otherwise intractable to solve, and flexibly allows\nchoosing from a wide variety of initializations and estimates for the ''task\nsubspace''. We ultimately demonstrate that our merging framework called\n''Matching Models in their Task Subspace'' (MaTS) achieves state-of-the-art\nresults in multitask and intermediate-task model merging. We release all of the\ncode and checkpoints used in our work at https://github.com/r-three/mats.\n","authors":["Derek Tam","Mohit Bansal","Colin Raffel"],"pdf_url":"https://arxiv.org/pdf/2312.04339v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.13434v2","updated":"2023-12-07T14:59:02Z","published":"2023-11-22T14:47:54Z","title":"Recurrent neural networks and transfer learning for elasto-plasticity in\n woven composites","summary":" As a surrogate for computationally intensive meso-scale simulation of woven\ncomposites, this article presents Recurrent Neural Network (RNN) models.\nLeveraging the power of transfer learning, the initialization challenges and\nsparse data issues inherent in cyclic shear strain loads are addressed in the\nRNN models. A mean-field model generates a comprehensive data set representing\nelasto-plastic behavior. In simulations, arbitrary six-dimensional strain\nhistories are used to predict stresses under random walking as the source task\nand cyclic loading conditions as the target task. Incorporating sub-scale\nproperties enhances RNN versatility. In order to achieve accurate predictions,\nthe model uses a grid search method to tune network architecture and\nhyper-parameter configurations. 
The results of this study demonstrate that\ntransfer learning can be used to effectively adapt the RNN to varying strain\nconditions, which establishes its potential as a useful tool for modeling\npath-dependent responses in woven composites.\n","authors":["Ehsan Ghane","Martin Fagerström","Mohsen Mirkhalaf"],"pdf_url":"https://arxiv.org/pdf/2311.13434v2.pdf","comment":"There are 25 pages and 13 EPS images. The paper includes links to\n supporting materials"},{"id":"http://arxiv.org/abs/2312.04330v1","updated":"2023-12-07T14:48:30Z","published":"2023-12-07T14:48:30Z","title":"Surrogate Modelling for Sea Ice Concentration using Lightweight Neural\n Ensemble","summary":" The modeling and forecasting of sea ice conditions in the Arctic region are\nimportant tasks for ship routing, offshore oil production, and environmental\nmonitoring. We propose the adaptive surrogate modeling approach named LANE-SI\n(Lightweight Automated Neural Ensembling for Sea Ice) that uses ensemble of\nrelatively simple deep learning models with different loss functions for\nforecasting of spatial distribution for sea ice concentration in the specified\nwater area. Experimental studies confirm the quality of a long-term forecast\nbased on a deep learning model fitted to the specific water area is comparable\nto resource-intensive physical modeling, and for some periods of the year, it\nis superior. We achieved a 20% improvement against the state-of-the-art\nphysics-based forecast system SEAS5 for the Kara Sea.\n","authors":["Julia Borisova","Nikolay O. Nikitin"],"pdf_url":"https://arxiv.org/pdf/2312.04330v1.pdf","comment":"7 pages, 6 figures"},{"id":"http://arxiv.org/abs/2312.04327v1","updated":"2023-12-07T14:38:07Z","published":"2023-12-07T14:38:07Z","title":"Learning to sample in Cartesian MRI","summary":" Despite its exceptional soft tissue contrast, Magnetic Resonance Imaging\n(MRI) faces the challenge of long scanning times compared to other modalities\nlike X-ray radiography. Shortening scanning times is crucial in clinical\nsettings, as it increases patient comfort, decreases examination costs and\nimproves throughput. Recent advances in compressed sensing (CS) and deep\nlearning allow accelerated MRI acquisition by reconstructing high-quality\nimages from undersampled data. While reconstruction algorithms have received\nmost of the focus, designing acquisition trajectories to optimize\nreconstruction quality remains an open question. This thesis explores two\napproaches to address this gap in the context of Cartesian MRI. First, we\npropose two algorithms, lazy LBCS and stochastic LBCS, that significantly\nimprove upon G\\\"ozc\\\"u et al.'s greedy learning-based CS (LBCS) approach. These\nalgorithms scale to large, clinically relevant scenarios like multi-coil 3D MR\nand dynamic MRI, previously inaccessible to LBCS. Additionally, we demonstrate\nthat generative adversarial networks (GANs) can serve as a natural criterion\nfor adaptive sampling by leveraging variance in the measurement domain to guide\nacquisition. Second, we delve into the underlying structures or assumptions\nthat enable mask design algorithms to perform well in practice. Our experiments\nreveal that state-of-the-art deep reinforcement learning (RL) approaches, while\ncapable of adaptation and long-horizon planning, offer only marginal\nimprovements over stochastic LBCS, which is neither adaptive nor does long-term\nplanning. 
Altogether, our findings suggest that stochastic LBCS and similar\nmethods represent promising alternatives to deep RL. They shine in particular\nby their scalability and computational efficiency and could be key in the\ndeployment of optimized acquisition trajectories in Cartesian MRI.\n","authors":["Thomas Sanchez"],"pdf_url":"https://arxiv.org/pdf/2312.04327v1.pdf","comment":"PhD Thesis; 198 pages"},{"id":"http://arxiv.org/abs/2312.04323v1","updated":"2023-12-07T14:32:32Z","published":"2023-12-07T14:32:32Z","title":"Equivariant Scalar Fields for Molecular Docking with Fast Fourier\n Transforms","summary":" Molecular docking is critical to structure-based virtual screening, yet the\nthroughput of such workflows is limited by the expensive optimization of\nscoring functions involved in most docking algorithms. We explore how machine\nlearning can accelerate this process by learning a scoring function with a\nfunctional form that allows for more rapid optimization. Specifically, we\ndefine the scoring function to be the cross-correlation of multi-channel ligand\nand protein scalar fields parameterized by equivariant graph neural networks,\nenabling rapid optimization over rigid-body degrees of freedom with fast\nFourier transforms. The runtime of our approach can be amortized at several\nlevels of abstraction, and is particularly favorable for virtual screening\nsettings with a common binding pocket. We benchmark our scoring functions on\ntwo simplified docking-related tasks: decoy pose scoring and rigid conformer\ndocking. Our method attains similar but faster performance on crystal\nstructures compared to the widely-used Vina and Gnina scoring functions, and is\nmore robust on computationally predicted structures. Code is available at\nhttps://github.com/bjing2016/scalar-fields.\n","authors":["Bowen Jing","Tommi Jaakkola","Bonnie Berger"],"pdf_url":"https://arxiv.org/pdf/2312.04323v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04318v1","updated":"2023-12-07T14:21:31Z","published":"2023-12-07T14:21:31Z","title":"MIMo: A Multi-Modal Infant Model for Studying Cognitive Development","summary":" Human intelligence and human consciousness emerge gradually during the\nprocess of cognitive development. Understanding this development is an\nessential aspect of understanding the human mind and may facilitate the\nconstruction of artificial minds with similar properties. Importantly, human\ncognitive development relies on embodied interactions with the physical and\nsocial environment, which is perceived via complementary sensory modalities.\nThese interactions allow the developing mind to probe the causal structure of\nthe world. This is in stark contrast to common machine learning approaches,\ne.g., for large language models, which are merely passively ``digesting'' large\namounts of training data, but are not in control of their sensory inputs.\nHowever, computational modeling of the kind of self-determined embodied\ninteractions that lead to human intelligence and consciousness is a formidable\nchallenge. Here we present MIMo, an open-source multi-modal infant model for\nstudying early cognitive development through computer simulations. MIMo's body\nis modeled after an 18-month-old child with detailed five-fingered hands. MIMo\nperceives its surroundings via binocular vision, a vestibular system,\nproprioception, and touch perception through a full-body virtual skin, while\ntwo different actuation models allow control of his body. 
We describe the\ndesign and interfaces of MIMo and provide examples illustrating its use. All\ncode is available at https://github.com/trieschlab/MIMo .\n","authors":["Dominik Mattern","Pierre Schumacher","Francisco M. López","Marcel C. Raabe","Markus R. Ernst","Arthur Aubret","Jochen Triesch"],"pdf_url":"https://arxiv.org/pdf/2312.04318v1.pdf","comment":"11 pages, 8 figures. Submitted to IEEE Transactions on Congnitive and\n Developmental Systems (TCDS)"},{"id":"http://arxiv.org/abs/2312.04312v1","updated":"2023-12-07T14:09:27Z","published":"2023-12-07T14:09:27Z","title":"Stochastic-Constrained Stochastic Optimization with Markovian Data","summary":" This paper considers stochastic-constrained stochastic optimization where the\nstochastic constraint is to satisfy that the expectation of a random function\nis below a certain threshold. In particular, we study the setting where data\nsamples are drawn from a Markov chain and thus are not independent and\nidentically distributed. We generalize the drift-plus-penalty framework, a\nprimal-dual stochastic gradient method developed for the i.i.d. case, to the\nMarkov chain sampling setting. We propose two variants of drift-plus-penalty;\none is for the case when the mixing time of the underlying Markov chain is\nknown while the other is for the case of unknown mixing time. In fact, our\nalgorithms apply to a more general setting of constrained online convex\noptimization where the sequence of constraint functions follows a Markov chain.\nBoth algorithms are adaptive in that the first works without knowledge of the\ntime horizon while the second uses AdaGrad-style algorithm parameters, which is\nof independent interest. We demonstrate the effectiveness of our proposed\nmethods through numerical experiments on classification with fairness\nconstraints.\n","authors":["Yeongjong Kim","Dabeen Lee"],"pdf_url":"https://arxiv.org/pdf/2312.04312v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04311v1","updated":"2023-12-07T14:09:18Z","published":"2023-12-07T14:09:18Z","title":"Finding Interpretable Class-Specific Patterns through Efficient Neural\n Search","summary":" Discovering patterns in data that best describe the differences between\nclasses allows to hypothesize and reason about class-specific mechanisms. In\nmolecular biology, for example, this bears promise of advancing the\nunderstanding of cellular processes differing between tissues or diseases,\nwhich could lead to novel treatments. To be useful in practice, methods that\ntackle the problem of finding such differential patterns have to be readily\ninterpretable by domain experts, and scalable to the extremely high-dimensional\ndata.\n In this work, we propose a novel, inherently interpretable binary neural\nnetwork architecture DIFFNAPS that extracts differential patterns from data.\nDiffNaps is scalable to hundreds of thousands of features and robust to noise,\nthus overcoming the limitations of current state-of-the-art methods in\nlarge-scale applications such as in biology. 
We show on synthetic and real\nworld data, including three biological applications, that, unlike its\ncompetitors, DiffNaps consistently yields accurate, succinct, and interpretable\nclass descriptions\n","authors":["Nils Philipp Walter","Jonas Fischer","Jilles Vreeken"],"pdf_url":"https://arxiv.org/pdf/2312.04311v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04307v1","updated":"2023-12-07T14:04:38Z","published":"2023-12-07T14:04:38Z","title":"A Structural-Clustering Based Active Learning for Graph Neural Networks","summary":" In active learning for graph-structured data, Graph Neural Networks (GNNs)\nhave shown effectiveness. However, a common challenge in these applications is\nthe underutilization of crucial structural information. To address this\nproblem, we propose the Structural-Clustering PageRank method for improved\nActive learning (SPA) specifically designed for graph-structured data. SPA\nintegrates community detection using the SCAN algorithm with the PageRank\nscoring method for efficient and informative sample selection. SPA prioritizes\nnodes that are not only informative but also central in structure. Through\nextensive experiments, SPA demonstrates higher accuracy and macro-F1 score over\nexisting methods across different annotation budgets and achieves significant\nreductions in query time. In addition, the proposed method only adds two\nhyperparameters, $\\epsilon$ and $\\mu$ in the algorithm to finely tune the\nbalance between structural learning and node selection. This simplicity is a\nkey advantage in active learning scenarios, where extensive hyperparameter\ntuning is often impractical.\n","authors":["Ricky Maulana Fajri","Yulong Pei","Lu Yin","Mykola Pechenizkiy"],"pdf_url":"https://arxiv.org/pdf/2312.04307v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.11835v3","updated":"2023-12-07T13:54:20Z","published":"2023-02-23T07:51:00Z","title":"Reinforcement Learning for Combining Search Methods in the Calibration\n of Economic ABMs","summary":" Calibrating agent-based models (ABMs) in economics and finance typically\ninvolves a derivative-free search in a very large parameter space. In this\nwork, we benchmark a number of search methods in the calibration of a\nwell-known macroeconomic ABM on real data, and further assess the performance\nof \"mixed strategies\" made by combining different methods. We find that methods\nbased on random-forest surrogates are particularly efficient, and that\ncombining search methods generally increases performance since the biases of\nany single method are mitigated. Moving from these observations, we propose a\nreinforcement learning (RL) scheme to automatically select and combine search\nmethods on-the-fly during a calibration run. The RL agent keeps exploiting a\nspecific method only as long as this keeps performing well, but explores new\nstrategies when the specific method reaches a performance plateau. The\nresulting RL search scheme outperforms any other method or method combination\ntested, and does not rely on any prior information or trial and error\nprocedure.\n","authors":["Aldo Glielmo","Marco Favorito","Debmallya Chanda","Domenico Delli Gatti"],"pdf_url":"https://arxiv.org/pdf/2302.11835v3.pdf","comment":"9 pages and 5 figures, presented at the AAAI bridge program 'AI for\n Financial Institutions' (https://aaai23.bankit.art/), at the ICLR bridge\n program 'AI4ABM' (https://ai4abm.org/workshop_iclr2023/) and at ICAIF '23\n (https://ai-finance.org/). 
Proceedings of the Fourth ACM International\n Conference on AI in Finance, (ICAIF 23), Association for Computing Machinery,\n New York, NY, USA"},{"id":"http://arxiv.org/abs/2309.03755v2","updated":"2023-12-07T13:42:53Z","published":"2023-09-07T14:51:42Z","title":"TSGBench: Time Series Generation Benchmark","summary":" Synthetic Time Series Generation (TSG) is crucial in a range of applications,\nincluding data augmentation, anomaly detection, and privacy preservation.\nAlthough significant strides have been made in this field, existing methods\nexhibit three key limitations: (1) They often benchmark against similar model\ntypes, constraining a holistic view of performance capabilities. (2) The use of\nspecialized synthetic and private datasets introduces biases and hampers\ngeneralizability. (3) Ambiguous evaluation measures, often tied to custom\nnetworks or downstream tasks, hinder consistent and fair comparison.\n To overcome these limitations, we introduce \\textsf{TSGBench}, the inaugural\nTime Series Generation Benchmark, designed for a unified and comprehensive\nassessment of TSG methods. It comprises three modules: (1) a curated collection\nof publicly available, real-world datasets tailored for TSG, together with a\nstandardized preprocessing pipeline; (2) a comprehensive evaluation measures\nsuite including vanilla measures, new distance-based assessments, and\nvisualization tools; (3) a pioneering generalization test rooted in Domain\nAdaptation (DA), compatible with all methods. We have conducted comprehensive\nexperiments using \\textsf{TSGBench} across a spectrum of ten real-world\ndatasets from diverse domains, utilizing ten advanced TSG methods and twelve\nevaluation measures. The results highlight the reliability and efficacy of\n\\textsf{TSGBench} in evaluating TSG methods. Crucially, \\textsf{TSGBench}\ndelivers a statistical analysis of the performance rankings of these methods,\nilluminating their varying performance across different datasets and measures\nand offering nuanced insights into the effectiveness of each method.\n","authors":["Yihao Ang","Qiang Huang","Yifan Bao","Anthony K. H. Tung","Zhiyong Huang"],"pdf_url":"https://arxiv.org/pdf/2309.03755v2.pdf","comment":"Accepted and to appear in VLDB 2024"},{"id":"http://arxiv.org/abs/2312.04291v1","updated":"2023-12-07T13:18:36Z","published":"2023-12-07T13:18:36Z","title":"Simulating the Air Quality Impact of Prescribed Fires Using a Graph\n Neural Network-Based PM$_{2.5}$ Emissions Forecasting System","summary":" The increasing size and severity of wildfires across western North America\nhave generated dangerous levels of PM$_{2.5}$ pollution in recent years. In a\nwarming climate, expanding the use of prescribed fires is widely considered to\nbe the most robust fire mitigation strategy. However, reliably forecasting the\npotential air quality impact from these prescribed fires, a critical ingredient\nin determining the fires' location and time, at hourly to daily time scales\nremains a challenging problem. This paper proposes a novel integration of\nprescribed fire simulation with a spatio-temporal graph neural network-based\nPM$_{2.5}$ forecasting model. 
The experiments in this work focus on determining\nthe optimal time for implementing prescribed fires in California as well as\nquantifying the potential air quality trade-offs involved in conducting more\nprescribed fires outside the fire season.\n","authors":["Kyleen Liao","Jatan Buch","Kara Lamb","Pierre Gentine"],"pdf_url":"https://arxiv.org/pdf/2312.04291v1.pdf","comment":"4 + 3 pages; accepted to the Tackling Climate Change with Machine\n Learning Workshop at NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.04281v1","updated":"2023-12-07T13:05:47Z","published":"2023-12-07T13:05:47Z","title":"Factor-Assisted Federated Learning for Personalized Optimization with\n Heterogeneous Data","summary":" Federated learning is an emerging distributed machine learning framework\naiming at protecting data privacy. Data heterogeneity is one of the core\nchallenges in federated learning, which could severely degrade the convergence\nrate and prediction performance of deep neural networks. To address this issue,\nwe develop a novel personalized federated learning framework for heterogeneous\ndata, which we refer to as FedSplit. This modeling framework is motivated by\nthe finding that data in different clients contain both common knowledge and\npersonalized knowledge. Then the hidden elements in each neural layer can be\nsplit into the shared and personalized groups. With this decomposition, a novel\nobjective function is established and optimized. We demonstrate FedSplit\nenjoys a faster convergence speed than the standard federated learning method\nboth theoretically and empirically. The generalization bound of the FedSplit\nmethod is also studied. To practically implement the proposed method on real\ndatasets, factor analysis is introduced to facilitate the decoupling of hidden\nelements. This leads to a practically implemented model for FedSplit, which we\nfurther refer to as FedFac. We demonstrate by simulation studies that using\nfactor analysis can well recover the underlying shared/personalized\ndecomposition. The superior prediction performance of FedFac is further\nverified empirically by comparison with various state-of-the-art federated\nlearning methods on several real datasets.\n","authors":["Feifei Wang","Huiyun Tang","Yang Li"],"pdf_url":"https://arxiv.org/pdf/2312.04281v1.pdf","comment":"29 pages, 10 figures"},{"id":"http://arxiv.org/abs/2312.04275v1","updated":"2023-12-07T12:54:16Z","published":"2023-12-07T12:54:16Z","title":"Estimating Countries with Similar Maternal Mortality Rate using Cluster\n Analysis and Pairing Countries with Identical MMR","summary":" In the evolving world, we require more additionally the young era to flourish\nand evolve into developed land. Most of the population all around the world are\nunaware of the complications involved in the routine they follow while they are\npregnant and how hospital facilities affect maternal health. Maternal Mortality\nis the death of a pregnant woman due to intricacies correlated to pregnancy,\nunderlying circumstances exacerbated by the pregnancy or management of these\nsituations. It is crucial to consider the Maternal Mortality Rate (MMR) in\ndiverse locations and determine which human routines and hospital facilities\ndiminish the Maternal Mortality Rate (MMR). This research aims to examine and\ndiscover the countries which are keeping more lavish threats of MMR and\ncountries alike in MMR encountered. Data is examined and collected for various\ncountries; the data consists of the earlier years' observations. 
From the\nperspective of Machine Learning, Unsupervised Machine Learning is implemented\nto perform Cluster Analysis. Therefore the pairs of countries with similar MMR\nas well as the extreme opposite pair concerning the MMR are found.\n","authors":["S. Nandini","Sanjjushri Varshini R"],"pdf_url":"https://arxiv.org/pdf/2312.04275v1.pdf","comment":"14 pages, 6 figures"},{"id":"http://arxiv.org/abs/2312.04273v1","updated":"2023-12-07T12:53:05Z","published":"2023-12-07T12:53:05Z","title":"Invariant Random Forest: Tree-Based Model Solution for OOD\n Generalization","summary":" Out-Of-Distribution (OOD) generalization is an essential topic in machine\nlearning. However, recent research is only focusing on the corresponding\nmethods for neural networks. This paper introduces a novel and effective\nsolution for OOD generalization of decision tree models, named Invariant\nDecision Tree (IDT). IDT enforces a penalty term with regard to the\nunstable/varying behavior of a split across different environments during the\ngrowth of the tree. Its ensemble version, the Invariant Random Forest (IRF), is\nconstructed. Our proposed method is motivated by a theoretical result under\nmild conditions, and validated by numerical tests with both synthetic and real\ndatasets. The superior performance compared to non-OOD tree models implies that\nconsidering OOD generalization for tree models is absolutely necessary and\nshould be given more attention.\n","authors":["Yufan Liao","Qi Wu","Xing Yan"],"pdf_url":"https://arxiv.org/pdf/2312.04273v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2305.02041v3","updated":"2023-12-07T12:41:06Z","published":"2023-05-03T11:11:46Z","title":"Low-complexity subspace-descent over symmetric positive definite\n manifold","summary":" This work puts forth low-complexity Riemannian subspace descent algorithms\nfor the minimization of functions over the symmetric positive definite (SPD)\nmanifold. Different from the existing Riemannian gradient descent variants, the\nproposed approach utilizes carefully chosen subspaces that allow the update to\nbe written as a product of the Cholesky factor of the iterate and a sparse\nmatrix. The resulting updates avoid the costly matrix operations like matrix\nexponentiation and dense matrix multiplication, which are generally required in\nalmost all other Riemannian optimization algorithms on SPD manifold. We further\nidentify a broad class of functions, arising in diverse applications, such as\nkernel matrix learning, covariance estimation of Gaussian distributions,\nmaximum likelihood parameter estimation of elliptically contoured\ndistributions, and parameter estimation in Gaussian mixture model problems,\nover which the Riemannian gradients can be calculated efficiently. The proposed\nuni-directional and multi-directional Riemannian subspace descent variants\nincur per-iteration complexities of $\\O(n)$ and $\\O(n^2)$ respectively, as\ncompared to the $\\O(n^3)$ or higher complexity incurred by all existing\nRiemannian gradient descent variants. 
The superior runtime and low\nper-iteration complexity of the proposed algorithms are also demonstrated via\nnumerical tests on large-scale covariance estimation and matrix square root\nproblems.\n","authors":["Yogesh Darmwal","Ketan Rajawat"],"pdf_url":"https://arxiv.org/pdf/2305.02041v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2302.10903v2","updated":"2023-12-07T12:27:56Z","published":"2023-02-11T06:22:50Z","title":"Trajectory-User Linking via Hierarchical Spatio-Temporal Attention\n Networks","summary":" Trajectory-User Linking (TUL) is crucial for human mobility modeling by\nlinking different trajectories to users through the exploration of complex mobility\npatterns. Existing works mainly rely on the recurrent neural framework to\nencode the temporal dependencies in trajectories, but fall short in capturing\nthe spatial-temporal global context for TUL prediction. To fill this gap, this work\npresents a new hierarchical spatio-temporal attention neural network, called\nAttnTUL, to jointly encode the local trajectory transitional patterns and\nglobal spatial dependencies for TUL. Specifically, our first model component is\nbuilt over the graph neural architecture to preserve the local and global\ncontext and enhance the representation paradigm of geographical regions and\nuser trajectories. Additionally, a hierarchically structured attention network\nis designed to simultaneously encode the intra-trajectory and inter-trajectory\ndependencies, with the integration of the temporal attention mechanism and\nglobal elastic attentional encoder. Extensive experiments demonstrate the\nsuperiority of our AttnTUL method as compared to state-of-the-art baselines on\nvarious trajectory datasets. The source code of our model is available at\nhttps://github.com/Onedean/AttnTUL.\n","authors":["Wei Chen","Chao Huang","Yanwei Yu","Yongguo Jiang","Junyu Dong"],"pdf_url":"https://arxiv.org/pdf/2302.10903v2.pdf","comment":"22 pages, 8 figures, accepted by ACM Trans. Knowl. Discov. Data\n Journal (TKDD)"},{"id":"http://arxiv.org/abs/2311.14388v2","updated":"2023-12-07T12:20:25Z","published":"2023-11-24T10:07:14Z","title":"A Parameterized Generative Adversarial Network Using Cyclic Projection\n for Explainable Medical Image Classification","summary":" Although current data augmentation methods successfully alleviate\ndata insufficiency, conventional augmentation is primarily intra-domain, while\nimages generated by advanced generative adversarial networks (GANs) remain\nuncertain, particularly on small-scale datasets. In this paper, we propose a\nparameterized GAN (ParaGAN) that effectively controls the changes of synthetic\nsamples among domains and highlights the attention regions for downstream\nclassification. Specifically, ParaGAN incorporates projection distance\nparameters in cyclic projection and projects the source images to the decision\nboundary to obtain the class-difference maps. Our experiments show that ParaGAN\ncan consistently outperform the existing augmentation methods with explainable\nclassification on two small-scale medical datasets.\n","authors":["Xiangyu Xiong","Yue Sun","Xiaohong Liu","Chan-Tong Lam","Tong Tong","Hao Chen","Qinquan Gao","Wei Ke","Tao Tan"],"pdf_url":"https://arxiv.org/pdf/2311.14388v2.pdf","comment":"5 pages, 4 figures. This work has been submitted to the IEEE for\n possible publication. 
Copyright may be transferred without notice, after\n which this version may no longer be accessible"},{"id":"http://arxiv.org/abs/2311.00502v2","updated":"2023-12-07T12:16:42Z","published":"2023-11-01T13:08:50Z","title":"Efficient LLM Inference on CPUs","summary":" Large language models (LLMs) have demonstrated remarkable performance and\ntremendous potential across a wide range of tasks. However, deploying these\nmodels has been challenging due to the astronomical number of model parameters,\nwhich demands large memory capacity and high memory bandwidth. In\nthis paper, we propose an effective approach that makes the deployment of\nLLMs more efficient. We support an automatic INT4 weight-only quantization\nflow and design a special LLM runtime with highly-optimized kernels to\naccelerate the LLM inference on CPUs. We demonstrate the general applicability\nof our approach on popular LLMs including Llama2, Llama, and GPT-NeoX, and showcase\nthe extreme inference efficiency on CPUs. The code is publicly available at:\nhttps://github.com/intel/intel-extension-for-transformers.\n","authors":["Haihao Shen","Hanwen Chang","Bo Dong","Yu Luo","Hengyu Meng"],"pdf_url":"https://arxiv.org/pdf/2311.00502v2.pdf","comment":"NeurIPS'2023 on Efficient Natural Language and Speech Processing"},{"id":"http://arxiv.org/abs/2307.14023v2","updated":"2023-12-07T11:53:31Z","published":"2023-07-26T08:07:37Z","title":"Are Transformers with One Layer Self-Attention Using Low-Rank Weight\n Matrices Universal Approximators?","summary":" Existing analyses of the expressive capacity of Transformer models have\nrequired excessively deep layers for data memorization, leading to a\ndiscrepancy with the Transformers actually used in practice. This is primarily\ndue to the interpretation of the softmax function as an approximation of the\nhardmax function. By clarifying the connection between the softmax function and\nthe Boltzmann operator, we prove that a single layer of self-attention with\nlow-rank weight matrices possesses the capability to perfectly capture the\ncontext of an entire input sequence. As a consequence, we show that one-layer\nand single-head Transformers have a memorization capacity for finite samples,\nand that Transformers consisting of one self-attention layer with two\nfeed-forward neural networks are universal approximators for continuous\npermutation equivariant functions on a compact domain.\n","authors":["Tokio Kajitsuka","Issei Sato"],"pdf_url":"https://arxiv.org/pdf/2307.14023v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04234v1","updated":"2023-12-07T11:40:32Z","published":"2023-12-07T11:40:32Z","title":"Graph Convolutions Enrich the Self-Attention in Transformers!","summary":" Transformers, renowned for their self-attention mechanism, have achieved\nstate-of-the-art performance across various tasks in natural language\nprocessing, computer vision, time-series modeling, etc. However, one of the\nchallenges with deep Transformer models is the oversmoothing problem, where\nrepresentations across layers converge to indistinguishable values, leading to\nsignificant performance degradation. We interpret the original self-attention\nas a simple graph filter and redesign it from a graph signal processing (GSP)\nperspective. We propose graph-filter-based self-attention (GFSA) to learn a\ngeneral yet effective graph filter, whose complexity, however, is slightly larger than\nthat of the original self-attention mechanism. 
We demonstrate that GFSA\nimproves the performance of Transformers in various fields, including computer\nvision, natural language processing, graph pattern classification, speech\nrecognition, and code classification.\n","authors":["Jeongwhan Choi","Hyowon Wi","Jayoung Kim","Yehjin Shin","Kookjin Lee","Nathaniel Trask","Noseong Park"],"pdf_url":"https://arxiv.org/pdf/2312.04234v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.13423v3","updated":"2023-12-07T11:39:58Z","published":"2023-07-25T11:42:52Z","title":"Non Intrusive Intelligibility Predictor for Hearing Impaired Individuals\n using Self Supervised Speech Representations","summary":" Self-supervised speech representations (SSSRs) have been successfully applied\nto a number of speech-processing tasks, e.g. as feature extractor for speech\nquality (SQ) prediction, which is, in turn, relevant for assessment and\ntraining speech enhancement systems for users with normal or impaired hearing.\nHowever, exact knowledge of why and how quality-related information is encoded\nwell in such representations remains poorly understood. In this work,\ntechniques for non-intrusive prediction of SQ ratings are extended to the\nprediction of intelligibility for hearing-impaired users. It is found that\nself-supervised representations are useful as input features to non-intrusive\nprediction models, achieving competitive performance to more complex systems. A\ndetailed analysis of the performance depending on Clarity Prediction Challenge\n1 listeners and enhancement systems indicates that more data might be needed to\nallow generalisation to unknown systems and (hearing-impaired) individuals\n","authors":["George Close","Thomas Hain","Stefan Goetze"],"pdf_url":"https://arxiv.org/pdf/2307.13423v3.pdf","comment":"Accepted @ ASRU 2023 SPARKS workshop"},{"id":"http://arxiv.org/abs/2311.05587v3","updated":"2023-12-07T11:25:57Z","published":"2023-11-09T18:47:33Z","title":"Bayesian Methods for Media Mix Modelling with shape and funnel effects","summary":" In recent years, significant progress in generative AI has highlighted the\nimportant role of physics-inspired models that utilize advanced mathematical\nconcepts based on fundamental physics principles to enhance artificial\nintelligence capabilities. Among these models, those based on diffusion\nequations have greatly improved image quality. This study aims to explore the\npotential uses of Maxwell-Boltzmann equation, which forms the basis of the\nkinetic theory of gases, and the Michaelis-Menten model in Marketing Mix\nModelling (MMM) applications. We propose incorporating these equations into\nHierarchical Bayesian models to analyse consumer behaviour in the context of\nadvertising. These equation sets excel in accurately describing the random\ndynamics in complex systems like social interactions and consumer-advertising\ninteractions.\n","authors":["Javier Marin"],"pdf_url":"https://arxiv.org/pdf/2311.05587v3.pdf","comment":"Rev. 3, December 2023"},{"id":"http://arxiv.org/abs/2210.11407v3","updated":"2023-12-07T11:24:37Z","published":"2022-10-20T16:56:47Z","title":"Similarity of Neural Architectures using Adversarial Attack\n Transferability","summary":" In recent years, many deep neural architectures have been developed for image\nclassification. Whether they are similar or dissimilar and what factors\ncontribute to their (dis)similarities remains curious. To address this\nquestion, we aim to design a quantitative and scalable similarity measure\nbetween neural architectures. 
We propose Similarity by Attack Transferability\n(SAT) from the observation that adversarial attack transferability contains\ninformation related to input gradients and decision boundaries, which are widely used to\nunderstand model behaviors. We conduct a large-scale analysis on 69\nstate-of-the-art ImageNet classifiers using our proposed similarity function to\nanswer the question. Moreover, using model similarity, we observe neural\narchitecture-related phenomena, namely that model diversity can lead to better performance in\nmodel ensembles and knowledge distillation under specific conditions. Our\nresults provide insights into why developing diverse neural architectures with\ndistinct components is necessary.\n","authors":["Jaehui Hwang","Dongyoon Han","Byeongho Heo","Song Park","Sanghyuk Chun","Jong-Seok Lee"],"pdf_url":"https://arxiv.org/pdf/2210.11407v3.pdf","comment":"20 pages, 13 figures, 2.3MB"},{"id":"http://arxiv.org/abs/2302.13711v2","updated":"2023-12-07T11:23:25Z","published":"2023-02-27T12:18:19Z","title":"Internal-Coordinate Density Modelling of Protein Structure: Covariance\n Matters","summary":" After the recent ground-breaking advances in protein structure prediction,\none of the remaining challenges in protein machine learning is to reliably\npredict distributions of structural states. Parametric models of fluctuations\nare difficult to fit due to complex covariance structures between degrees of\nfreedom in the protein chain, often causing models to either violate local or\nglobal structural constraints. In this paper, we present a new strategy for\nmodelling protein densities in internal coordinates, which uses constraints in\n3D space to induce covariance structure between the internal degrees of\nfreedom. We illustrate the potential of the procedure by constructing a\nvariational autoencoder with full covariance output induced by the constraints\nimplied by the conditional mean in 3D, and demonstrate that our approach makes\nit possible to scale density models of internal coordinates to full protein\nbackbones in two settings: 1) a unimodal setting for proteins exhibiting small\nfluctuations and limited amounts of available data, and 2) a multimodal setting\nfor larger conformational changes in a high data regime.\n","authors":["Marloes Arts","Jes Frellsen","Wouter Boomsma"],"pdf_url":"https://arxiv.org/pdf/2302.13711v2.pdf","comment":"Pages: 9 main, 3 references, 8 appendix. Figures: 5 main, 6 appendix"},{"id":"http://arxiv.org/abs/2306.04843v2","updated":"2023-12-07T11:19:57Z","published":"2023-06-08T00:31:27Z","title":"Classical Verification of Quantum Learning","summary":" Quantum data access and quantum processing can make certain classically\nintractable learning tasks feasible. However, quantum capabilities will only be\navailable to a select few in the near future. Thus, reliable schemes that allow\nclassical clients to delegate learning to untrusted quantum servers are\nrequired to facilitate widespread access to quantum learning advantages.\nBuilding on a recently introduced framework of interactive proof systems for\nclassical machine learning, we develop a framework for classical verification\nof quantum learning. We exhibit learning problems that a classical learner\ncannot efficiently solve on their own, but that they can efficiently and\nreliably solve when interacting with an untrusted quantum prover. Concretely,\nwe consider the problems of agnostic learning parities and Fourier-sparse\nfunctions with respect to distributions with uniform input marginal. 
We propose\na new quantum data access model that we call \"mixture-of-superpositions\"\nquantum examples, based on which we give efficient quantum learning algorithms\nfor these tasks. Moreover, we prove that agnostic quantum parity and\nFourier-sparse learning can be efficiently verified by a classical verifier\nwith only random example or statistical query access. Finally, we showcase two\ngeneral scenarios in learning and verification in which quantum\nmixture-of-superpositions examples do not lead to sample complexity\nimprovements over classical data. Our results demonstrate that the potential\npower of quantum data for learning tasks, while not unlimited, can be utilized\nby classical agents through interaction with untrusted quantum entities.\n","authors":["Matthias C. Caro","Marcel Hinsche","Marios Ioannou","Alexander Nietner","Ryan Sweke"],"pdf_url":"https://arxiv.org/pdf/2306.04843v2.pdf","comment":"12 + 46 + 23 pages, 1 table, 1 figure; V2 fixes some typos and\n includes new results in Section 6.3"},{"id":"http://arxiv.org/abs/2310.09819v2","updated":"2023-12-07T11:11:18Z","published":"2023-10-15T12:35:27Z","title":"Optimizing K-means for Big Data: A Comparative Study","summary":" This paper presents a comparative analysis of different optimization\ntechniques for the K-means algorithm in the context of big data. K-means is a\nwidely used clustering algorithm, but it can suffer from scalability issues\nwhen dealing with large datasets. The paper explores different approaches to\novercome these issues, including parallelization, approximation, and sampling\nmethods. The authors evaluate the performance of these techniques on various\nbenchmark datasets and compare them in terms of speed, quality of clustering,\nand scalability according to the LIMA dominance criterion. The results show\nthat different techniques are more suitable for different types of datasets and\nprovide insights into the trade-offs between speed and accuracy in K-means\nclustering for big data. Overall, the paper offers a comprehensive guide for\npractitioners and researchers on how to optimize K-means for big data\napplications.\n","authors":["Ravil Mussabayev","Rustam Mussabayev"],"pdf_url":"https://arxiv.org/pdf/2310.09819v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04216v1","updated":"2023-12-07T11:04:37Z","published":"2023-12-07T11:04:37Z","title":"CODEX: A Cluster-Based Method for Explainable Reinforcement Learning","summary":" Despite the impressive feats demonstrated by Reinforcement Learning (RL),\nthese algorithms have seen little adoption in high-risk, real-world\napplications due to current difficulties in explaining RL agent actions and\nbuilding user trust. We present Counterfactual Demonstrations for Explanation\n(CODEX), a method that incorporates semantic clustering, which can effectively\nsummarize RL agent behavior in the state-action space. Experimentation on the\nMiniGrid and StarCraft II gaming environments reveals the semantic clusters\nretain temporal as well as entity information, which is reflected in the\nconstructed summary of agent behavior. Furthermore, clustering the\ndiscrete+continuous game-state latent representations identifies the most\ncrucial episodic events, demonstrating a relationship between the latent and\nsemantic spaces. This work contributes to the growing body of work that strives\nto unlock the power of RL for widespread use by leveraging and extending\ntechniques from Natural Language Processing.\n","authors":["Timothy K. 
Mathes","Jessica Inman","Andrés Colón","Simon Khan"],"pdf_url":"https://arxiv.org/pdf/2312.04216v1.pdf","comment":"Presented at the International Joint Conference on Artificial\n Intelligence (IJCAI) 2023 Workshop on Explainable Artificial Intelligence\n (XAI)"},{"id":"http://arxiv.org/abs/2312.04215v1","updated":"2023-12-07T11:03:42Z","published":"2023-12-07T11:03:42Z","title":"Guided Reconstruction with Conditioned Diffusion Models for Unsupervised\n Anomaly Detection in Brain MRIs","summary":" Unsupervised anomaly detection in Brain MRIs aims to identify abnormalities\nas outliers from a healthy training distribution. Reconstruction-based\napproaches that use generative models to learn to reconstruct healthy brain\nanatomy are commonly used for this task. Diffusion models are an emerging class\nof deep generative models that show great potential regarding reconstruction\nfidelity. However, they face challenges in preserving intensity characteristics\nin the reconstructed images, limiting their performance in anomaly detection.\nTo address this challenge, we propose to condition the denoising mechanism of\ndiffusion models with additional information about the image to reconstruct\ncoming from a latent representation of the noise-free input image. This\nconditioning enables high-fidelity reconstruction of healthy brain structures\nwhile aligning local intensity characteristics of input-reconstruction pairs.\nWe evaluate our method's reconstruction quality, domain adaptation features and\nfinally segmentation performance on publicly available data sets with various\npathologies. Using our proposed conditioning mechanism we can reduce the\nfalse-positive predictions and enable a more precise delineation of anomalies\nwhich significantly enhances the anomaly detection performance compared to\nestablished state-of-the-art approaches to unsupervised anomaly detection in\nbrain MRI. Furthermore, our approach shows promise in domain adaptation across\ndifferent MRI acquisitions and simulated contrasts, a crucial property of\ngeneral anomaly detection methods.\n","authors":["Finn Behrendt","Debayan Bhattacharya","Robin Mieling","Lennart Maack","Julia Krüger","Roland Opfer","Alexander Schlaefer"],"pdf_url":"https://arxiv.org/pdf/2312.04215v1.pdf","comment":"Preprint"},{"id":"http://arxiv.org/abs/2312.04209v1","updated":"2023-12-07T10:52:06Z","published":"2023-12-07T10:52:06Z","title":"Constrained Hierarchical Clustering via Graph Coarsening and Optimal\n Cuts","summary":" Motivated by extracting and summarizing relevant information in short\nsentence settings, such as satisfaction questionnaires, hotel reviews, and\nX/Twitter, we study the problem of clustering words in a hierarchical fashion.\nIn particular, we focus on the problem of clustering with horizontal and\nvertical structural constraints. Horizontal constraints are typically\ncannot-link and must-link among words, while vertical constraints are\nprecedence constraints among cluster levels. We overcome state-of-the-art\nbottlenecks by formulating the problem in two steps: first, as a\nsoft-constrained regularized least-squares which guides the result of a\nsequential graph coarsening algorithm towards the horizontal feasible set.\nThen, flat clusters are extracted from the resulting hierarchical tree by\ncomputing optimal cut heights based on the available constraints. 
We show that\nthe resulting approach compares very well with respect to existing algorithms\nand is computationally light.\n","authors":["Eliabelle Mauduit","Andrea Simonetto"],"pdf_url":"https://arxiv.org/pdf/2312.04209v1.pdf","comment":"5 pages, appeared at the Asilomar Conference on Signals, Systems, and\n Computer, 11/2023"},{"id":"http://arxiv.org/abs/2312.04204v1","updated":"2023-12-07T10:40:37Z","published":"2023-12-07T10:40:37Z","title":"Wavelength-multiplexed Delayed Inputs for Memory Enhancement of\n Microring-based Reservoir Computing","summary":" We numerically demonstrate a silicon add-drop microring-based reservoir\ncomputing scheme that combines parallel delayed inputs and wavelength division\nmultiplexing. The scheme solves memory-demanding tasks like time-series\nprediction with good performance without requiring external optical feedback.\n","authors":["Bernard J. Giron Castro","Christophe Peucheret","Francesco Da Ros"],"pdf_url":"https://arxiv.org/pdf/2312.04204v1.pdf","comment":"2 pages, 2 figures. Submitted to Conference on Lasers and\n Electro-Optics (CLEO) 2024"},{"id":"http://arxiv.org/abs/2312.03510v2","updated":"2023-12-07T10:38:56Z","published":"2023-12-06T14:13:30Z","title":"Towards Sobolev Pruning","summary":" The increasing use of stochastic models for describing complex phenomena\nwarrants surrogate models that capture the reference model characteristics at a\nfraction of the computational cost, foregoing potentially expensive Monte Carlo\nsimulation. The predominant approach of fitting a large neural network and then\npruning it to a reduced size has commonly neglected shortcomings. The produced\nsurrogate models often will not capture the sensitivities and uncertainties\ninherent in the original model. In particular, (higher-order) derivative\ninformation of such surrogates could differ drastically. Given a large enough\nnetwork, we expect this derivative information to match. However, the pruned\nmodel will almost certainly not share this behavior.\n In this paper, we propose to find surrogate models by using sensitivity\ninformation throughout the learning and pruning process. We build on work using\nInterval Adjoint Significance Analysis for pruning and combine it with the\nrecent advancements in Sobolev Training to accurately model the original\nsensitivity information in the pruned neural network based surrogate model. We\nexperimentally underpin the method on an example of pricing a multidimensional\nBasket option modelled through a stochastic differential equation with Brownian\nmotion. The proposed method is, however, not limited to the domain of\nquantitative finance, which was chosen as a case study for intuitive\ninterpretations of the sensitivities. It serves as a foundation for building\nfurther surrogate modelling techniques considering sensitivity information.\n","authors":["Neil Kichler","Sher Afghan","Uwe Naumann"],"pdf_url":"https://arxiv.org/pdf/2312.03510v2.pdf","comment":"11 pages"},{"id":"http://arxiv.org/abs/2306.12816v2","updated":"2023-12-07T10:37:09Z","published":"2023-06-22T11:31:11Z","title":"XAI-TRIS: Non-linear image benchmarks to quantify false positive\n post-hoc attribution of feature importance","summary":" The field of 'explainable' artificial intelligence (XAI) has produced highly\ncited methods that seek to make the decisions of complex machine learning (ML)\nmethods 'understandable' to humans, for example by attributing 'importance'\nscores to input features. 
Yet, a lack of formal underpinning leaves it unclear\nas to what conclusions can safely be drawn from the results of a given XAI\nmethod and has also so far hindered the theoretical verification and empirical\nvalidation of XAI methods. This means that challenging non-linear problems,\ntypically solved by deep neural networks, presently lack appropriate remedies.\nHere, we craft benchmark datasets for three different non-linear classification\nscenarios, in which the important class-conditional features are known by\ndesign, serving as ground truth explanations. Using novel quantitative metrics,\nwe benchmark the explanation performance of a wide set of XAI methods across\nthree deep learning model architectures. We show that popular XAI methods are\noften unable to significantly outperform random performance baselines and edge\ndetection methods. Moreover, we demonstrate that explanations derived from\ndifferent model architectures can be vastly different; thus, prone to\nmisinterpretation even under controlled conditions.\n","authors":["Benedict Clark","Rick Wilming","Stefan Haufe"],"pdf_url":"https://arxiv.org/pdf/2306.12816v2.pdf","comment":"Under review"},{"id":"http://arxiv.org/abs/2312.04193v1","updated":"2023-12-07T10:21:22Z","published":"2023-12-07T10:21:22Z","title":"Language Model Knowledge Distillation for Efficient Question Answering\n in Spanish","summary":" Recent advances in the development of pre-trained Spanish language models has\nled to significant progress in many Natural Language Processing (NLP) tasks,\nsuch as question answering. However, the lack of efficient models imposes a\nbarrier for the adoption of such models in resource-constrained environments.\nTherefore, smaller distilled models for the Spanish language could be proven to\nbe highly scalable and facilitate their further adoption on a variety of tasks\nand scenarios. In this work, we take one step in this direction by developing\nSpanishTinyRoBERTa, a compressed language model based on RoBERTa for efficient\nquestion answering in Spanish. To achieve this, we employ knowledge\ndistillation from a large model onto a lighter model that allows for a wider\nimplementation, even in areas with limited computational resources, whilst\nattaining negligible performance sacrifice. Our experiments show that the dense\ndistilled model can still preserve the performance of its larger counterpart,\nwhile significantly increasing inference speedup. This work serves as a\nstarting point for further research and investigation of model compression\nefforts for Spanish language models across various NLP tasks.\n","authors":["Adrián Bazaga","Pietro Liò","Gos Micklem"],"pdf_url":"https://arxiv.org/pdf/2312.04193v1.pdf","comment":"6 pages, 2 tables"},{"id":"http://arxiv.org/abs/2310.04856v2","updated":"2023-12-07T10:02:06Z","published":"2023-10-07T15:31:38Z","title":"LIPEx-Locally Interpretable Probabilistic Explanations-To Look Beyond\n The True Class","summary":" In this work, we instantiate a novel perturbation-based multi-class\nexplanation framework, LIPEx (Locally Interpretable Probabilistic Explanation).\nWe demonstrate that LIPEx not only locally replicates the probability\ndistributions output by the widely used complex classification models but also\nprovides insight into how every feature deemed to be important affects the\nprediction probability for each of the possible classes. 
We achieve this by\ndefining the explanation as a matrix obtained via regression with respect to\nthe Hellinger distance in the space of probability distributions. Ablation\ntests on text and image data, show that LIPEx-guided removal of important\nfeatures from the data causes more change in predictions for the underlying\nmodel than similar tests based on other saliency-based or feature\nimportance-based Explainable AI (XAI) methods. It is also shown that compared\nto LIME, LIPEx is more data efficient in terms of using a lesser number of\nperturbations of the data to obtain a reliable explanation. This\ndata-efficiency is seen to manifest as LIPEx being able to compute its\nexplanation matrix around 53% faster than all-class LIME, for classification\nexperiments with text data.\n","authors":["Hongbo Zhu","Angelo Cangelosi","Procheta Sen","Anirbit Mukherjee"],"pdf_url":"https://arxiv.org/pdf/2310.04856v2.pdf","comment":"19 pages, 9 figures"},{"id":"http://arxiv.org/abs/2312.04174v1","updated":"2023-12-07T09:49:05Z","published":"2023-12-07T09:49:05Z","title":"Coherent energy and force uncertainty in deep learning force fields","summary":" In machine learning energy potentials for atomic systems, forces are commonly\nobtained as the negative derivative of the energy function with respect to\natomic positions. To quantify aleatoric uncertainty in the predicted energies,\na widely used modeling approach involves predicting both a mean and variance\nfor each energy value. However, this model is not differentiable under the\nusual white noise assumption, so energy uncertainty does not naturally\ntranslate to force uncertainty. In this work we propose a machine learning\npotential energy model in which energy and force aleatoric uncertainty are\nlinked through a spatially correlated noise process. We demonstrate our\napproach on an equivariant messages passing neural network potential trained on\nenergies and forces on two out-of-equilibrium molecular datasets. Furthermore,\nwe also show how to obtain epistemic uncertainties in this setting based on a\nBayesian interpretation of deep ensemble models.\n","authors":["Peter Bjørn Jørgensen","Jonas Busk","Ole Winther","Mikkel N. Schmidt"],"pdf_url":"https://arxiv.org/pdf/2312.04174v1.pdf","comment":"Presented at Advancing Molecular Machine Learning - Overcoming\n Limitations [ML4Molecules], ELLIS workshop, VIRTUAL, December 8, 2023,\n unofficial NeurIPS 2023 side-event"},{"id":"http://arxiv.org/abs/2312.04171v1","updated":"2023-12-07T09:45:14Z","published":"2023-12-07T09:45:14Z","title":"A novel feature selection framework for incomplete data","summary":" Feature selection on incomplete datasets is an exceptionally challenging\ntask. Existing methods address this challenge by first employing imputation\nmethods to complete the incomplete data and then conducting feature selection\nbased on the imputed data. Since imputation and feature selection are entirely\nindependent steps, the importance of features cannot be considered during\nimputation. However, in real-world scenarios or datasets, different features\nhave varying degrees of importance. To address this, we propose a novel\nincomplete data feature selection framework that considers feature importance.\nThe framework mainly consists of two alternating iterative stages: the M-stage\nand the W-stage. In the M-stage, missing values are imputed based on a given\nfeature importance vector and multiple initial imputation results. 
In the\nW-stage, an improved reliefF algorithm is employed to learn the feature\nimportance vector based on the imputed data. Specifically, the feature\nimportance vector obtained in the current iteration of the W-stage serves as\ninput for the next iteration of the M-stage. Experimental results on both\nartificially generated and real incomplete datasets demonstrate that the\nproposed method outperforms other approaches significantly.\n","authors":["Cong Guo"],"pdf_url":"https://arxiv.org/pdf/2312.04171v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04168v1","updated":"2023-12-07T09:37:28Z","published":"2023-12-07T09:37:28Z","title":"Augmentation-Free Dense Contrastive Knowledge Distillation for Efficient\n Semantic Segmentation","summary":" In recent years, knowledge distillation methods based on contrastive learning\nhave achieved promising results on image classification and object detection\ntasks. However, in this line of research, we note that less attention is paid\nto semantic segmentation. Existing methods heavily rely on data augmentation\nand memory buffer, which entail high computational resource demands when\napplying them to handle semantic segmentation that requires to preserve\nhigh-resolution feature maps for making dense pixel-wise predictions. In order\nto address this problem, we present Augmentation-free Dense Contrastive\nKnowledge Distillation (Af-DCD), a new contrastive distillation learning\nparadigm to train compact and accurate deep neural networks for semantic\nsegmentation applications. Af-DCD leverages a masked feature mimicking\nstrategy, and formulates a novel contrastive learning loss via taking advantage\nof tactful feature partitions across both channel and spatial dimensions,\nallowing to effectively transfer dense and structured local knowledge learnt by\nthe teacher model to a target student model while maintaining training\nefficiency. Extensive experiments on five mainstream benchmarks with various\nteacher-student network pairs demonstrate the effectiveness of our approach.\nFor instance, the DeepLabV3-Res18|DeepLabV3-MBV2 model trained by Af-DCD\nreaches 77.03%|76.38% mIOU on Cityscapes dataset when choosing DeepLabV3-Res101\nas the teacher, setting new performance records. Besides that, Af-DCD achieves\nan absolute mIOU improvement of 3.26%|3.04%|2.75%|2.30%|1.42% compared with\nindividually trained counterpart on Cityscapes|Pascal\nVOC|Camvid|ADE20K|COCO-Stuff-164K. Code is available at\nhttps://github.com/OSVAI/Af-DCD\n","authors":["Jiawei Fan","Chao Li","Xiaolong Liu","Meina Song","Anbang Yao"],"pdf_url":"https://arxiv.org/pdf/2312.04168v1.pdf","comment":"The paper of Af-DCD is accepted to NeurIPS 2023. Code and models are\n available at https://github.com/OSVAI/Af-DCD"},{"id":"http://arxiv.org/abs/2312.04167v1","updated":"2023-12-07T09:36:31Z","published":"2023-12-07T09:36:31Z","title":"Mixture of Dynamical Variational Autoencoders for Multi-Source\n Trajectory Modeling and Separation","summary":" In this paper, we propose a latent-variable generative model called mixture\nof dynamical variational autoencoders (MixDVAE) to model the dynamics of a\nsystem composed of multiple moving sources. A DVAE model is pre-trained on a\nsingle-source dataset to capture the source dynamics. Then, multiple instances\nof the pre-trained DVAE model are integrated into a multi-source mixture model\nwith a discrete observation-to-source assignment latent variable. 
The posterior\ndistributions of both the discrete observation-to-source assignment variable\nand the continuous DVAE variables representing the sources' content/position are\nestimated using a variational expectation-maximization algorithm, leading to\nmulti-source trajectory estimation. We illustrate the versatility of the\nproposed MixDVAE model on two tasks: a computer vision task, namely\nmulti-object tracking, and an audio processing task, namely single-channel\naudio source separation. Experimental results show that the proposed method\nworks well on these two tasks, and outperforms several baseline methods.\n","authors":["Xiaoyu Lin","Laurent Girin","Xavier Alameda-Pineda"],"pdf_url":"https://arxiv.org/pdf/2312.04167v1.pdf","comment":"arXiv admin note: substantial text overlap with arXiv:2202.09315"},{"id":"http://arxiv.org/abs/2312.04166v1","updated":"2023-12-07T09:36:18Z","published":"2023-12-07T09:36:18Z","title":"Improving Communication Efficiency of Federated Distillation via\n Accumulating Local Updates","summary":" As an emerging federated learning paradigm, federated distillation enables\ncommunication-efficient model training by transmitting only small-scale\nknowledge during the learning process. To further improve the communication\nefficiency of federated distillation, we propose a novel technique, ALU, which\naccumulates multiple rounds of local updates before transferring the knowledge\nto the central server. ALU drastically decreases the frequency of communication\nin federated distillation, thereby significantly reducing the communication\noverhead during the training process. Empirical experiments demonstrate the\nsubstantial effect of ALU in improving the communication efficiency of\nfederated distillation.\n","authors":["Zhiyuan Wu","Sheng Sun","Yuwei Wang","Min Liu","Tian Wen","Wen Wang"],"pdf_url":"https://arxiv.org/pdf/2312.04166v1.pdf","comment":"2 pages, 3 figures"},{"id":"http://arxiv.org/abs/2312.04163v1","updated":"2023-12-07T09:26:58Z","published":"2023-12-07T09:26:58Z","title":"Multi-scale Residual Transformer for VLF Lightning Transients\n Classification","summary":" The utilization of Very Low Frequency (VLF) electromagnetic signals in\nnavigation systems is widespread. However, the non-stationary behavior of\nlightning signals can affect VLF electromagnetic signal transmission.\nAccurately classifying lightning signals is important for reducing interference\nand noise in VLF, thereby improving the reliability and overall performance of\nnavigation systems. In recent years, the evolution of deep learning,\nspecifically Convolutional Neural Networks (CNNs), has sparked a transformation\nin lightning classification, surpassing traditional statistical methodologies.\nExisting CNN models have limitations as they overlook the diverse attributes of\nlightning signals across different scales and neglect the significance of\ntemporal sequencing in sequential signals. This study introduces an innovative\nmulti-scale residual transformer (MRTransformer) that not only discerns\nintricate fine-grained patterns but also weighs the significance of\ndifferent aspects within the input lightning signal sequence. This model\ncaptures the attributes of the lightning signal across different scales, and its\nclassification accuracy reached 90%. 
In future work, this model\nhas the potential to be applied to a comprehensive understanding of the localization\nand waveform characteristics of lightning signals.\n","authors":["Jinghao Sun","Tingting Ji","Guoyu Wang","Rui Wang"],"pdf_url":"https://arxiv.org/pdf/2312.04163v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.08705v3","updated":"2023-12-07T09:22:03Z","published":"2022-11-16T06:37:32Z","title":"Resource Allocation of Federated Learning for the Metaverse with Mobile\n Augmented Reality","summary":" The Metaverse has received much attention recently. Metaverse applications\nvia mobile augmented reality (MAR) require rapid and accurate object detection\nto mix digital data with the real world. Federated learning (FL) is an\nintriguing distributed machine learning approach due to its privacy-preserving\ncharacteristics. Due to privacy concerns and the limited computation resources\non mobile devices, we incorporate FL into MAR systems of the Metaverse to train\na model cooperatively. Besides, to balance the trade-off between energy,\nexecution latency and model accuracy, thereby accommodating different demands\nand application scenarios, we formulate an optimization problem to minimize a\nweighted combination of total energy consumption, completion time and model\naccuracy. By decomposing the non-convex optimization problem into two\nsubproblems, we devise a resource allocation algorithm to determine the\nbandwidth allocation, transmission power, CPU frequency and video frame\nresolution for each participating device. We further present the convergence\nanalysis and computational complexity of the proposed algorithm. Numerical\nresults show that our proposed algorithm has better performance (in terms of\nenergy consumption, completion time and model accuracy) under different weight\nparameters compared to existing benchmarks.\n","authors":["Xinyu Zhou","Chang Liu","Jun Zhao"],"pdf_url":"https://arxiv.org/pdf/2211.08705v3.pdf","comment":"This paper appears in IEEE Transactions on Wireless Communications.\n DOI: https://doi.org/10.1109/TWC.2023.3326884 It is the journal version of\n 2022 IEEE 42nd International Conference on Distributed Computing Systems\n (ICDCS) paper: arXiv:2209.14900; i.e.,\n https://doi.org/10.1109/ICDCS54860.2022.00101"},{"id":"http://arxiv.org/abs/2312.04159v1","updated":"2023-12-07T09:21:41Z","published":"2023-12-07T09:21:41Z","title":"Zero-Touch Networks: Towards Next-Generation Network Automation","summary":" The Zero-touch network and Service Management (ZSM) framework represents an\nemerging paradigm in the management of the fifth-generation (5G) and Beyond\n(5G+) networks, offering automated self-management and self-healing\ncapabilities to address the escalating complexity and the growing data volume\nof modern networks. ZSM frameworks leverage advanced technologies such as\nMachine Learning (ML) to enable intelligent decision-making and reduce human\nintervention. This paper presents a comprehensive survey of Zero-Touch Networks\n(ZTNs) within the ZSM framework, covering network optimization, traffic\nmonitoring, energy efficiency, and security aspects of next-generation\nnetworks. The paper explores the challenges associated with ZSM, particularly\nthose related to ML, which necessitate exploring diverse network\nautomation solutions. In this context, the study investigates the application\nof Automated ML (AutoML) in ZTNs to reduce network management costs and\nenhance performance. 
AutoML automates the selection and tuning process of a ML\nmodel for a given task. Specifically, the focus is on AutoML's ability to\npredict application throughput and autonomously adapt to data drift.\nExperimental results demonstrate the superiority of the proposed AutoML\npipeline over traditional ML in terms of prediction accuracy. Integrating\nAutoML and ZSM concepts significantly reduces network configuration and\nmanagement efforts, allowing operators to allocate more time and resources to\nother important tasks. The paper also provides a high-level 5G system\narchitecture incorporating AutoML and ZSM concepts. This research highlights\nthe potential of ZTNs and AutoML to revolutionize the management of 5G+\nnetworks, enabling automated decision-making and empowering network operators\nto achieve higher efficiency, improved performance, and enhanced user\nexperience.\n","authors":["Mirna El Rajab","Li Yang","Abdallah Shami"],"pdf_url":"https://arxiv.org/pdf/2312.04159v1.pdf","comment":"115 pages, 12 figures, 15 tables, submitted to Computer Networks"},{"id":"http://arxiv.org/abs/2312.04155v1","updated":"2023-12-07T09:12:26Z","published":"2023-12-07T09:12:26Z","title":"Resource Allocation for Semantic Communication under Physical-layer\n Security","summary":" Semantic communication is deemed as a revolution of Shannon's paradigm in the\nsix-generation (6G) wireless networks. It aims at transmitting the extracted\ninformation rather than the original data, which receivers will try to recover.\nIntuitively, the larger extracted information, the longer latency of semantic\ncommunication will be. Besides, larger extracted information will result in\nmore accurate reconstructed information, thereby causing a higher utility of\nthe semantic communication system. Shorter latency and higher utility are\ndesirable objectives for the system, so there will be a trade-off between\nutility and latency. This paper proposes a joint optimization algorithm for\ntotal latency and utility. Moreover, security is essential for the semantic\ncommunication system. We incorporate the secrecy rate, a physical-layer\nsecurity method, into the optimization problem. The secrecy rate is the\ncommunication rate at which no information is disclosed to an eavesdropper.\nExperimental results demonstrate that the proposed algorithm obtains the best\njoint optimization performance compared to the baselines.\n","authors":["Yang Li","Xinyu Zhou","Jun Zhao"],"pdf_url":"https://arxiv.org/pdf/2312.04155v1.pdf","comment":"This paper appears in IEEE Global Communications Conference\n (GLOBECOM) 2023"},{"id":"http://arxiv.org/abs/2309.06240v2","updated":"2023-12-07T09:00:53Z","published":"2023-09-12T13:58:04Z","title":"Calibration in Machine Learning Uncertainty Quantification: beyond\n consistency to target adaptivity","summary":" Reliable uncertainty quantification (UQ) in machine learning (ML) regression\ntasks is becoming the focus of many studies in materials and chemical science.\nIt is now well understood that average calibration is insufficient, and most\nstudies implement additional methods testing the conditional calibration with\nrespect to uncertainty, i.e. consistency. Consistency is assessed mostly by\nso-called reliability diagrams. There exists however another way beyond average\ncalibration, which is conditional calibration with respect to input features,\ni.e. adaptivity. 
In practice, adaptivity is the main concern of the final users\nof a ML-UQ method, seeking for the reliability of predictions and uncertainties\nfor any point in features space. This article aims to show that consistency and\nadaptivity are complementary validation targets, and that a good consistency\ndoes not imply a good adaptivity. Adapted validation methods are proposed and\nillustrated on a representative example.\n","authors":["Pascal Pernot"],"pdf_url":"https://arxiv.org/pdf/2309.06240v2.pdf","comment":"arXiv admin note: text overlap with arXiv:2303.07170"},{"id":"http://arxiv.org/abs/2312.04145v1","updated":"2023-12-07T08:59:20Z","published":"2023-12-07T08:59:20Z","title":"Diffusing Colors: Image Colorization with Text Guided Diffusion","summary":" The colorization of grayscale images is a complex and subjective task with\nsignificant challenges. Despite recent progress in employing large-scale\ndatasets with deep neural networks, difficulties with controllability and\nvisual quality persist. To tackle these issues, we present a novel image\ncolorization framework that utilizes image diffusion techniques with granular\ntext prompts. This integration not only produces colorization outputs that are\nsemantically appropriate but also greatly improves the level of control users\nhave over the colorization process. Our method provides a balance between\nautomation and control, outperforming existing techniques in terms of visual\nquality and semantic coherence. We leverage a pretrained generative Diffusion\nModel, and show that we can finetune it for the colorization task without\nlosing its generative power or attention to text prompts. Moreover, we present\na novel CLIP-based ranking model that evaluates color vividness, enabling\nautomatic selection of the most suitable level of vividness based on the\nspecific scene semantics. Our approach holds potential particularly for color\nenhancement and historical image colorization.\n","authors":["Nir Zabari","Aharon Azulay","Alexey Gorkor","Tavi Halperin","Ohad Fried"],"pdf_url":"https://arxiv.org/pdf/2312.04145v1.pdf","comment":"SIGGRAPH Asia 2023"},{"id":"http://arxiv.org/abs/2312.00209v2","updated":"2023-12-07T08:57:50Z","published":"2023-11-30T21:42:15Z","title":"On the Interplay Between Stepsize Tuning and Progressive Sharpening","summary":" Recent empirical work has revealed an intriguing property of deep learning\nmodels by which the sharpness (largest eigenvalue of the Hessian) increases\nthroughout optimization until it stabilizes around a critical value at which\nthe optimizer operates at the edge of stability, given a fixed stepsize (Cohen\net al, 2022). We investigate empirically how the sharpness evolves when using\nstepsize-tuners, the Armijo linesearch and Polyak stepsizes, that adapt the\nstepsize along the iterations to local quantities such as, implicitly, the\nsharpness itself. We find that the surprisingly poor performance of a classical\nArmijo linesearch may be well explained by its tendency to ever-increase the\nsharpness of the objective in the full or large batch regimes. On the other\nhand, we observe that Polyak stepsizes operate generally at the edge of\nstability or even slightly beyond, while outperforming its Armijo and constant\nstepsizes counterparts. 
We conclude with an analysis that suggests unlocking\nstepsize tuners requires an understanding of the joint dynamics of the step\nsize and the sharpness.\n","authors":["Vincent Roulet","Atish Agarwala","Fabian Pedregosa"],"pdf_url":"https://arxiv.org/pdf/2312.00209v2.pdf","comment":"Presented at the NeurIPS 2023 OPT Wokshop"},{"id":"http://arxiv.org/abs/2312.04142v1","updated":"2023-12-07T08:56:44Z","published":"2023-12-07T08:56:44Z","title":"TimeDRL: Disentangled Representation Learning for Multivariate\n Time-Series","summary":" Multivariate time-series data in numerous real-world applications (e.g.,\nhealthcare and industry) are informative but challenging due to the lack of\nlabels and high dimensionality. Recent studies in self-supervised learning have\nshown their potential in learning rich representations without relying on\nlabels, yet they fall short in learning disentangled embeddings and addressing\nissues of inductive bias (e.g., transformation-invariance). To tackle these\nchallenges, we propose TimeDRL, a generic multivariate time-series\nrepresentation learning framework with disentangled dual-level embeddings.\nTimeDRL is characterized by three novel features: (i) disentangled derivation\nof timestamp-level and instance-level embeddings from patched time-series data\nusing a [CLS] token strategy; (ii) utilization of timestamp-predictive and\ninstance-contrastive tasks for disentangled representation learning, with the\nformer optimizing timestamp-level embeddings with predictive loss, and the\nlatter optimizing instance-level embeddings with contrastive loss; and (iii)\navoidance of augmentation methods to eliminate inductive biases, such as\ntransformation-invariance from cropping and masking. Comprehensive experiments\non 6 time-series forecasting datasets and 5 time-series classification datasets\nhave shown that TimeDRL consistently surpasses existing representation learning\napproaches, achieving an average improvement of forecasting by 57.98% in MSE\nand classification by 1.25% in accuracy. Furthermore, extensive ablation\nstudies confirmed the relative contribution of each component in TimeDRL's\narchitecture, and semi-supervised learning evaluations demonstrated its\neffectiveness in real-world scenarios, even with limited labeled data.\n","authors":["Ching Chang","Chiao-Tung Chan","Wei-Yao Wang","Wen-Chih Peng","Tien-Fu Chen"],"pdf_url":"https://arxiv.org/pdf/2312.04142v1.pdf","comment":"This paper is currently under review. The code will be made available\n upon acceptance"},{"id":"http://arxiv.org/abs/2312.04135v1","updated":"2023-12-07T08:50:25Z","published":"2023-12-07T08:50:25Z","title":"A Novel Federated Learning-based Intrusion Detection System for Flying\n Ad Hoc Networks","summary":" Unmanned aerial vehicles (UAVs) in flying ad-hoc networks (FANETs) face\nsecurity challenges due to the dynamic and distributed nature of these\nnetworks. This paper presents the Federated Learning-based Intrusion Detection\nSystem (FL-IDS), an innovative approach designed to improve FANET security.\nFL-IDS leverages federated learning to address privacy concerns of centralized\nintrusion detection systems. FL-IDS operates in a decentralized manner,\nenabling UAVs to collaboratively train a global intrusion detection model\nwithout sharing raw data. Local models are assigned to each UAV, using\nclient-specific data, and only updated model weights are shared with a central\nserver. 
This preserves privacy while utilizing collective intelligence for\neffective intrusion detection. Experimental results show FL-IDS's competitive\nperformance with Central IDS (C-IDS) while mitigating privacy concerns. The\nBias Towards Specific Clients (BTSC) method further enhances FL-IDS\nperformance, surpassing C-IDS even at lower attacker ratios. A comparative\nanalysis with traditional intrusion detection methods, including Local IDS\n(L-IDS), provides insights into FL-IDS's strengths. This study significantly\ncontributes to FANET security by introducing a privacy-aware, decentralized\nintrusion detection approach tailored to the unique challenges of UAV networks.\n","authors":["Ozlem Ceviz","Pinar Sadioglu","Sevil Sen","Vassilios G. Vassilakis"],"pdf_url":"https://arxiv.org/pdf/2312.04135v1.pdf","comment":"13 pages"},{"id":"http://arxiv.org/abs/2208.08089v2","updated":"2023-12-07T08:22:34Z","published":"2022-08-17T06:05:41Z","title":"Constrained Few-Shot Learning: Human-Like Low Sample Complexity Learning\n and Non-Episodic Text Classification","summary":" Few-shot learning (FSL) is an emergent paradigm of learning that attempts to\nlearn to reason with low sample complexity to mimic the way humans learn,\ngeneralise and extrapolate from only a few seen examples. While FSL attempts to\nmimic these human characteristics, fundamentally, the task of FSL as\nconventionally formulated using meta-learning with episodic-based training does\nnot in actuality align with how humans acquire and reason with knowledge. FSL\nwith episodic training, while only requires $K$ instances of each test class,\nstill requires a large number of labelled training instances from disjoint\nclasses. In this paper, we introduce the novel task of constrained few-shot\nlearning (CFSL), a special case of FSL where $M$, the number of instances of\neach training class is constrained such that $M \\leq K$ thus applying a similar\nrestriction during FSL training and test. We propose a method for CFSL\nleveraging Cat2Vec using a novel categorical contrastive loss inspired by\ncognitive theories such as fuzzy trace theory and prototype theory.\n","authors":["Jaron Mar","Jiamou Liu"],"pdf_url":"https://arxiv.org/pdf/2208.08089v2.pdf","comment":"Add additional references Update various sections for clarity"},{"id":"http://arxiv.org/abs/2312.04118v1","updated":"2023-12-07T08:18:40Z","published":"2023-12-07T08:18:40Z","title":"Caregiver Talk Shapes Toddler Vision: A Computational Study of Dyadic\n Play","summary":" Infants' ability to recognize and categorize objects develops gradually. The\nsecond year of life is marked by both the emergence of more semantic visual\nrepresentations and a better understanding of word meaning. This suggests that\nlanguage input may play an important role in shaping visual representations.\nHowever, even in suitable contexts for word learning like dyadic play sessions,\ncaregivers utterances are sparse and ambiguous, often referring to objects that\nare different from the one to which the child attends. Here, we systematically\ninvestigate to what extent caregivers' utterances can nevertheless enhance\nvisual representations. For this we propose a computational model of visual\nrepresentation learning during dyadic play. We introduce a synthetic dataset of\nego-centric images perceived by a toddler-agent that moves and rotates toy\nobjects in different parts of its home environment while hearing caregivers'\nutterances, modeled as captions. 
We propose to model toddlers' learning as\nsimultaneously aligning representations for 1) close-in-time images and 2)\nco-occurring images and utterances. We show that utterances with statistics\nmatching those of real caregivers give rise to representations supporting\nimproved category recognition. Our analysis reveals that a small\ndecrease/increase in object-relevant naming frequencies can drastically impact\nthe learned representations. This affects the attention on object names within\nan utterance, which is required for efficient visuo-linguistic alignment.\nOverall, our results support the hypothesis that caregivers' naming utterances\ncan improve toddlers' visual representations.\n","authors":["Timothy Schaumlöffel","Arthur Aubret","Gemma Roig","Jochen Triesch"],"pdf_url":"https://arxiv.org/pdf/2312.04118v1.pdf","comment":"Proceedings of the 2023 IEEE International Conference on Development\n and Learning (ICDL)"},{"id":"http://arxiv.org/abs/2311.07625v2","updated":"2023-12-07T07:59:58Z","published":"2023-11-13T08:18:44Z","title":"Activity Sparsity Complements Weight Sparsity for Efficient RNN\n Inference","summary":" Artificial neural networks open up unprecedented machine learning\ncapabilities at the cost of ever growing computational requirements.\nSparsifying the parameters, often achieved through weight pruning, has been\nidentified as a powerful technique to compress the number of model parameters\nand reduce the computational operations of neural networks. Yet, sparse\nactivations, while omnipresent in both biological neural networks and deep\nlearning systems, have not been fully utilized as a compression technique in\ndeep learning. Moreover, the interaction between sparse activations and weight\npruning is not fully understood. In this work, we demonstrate that activity\nsparsity can compose multiplicatively with parameter sparsity in a recurrent\nneural network model based on the GRU that is designed to be activity sparse.\nWe achieve up to $20\\times$ reduction of computation while maintaining\nperplexities below $60$ on the Penn Treebank language modeling task. This\nmagnitude of reduction has not been achieved previously with solely sparsely\nconnected LSTMs, and the language modeling performance of our model has not\nbeen achieved previously with any sparsely activated recurrent neural networks\nor spiking neural networks. Neuromorphic computing devices are especially good\nat taking advantage of the dynamic activity sparsity, and our results provide\nstrong evidence that making deep learning models activity sparse and porting\nthem to neuromorphic devices can be a viable strategy that does not compromise\non task performance. Our results also drive further convergence of methods from\ndeep learning and neuromorphic computing for efficient machine learning.\n","authors":["Rishav Mukherji","Mark Schöne","Khaleelulla Khan Nazeer","Christian Mayr","Anand Subramoney"],"pdf_url":"https://arxiv.org/pdf/2311.07625v2.pdf","comment":"Accepted to the First MLNCP Workshop @ NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.04111v1","updated":"2023-12-07T07:54:11Z","published":"2023-12-07T07:54:11Z","title":"Breaking the Entanglement of Homophily and Heterophily in\n Semi-supervised Node Classification","summary":" Recently, graph neural networks (GNNs) have shown prominent performance in\nsemi-supervised node classification by leveraging knowledge from the graph\ndatabase. 
However, most existing GNNs follow the homophily assumption, where\nconnected nodes are more likely to exhibit similar feature distributions and\nthe same labels, and such an assumption has proven to be vulnerable in a\ngrowing number of practical applications. As a supplement, heterophily reflects\ndissimilarity in connected nodes, which has gained significant attention in\ngraph learning. To this end, data engineers aim to develop a powerful GNN model\nthat can ensure performance under both homophily and heterophily. Despite\nnumerous attempts, most existing GNNs struggle to achieve optimal node\nrepresentations due to the constraints of undirected graphs. The neglect of\ndirected edges results in sub-optimal graph representations, thereby hindering\nthe capacity of GNNs. To address this issue, we introduce AMUD, which\nquantifies the relationship between node profiles and topology from a\nstatistical perspective, offering valuable insights for \\underline{A}daptively\n\\underline{M}odeling the natural directed graphs as the \\underline{U}ndirected\nor \\underline{D}irected graph to maximize the benefits from subsequent graph\nlearning. Furthermore, we propose \\underline{A}daptive \\underline{D}irected\n\\underline{P}attern \\underline{A}ggregation (ADPA) as a new directed graph\nlearning paradigm for AMUD. Empirical studies have demonstrated that AMUD\nguides efficient graph learning. Meanwhile, extensive experiments on 14\nbenchmark datasets substantiate the impressive performance of ADPA,\noutperforming baselines by significant margins of 3.96\\%.\n","authors":["Henan Sun","Xunkai Li","Zhengyu Wu","Daohan Su","Rong-Hua Li","Guoren Wang"],"pdf_url":"https://arxiv.org/pdf/2312.04111v1.pdf","comment":"14 pages, 7 figures"},{"id":"http://arxiv.org/abs/2312.04110v1","updated":"2023-12-07T07:53:00Z","published":"2023-12-07T07:53:00Z","title":"Small Area Estimation of Case Growths for Timely COVID-19 Outbreak\n Detection","summary":" The COVID-19 pandemic has exerted a profound impact on the global economy and\ncontinues to exact a significant toll on human lives. The COVID-19 case growth\nrate stands as a key epidemiological parameter to estimate and monitor for\neffective detection and containment of the resurgence of outbreaks. A\nfundamental challenge in growth rate estimation and hence outbreak detection is\nbalancing the accuracy-speed tradeoff, where accuracy typically degrades with\nshorter fitting windows. In this paper, we develop a machine learning (ML)\nalgorithm, which we call Transfer Learning Generalized Random Forest (TLGRF),\nthat balances this accuracy-speed tradeoff. Specifically, we estimate the\ninstantaneous COVID-19 exponential growth rate for each U.S. county by using\nTLGRF that chooses an adaptive fitting window size based on relevant day-level\nand county-level features affecting the disease spread. Through transfer\nlearning, TLGRF can accurately estimate case growth rates for counties with\nsmall sample sizes. Out-of-sample prediction analysis shows that TLGRF\noutperforms established growth rate estimation methods. Furthermore, we\nconducted a case study based on outbreak case data from the state of Colorado\nand showed that the timely detection of outbreaks could have been improved by\nup to 224% using TLGRF when compared to the decisions made by Colorado's\nDepartment of Health and Environment (CDPHE). To facilitate implementation, we\nhave developed a publicly available outbreak detection tool for timely\ndetection of COVID-19 outbreaks in each U.S. 
county, which received substantial\nattention from policymakers.\n","authors":["Zhaowei She","Zilong Wang","Jagpreet Chhatwal","Turgay Ayer"],"pdf_url":"https://arxiv.org/pdf/2312.04110v1.pdf","comment":"Equal contributions by co-first authors Zhaowei She, Zilong Wang (in\n alphabetical order)"},{"id":"http://arxiv.org/abs/2312.04103v1","updated":"2023-12-07T07:37:15Z","published":"2023-12-07T07:37:15Z","title":"Enhancing the Rationale-Input Alignment for Self-explaining\n Rationalization","summary":" Rationalization empowers deep learning models with self-explaining\ncapabilities through a cooperative game, where a generator selects a\nsemantically consistent subset of the input as a rationale, and a subsequent\npredictor makes predictions based on the selected rationale. In this paper, we\ndiscover that rationalization is prone to a problem named \\emph{rationale\nshift}, which arises from the algorithmic bias of the cooperative game.\nRationale shift refers to a situation where the semantics of the selected\nrationale may deviate from the original input, but the predictor still produces\naccurate predictions based on the deviation, resulting in a compromised\ngenerator with misleading feedback.\n To address this issue, we first demonstrate the importance of the alignment\nbetween the rationale and the full input through both empirical observations\nand theoretical analysis. Subsequently, we introduce a novel approach called\nDAR (\\textbf{D}iscriminatively \\textbf{A}ligned \\textbf{R}ationalization),\nwhich utilizes an auxiliary module pretrained on the full input to\ndiscriminatively align the selected rationale and the original input. We\ntheoretically illustrate how DAR accomplishes the desired alignment, thereby\novercoming the rationale shift problem. The experiments on two widely used\nreal-world benchmarks show that the proposed method significantly improves the\nexplanation quality (measured by the overlap between the model-selected\nexplanation and the human-annotated rationale) as compared to state-of-the-art\ntechniques. Additionally, results on two synthetic settings further validate\nthe effectiveness of DAR in addressing the rationale shift problem.\n","authors":["Wei Liu","Haozhao Wang","Jun Wang","Zhiying Deng","YuanKai Zhang","Cheng Wang","Ruixuan Li"],"pdf_url":"https://arxiv.org/pdf/2312.04103v1.pdf","comment":"Accept at ICDE 2024"},{"id":"http://arxiv.org/abs/2307.08360v2","updated":"2023-12-07T07:20:27Z","published":"2023-07-17T09:55:35Z","title":"Universal Online Learning with Gradient Variations: A Multi-layer Online\n Ensemble Approach","summary":" In this paper, we propose an online convex optimization approach with two\ndifferent levels of adaptivity. On a higher level, our approach is agnostic to\nthe unknown types and curvatures of the online functions, while at a lower\nlevel, it can exploit the unknown niceness of the environments and attain\nproblem-dependent guarantees. Specifically, we obtain $\\mathcal{O}(\\log V_T)$,\n$\\mathcal{O}(d \\log V_T)$ and $\\widehat{\\mathcal{O}}(\\sqrt{V_T})$ regret bounds\nfor strongly convex, exp-concave and convex loss functions, respectively, where\n$d$ is the dimension, $V_T$ denotes problem-dependent gradient variations and\nthe $\\widehat{\\mathcal{O}}(\\cdot)$-notation omits $\\log V_T$ factors. Our\nresult not only safeguards the worst-case guarantees but also directly implies\nthe small-loss bounds in analysis. 
Moreover, when applied to\nadversarial/stochastic convex optimization and game theory problems, our result\nenhances the existing universal guarantees. Our approach is based on a\nmulti-layer online ensemble framework incorporating novel ingredients,\nincluding a carefully designed optimism for unifying diverse function types and\ncascaded corrections for algorithmic stability. Notably, despite its\nmulti-layer structure, our algorithm necessitates only one gradient query per\nround, making it favorable when the gradient evaluation is time-consuming. This\nis facilitated by a novel regret decomposition with carefully designed\nsurrogate losses.\n","authors":["Yu-Hu Yan","Peng Zhao","Zhi-Hua Zhou"],"pdf_url":"https://arxiv.org/pdf/2307.08360v2.pdf","comment":"NeurIPS 2023"},{"id":"http://arxiv.org/abs/2312.04095v1","updated":"2023-12-07T07:17:24Z","published":"2023-12-07T07:17:24Z","title":"Learn to Unlearn for Deep Neural Networks: Minimizing Unlearning\n Interference with Gradient Projection","summary":" Recent data-privacy laws have sparked interest in machine unlearning, which\ninvolves removing the effect of specific training samples from a learnt model\nas if they were never present in the original training dataset. The challenge\nof machine unlearning is to discard information about the ``forget'' data in\nthe learnt model without altering the knowledge about the remaining dataset and\nto do so more efficiently than the naive retraining approach. To achieve this,\nwe adopt a projected-gradient based learning method, named as\nProjected-Gradient Unlearning (PGU), in which the model takes steps in the\northogonal direction to the gradient subspaces deemed unimportant for the\nretaining dataset, so as to its knowledge is preserved. By utilizing Stochastic\nGradient Descent (SGD) to update the model weights, our method can efficiently\nscale to any model and dataset size. We provide empirically evidence to\ndemonstrate that our unlearning method can produce models that behave similar\nto models retrained from scratch across various metrics even when the training\ndataset is no longer accessible. Our code is available at\nhttps://github.com/hnanhtuan/projected_gradient_unlearning.\n","authors":["Tuan Hoang","Santu Rana","Sunil Gupta","Svetha Venkatesh"],"pdf_url":"https://arxiv.org/pdf/2312.04095v1.pdf","comment":"Accepted to WACV 2024"},{"id":"http://arxiv.org/abs/2312.04083v1","updated":"2023-12-07T06:51:55Z","published":"2023-12-07T06:51:55Z","title":"On the adaptation of in-context learners for system identification","summary":" In-context system identification aims at constructing meta-models to describe\nclasses of systems, differently from traditional approaches that model single\nsystems. This paradigm facilitates the leveraging of knowledge acquired from\nobserving the behaviour of different, yet related dynamics. This paper\ndiscusses the role of meta-model adaptation. Through numerical examples, we\ndemonstrate how meta-model adaptation can enhance predictive performance in\nthree realistic scenarios: tailoring the meta-model to describe a specific\nsystem rather than a class; extending the meta-model to capture the behaviour\nof systems beyond the initial training class; and recalibrating the model for\nnew prediction tasks. 
Results highlight the effectiveness of meta-model\nadaptation to achieve a more robust and versatile meta-learning framework for\nsystem identification.\n","authors":["Dario Piga","Filippo Pura","Marco Forgione"],"pdf_url":"https://arxiv.org/pdf/2312.04083v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2212.14578v2","updated":"2023-12-07T06:38:10Z","published":"2022-12-30T07:37:40Z","title":"MAUVE Scores for Generative Models: Theory and Practice","summary":" Generative artificial intelligence has made significant strides, producing\ntext indistinguishable from human prose and remarkably photorealistic images.\nAutomatically measuring how close the generated data distribution is to the\ntarget distribution is central to diagnosing existing models and developing\nbetter ones. We present MAUVE, a family of comparison measures between pairs of\ndistributions such as those encountered in the generative modeling of text or\nimages. These scores are statistical summaries of divergence frontiers\ncapturing two types of errors in generative modeling. We explore three\napproaches to statistically estimate these scores: vector quantization,\nnon-parametric estimation, and classifier-based estimation. We provide\nstatistical bounds for the vector quantization approach.\n Empirically, we find that the proposed scores paired with a range of\n$f$-divergences and statistical estimation methods can quantify the gaps\nbetween the distributions of human-written text and those of modern neural\nlanguage models by correlating with human judgments and identifying known\nproperties of the generated texts. We demonstrate in the vision domain that\nMAUVE can identify known properties of generated images on par with or better\nthan existing metrics. In conclusion, we present practical recommendations for\nusing MAUVE effectively with language and image modalities.\n","authors":["Krishna Pillutla","Lang Liu","John Thickstun","Sean Welleck","Swabha Swayamdipta","Rowan Zellers","Sewoong Oh","Yejin Choi","Zaid Harchaoui"],"pdf_url":"https://arxiv.org/pdf/2212.14578v2.pdf","comment":"Published in Journal of Machine Learning Research"},{"id":"http://arxiv.org/abs/2312.04070v1","updated":"2023-12-07T06:27:48Z","published":"2023-12-07T06:27:48Z","title":"A Transformer Model for Symbolic Regression towards Scientific Discovery","summary":" Symbolic Regression (SR) searches for mathematical expressions which best\ndescribe numerical datasets. This allows to circumvent interpretation issues\ninherent to artificial neural networks, but SR algorithms are often\ncomputationally expensive. This work proposes a new Transformer model aiming at\nSymbolic Regression particularly focused on its application for Scientific\nDiscovery. We propose three encoder architectures with increasing flexibility\nbut at the cost of column-permutation equivariance violation. Training results\nindicate that the most flexible architecture is required to prevent from\noverfitting. 
Once trained, we apply our best model to the SRSD datasets\n(Symbolic Regression for Scientific Discovery datasets) which yields\nstate-of-the-art results using the normalized tree-based edit distance, at no\nextra computational cost.\n","authors":["Florian Lalande","Yoshitomo Matsubara","Naoya Chiba","Tatsunori Taniai","Ryo Igarashi","Yoshitala Ushiku"],"pdf_url":"https://arxiv.org/pdf/2312.04070v1.pdf","comment":"Accepted for oral presentation at NeurIPS2023 AI4Science Workshop.\n OpenReview: https://openreview.net/forum?id=AIfqWNHKjo"},{"id":"http://arxiv.org/abs/2312.04068v1","updated":"2023-12-07T06:23:17Z","published":"2023-12-07T06:23:17Z","title":"Making Translators Privacy-aware on the User's Side","summary":" We propose PRISM to enable users of machine translation systems to preserve\nthe privacy of data on their own initiative. There is a growing demand to apply\nmachine translation systems to data that require privacy protection. While\nseveral machine translation engines claim to prioritize privacy, the extent and\nspecifics of such protection are largely ambiguous. First, there is often a\nlack of clarity on how and to what degree the data is protected. Even if\nservice providers believe they have sufficient safeguards in place,\nsophisticated adversaries might still extract sensitive information. Second,\nvulnerabilities may exist outside of these protective measures, such as within\ncommunication channels, potentially leading to data leakage. As a result, users\nare hesitant to utilize machine translation engines for data demanding high\nlevels of privacy protection, thereby missing out on their benefits. PRISM\nresolves this problem. Instead of relying on the translation service to keep\ndata safe, PRISM provides the means to protect data on the user's side. This\napproach ensures that even machine translation engines with inadequate privacy\nmeasures can be used securely. For platforms already equipped with privacy\nsafeguards, PRISM acts as an additional protection layer, reinforcing their\nsecurity furthermore. PRISM adds these privacy features without significantly\ncompromising translation accuracy. Our experiments demonstrate the\neffectiveness of PRISM using real-world translators, T5 and ChatGPT\n(GPT-3.5-turbo), and the datasets with two languages. PRISM effectively\nbalances privacy protection with translation accuracy.\n","authors":["Ryoma Sato"],"pdf_url":"https://arxiv.org/pdf/2312.04068v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04067v1","updated":"2023-12-07T06:19:39Z","published":"2023-12-07T06:19:39Z","title":"MeanCut: A Greedy-Optimized Graph Clustering via Path-based Similarity\n and Degree Descent Criterion","summary":" As the most typical graph clustering method, spectral clustering is popular\nand attractive due to the remarkable performance, easy implementation, and\nstrong adaptability. Classical spectral clustering measures the edge weights of\ngraph using pairwise Euclidean-based metric, and solves the optimal graph\npartition by relaxing the constraints of indicator matrix and performing\nLaplacian decomposition. However, Euclidean-based similarity might cause skew\ngraph cuts when handling non-spherical data distributions, and the relaxation\nstrategy introduces information loss. Meanwhile, spectral clustering requires\nspecifying the number of clusters, which is hard to determine without enough\nprior knowledge. 
In this work, we leverage the path-based similarity to enhance\nintra-cluster associations, and propose MeanCut as the objective function and\ngreedily optimize it in degree descending order for a nondestructive graph\npartition. This algorithm enables the identification of arbitrary shaped\nclusters and is robust to noise. To reduce the computational complexity of\nsimilarity calculation, we transform optimal path search into generating the\nmaximum spanning tree (MST), and develop a fast MST (FastMST) algorithm to\nfurther improve its time-efficiency. Moreover, we define a density gradient\nfactor (DGF) for separating the weakly connected clusters. The validity of our\nalgorithm is demonstrated by testifying on real-world benchmarks and\napplication of face recognition. The source code of MeanCut is available at\nhttps://github.com/ZPGuiGroupWhu/MeanCut-Clustering.\n","authors":["Dehua Peng","Zhipeng Gui","Huayi Wu"],"pdf_url":"https://arxiv.org/pdf/2312.04067v1.pdf","comment":"17 pages, 8 figures, 6 tables"},{"id":"http://arxiv.org/abs/2303.18187v2","updated":"2023-12-07T06:09:41Z","published":"2023-03-30T02:40:28Z","title":"Contrastive-Signal-Dependent Plasticity: Forward-Forward Learning of\n Spiking Neural Systems","summary":" We develop a neuro-mimetic architecture, composed of spiking neuronal units,\nwhere individual layers of neurons operate in parallel and adapt their synaptic\nefficacies without the use of feedback pathways. Specifically, we propose an\nevent-based generalization of forward-forward learning, which we call\ncontrastive-signal-dependent plasticity (CSDP), for a spiking neural system\nthat iteratively processes sensory input over a stimulus window. The dynamics\nthat underwrite this recurrent circuit entail computing the membrane potential\nof each processing element, in each layer, as a function of local bottom-up,\ntop-down, and lateral signals, facilitating a dynamic, layer-wise parallel form\nof neural computation. Unlike other models, such as spiking predictive coding,\nwhich rely on feedback synapses to adjust neural electrical activity, our model\noperates purely online and forward in time, offering a promising way to learn\ndistributed representations of sensory data patterns, with and without labeled\ncontext information. Notably, our experimental results on several pattern\ndatasets demonstrate that the CSDP process works well for training a dynamic\nrecurrent spiking network capable of both classification and reconstruction.\n","authors":["Alexander Ororbia"],"pdf_url":"https://arxiv.org/pdf/2303.18187v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04065v1","updated":"2023-12-07T06:09:21Z","published":"2023-12-07T06:09:21Z","title":"A Robust and Efficient Boundary Point Detection Method by Measuring\n Local Direction Dispersion","summary":" Boundary points pose a significant challenge for machine learning tasks,\nincluding classification, clustering, and dimensionality reduction. Due to the\nsimilarity of features, boundary areas can result in mixed-up classes or\nclusters, leading to a crowding problem in dimensionality reduction. To address\nthis challenge, numerous boundary point detection methods have been developed,\nbut they are insufficiently to accurately and efficiently identify the boundary\npoints in non-convex structures and high-dimensional manifolds. In this work,\nwe propose a robust and efficient method for detecting boundary points using\nLocal Direction Dispersion (LoDD). 
LoDD considers that internal points are\nsurrounded by neighboring points in all directions, while neighboring points of\na boundary point tend to be distributed only in a certain directional range.\nLoDD adopts a density-independent K-Nearest Neighbors (KNN) method to determine\nneighboring points, and defines a statistic-based metric using the eigenvalues\nof the covariance matrix of KNN coordinates to measure the centrality of a\nquery point. We demonstrated the validity of LoDD on five synthetic datasets\n(2-D and 3-D) and ten real-world benchmarks, and tested its clustering\nperformance by equipping with two typical clustering methods, K-means and Ncut.\nOur results show that LoDD achieves promising and robust detection accuracy in\na time-efficient manner.\n","authors":["Dehua Peng","Zhipeng Gui","Huayi Wu"],"pdf_url":"https://arxiv.org/pdf/2312.04065v1.pdf","comment":"11 pages, 6 figures, 3 tables"},{"id":"http://arxiv.org/abs/2312.04064v1","updated":"2023-12-07T06:05:39Z","published":"2023-12-07T06:05:39Z","title":"DiscoBAX: Discovery of Optimal Intervention Sets in Genomic Experiment\n Design","summary":" The discovery of therapeutics to treat genetically-driven pathologies relies\non identifying genes involved in the underlying disease mechanisms. Existing\napproaches search over the billions of potential interventions to maximize the\nexpected influence on the target phenotype. However, to reduce the risk of\nfailure in future stages of trials, practical experiment design aims to find a\nset of interventions that maximally change a target phenotype via diverse\nmechanisms. We propose DiscoBAX, a sample-efficient method for maximizing the\nrate of significant discoveries per experiment while simultaneously probing for\na wide range of diverse mechanisms during a genomic experiment campaign. We\nprovide theoretical guarantees of approximate optimality under standard\nassumptions, and conduct a comprehensive experimental evaluation covering both\nsynthetic as well as real-world experimental design tasks. DiscoBAX outperforms\nexisting state-of-the-art methods for experimental design, selecting effective\nand diverse perturbations in biological systems.\n","authors":["Clare Lyle","Arash Mehrjou","Pascal Notin","Andrew Jesson","Stefan Bauer","Yarin Gal","Patrick Schwab"],"pdf_url":"https://arxiv.org/pdf/2312.04064v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2307.11086v2","updated":"2023-12-07T06:04:58Z","published":"2023-07-20T17:59:33Z","title":"PAPR: Proximity Attention Point Rendering","summary":" Learning accurate and parsimonious point cloud representations of scene\nsurfaces from scratch remains a challenge in 3D representation learning.\nExisting point-based methods often suffer from the vanishing gradient problem\nor require a large number of points to accurately model scene geometry and\ntexture. To address these limitations, we propose Proximity Attention Point\nRendering (PAPR), a novel method that consists of a point-based scene\nrepresentation and a differentiable renderer. Our scene representation uses a\npoint cloud where each point is characterized by its spatial position,\ninfluence score, and view-independent feature vector. The renderer selects the\nrelevant points for each ray and produces accurate colours using their\nassociated features. PAPR effectively learns point cloud positions to represent\nthe correct scene geometry, even when the initialization drastically differs\nfrom the target geometry. 
Notably, our method captures fine texture details\nwhile using only a parsimonious set of points. We also demonstrate four\npractical applications of our method: zero-shot geometry editing, object\nmanipulation, texture transfer, and exposure control. More results and code are\navailable on our project website at https://zvict.github.io/papr/.\n","authors":["Yanshu Zhang","Shichong Peng","Alireza Moazeni","Ke Li"],"pdf_url":"https://arxiv.org/pdf/2307.11086v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11642v2","updated":"2023-12-07T05:47:50Z","published":"2023-11-20T10:01:13Z","title":"Video Face Re-Aging: Toward Temporally Consistent Face Re-Aging","summary":" Video face re-aging deals with altering the apparent age of a person to the\ntarget age in videos. This problem is challenging due to the lack of paired\nvideo datasets maintaining temporal consistency in identity and age. Most\nre-aging methods process each image individually without considering the\ntemporal consistency of videos. While some existing works address the issue of\ntemporal coherence through video facial attribute manipulation in latent space,\nthey often fail to deliver satisfactory performance in age transformation. To\ntackle the issues, we propose (1) a novel synthetic video dataset that features\nsubjects across a diverse range of age groups; (2) a baseline architecture\ndesigned to validate the effectiveness of our proposed dataset, and (3) the\ndevelopment of three novel metrics tailored explicitly for evaluating the\ntemporal consistency of video re-aging techniques. Our comprehensive\nexperiments on public datasets, such as VFHQ and CelebV-HQ, show that our\nmethod outperforms the existing approaches in terms of both age transformation\nand temporal consistency.\n","authors":["Abdul Muqeet","Kyuchul Lee","Bumsoo Kim","Yohan Hong","Hyungrae Lee","Woonggon Kim","KwangHee Lee"],"pdf_url":"https://arxiv.org/pdf/2311.11642v2.pdf","comment":"8 pages, 6 figures, 4 tables, Project page:\n https://video-reaging.github.io/"},{"id":"http://arxiv.org/abs/2312.04055v1","updated":"2023-12-07T05:27:24Z","published":"2023-12-07T05:27:24Z","title":"Jointly spatial-temporal representation learning for individual\n trajectories","summary":" Individual trajectories, containing substantial information on\nhuman-environment interactions across space and time, is a crucial input for\ngeospatial foundation models (GeoFMs). However, existing attempts, leveraging\ntrajectory data for various applications have overlooked the implicit\nspatial-temporal dependency within trajectories and failed to encode and\nrepresent it in a format friendly to deep learning, posing a challenge in\nobtaining general-purpose trajectory representations. Therefore, this paper\nproposes a spatial-temporal joint representation learning method (ST-GraphRL)\nto formalize learnable spatial-temporal dependencies into trajectory\nrepresentations. 
The proposed ST-GraphRL consists of three compositions: (i) a\nweighted directed spatial-temporal graph to explicitly construct mobility\ninteractions over both space and time dimensions; (ii) a two-stage jointly\nencoder (i.e., decoupling and fusion) to learn entangled spatial-temporal\ndependencies by independently decomposing and jointly aggregating space and\ntime information; (iii) a decoder guides ST-GraphRL to learn explicit mobility\nregularities by simulating the spatial-temporal distributions of trajectories.\nTested on three real-world human mobility datasets, the proposed ST-GraphRL\noutperformed all the baseline models in predicting movement spatial-temporal\ndistributions and preserving trajectory similarity with high spatial-temporal\ncorrelations. We also explore how spatial-temporal features presented in latent\nspace, validating that ST-GraphRL understands spatial-temporal patterns. This\nmethod is also transferable for general-purpose geospatial data representations\nfor broad downstream tasks, as well advancing GeoFMs developing.\n","authors":["Fei Huang","Jianrong Lv","Yang Yue"],"pdf_url":"https://arxiv.org/pdf/2312.04055v1.pdf","comment":"27 pages, 3 tables, 7 figures"},{"id":"http://arxiv.org/abs/2309.03409v2","updated":"2023-12-07T05:25:15Z","published":"2023-09-07T00:07:15Z","title":"Large Language Models as Optimizers","summary":" Optimization is ubiquitous. While derivative-based algorithms have been\npowerful tools for various problems, the absence of gradient imposes challenges\non many real-world applications. In this work, we propose Optimization by\nPROmpting (OPRO), a simple and effective approach to leverage large language\nmodels (LLMs) as optimizers, where the optimization task is described in\nnatural language. In each optimization step, the LLM generates new solutions\nfrom the prompt that contains previously generated solutions with their values,\nthen the new solutions are evaluated and added to the prompt for the next\noptimization step. We first showcase OPRO on linear regression and traveling\nsalesman problems, then move on to prompt optimization where the goal is to\nfind instructions that maximize the task accuracy. With a variety of LLMs, we\ndemonstrate that the best prompts optimized by OPRO outperform human-designed\nprompts by up to 8% on GSM8K, and by up to 50% on Big-Bench Hard tasks. Code at\nhttps://github.com/google-deepmind/opro.\n","authors":["Chengrun Yang","Xuezhi Wang","Yifeng Lu","Hanxiao Liu","Quoc V. Le","Denny Zhou","Xinyun Chen"],"pdf_url":"https://arxiv.org/pdf/2309.03409v2.pdf","comment":"42 pages, 26 figures, 15 tables. Code at\n https://github.com/google-deepmind/opro"},{"id":"http://arxiv.org/abs/2312.04038v1","updated":"2023-12-07T04:43:04Z","published":"2023-12-07T04:43:04Z","title":"Reconstruction of dynamical systems from data without time labels","summary":" In this paper, we study the method to reconstruct dynamical systems from data\nwithout time labels. Data without time labels appear in many applications, such\nas molecular dynamics, single-cell RNA sequencing etc. Reconstruction of\ndynamical system from time sequence data has been studied extensively. However,\nthese methods do not apply if time labels are unknown. Without time labels,\nsequence data becomes distribution data. 
Based on this observation, we propose\nto treat the data as samples from a probability distribution and try to\nreconstruct the underlying dynamical system by minimizing the distribution\nloss, sliced Wasserstein distance more specifically. Extensive experiment\nresults demonstrate the effectiveness of the proposed method.\n","authors":["Zhijun Zeng","Pipi Hu","Chenglong Bao","Yi Zhu","Zuoqiang Shi"],"pdf_url":"https://arxiv.org/pdf/2312.04038v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04036v1","updated":"2023-12-07T04:39:22Z","published":"2023-12-07T04:39:22Z","title":"DiffusionPhase: Motion Diffusion in Frequency Domain","summary":" In this study, we introduce a learning-based method for generating\nhigh-quality human motion sequences from text descriptions (e.g., ``A person\nwalks forward\"). Existing techniques struggle with motion diversity and smooth\ntransitions in generating arbitrary-length motion sequences, due to limited\ntext-to-motion datasets and the pose representations used that often lack\nexpressiveness or compactness. To address these issues, we propose the first\nmethod for text-conditioned human motion generation in the frequency domain of\nmotions. We develop a network encoder that converts the motion space into a\ncompact yet expressive parameterized phase space with high-frequency details\nencoded, capturing the local periodicity of motions in time and space with high\naccuracy. We also introduce a conditional diffusion model for predicting\nperiodic motion parameters based on text descriptions and a start pose,\nefficiently achieving smooth transitions between motion sequences associated\nwith different text descriptions. Experiments demonstrate that our approach\noutperforms current methods in generating a broader variety of high-quality\nmotions, and synthesizing long sequences with natural transitions.\n","authors":["Weilin Wan","Yiming Huang","Shutong Wu","Taku Komura","Wenping Wang","Dinesh Jayaraman","Lingjie Liu"],"pdf_url":"https://arxiv.org/pdf/2312.04036v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04032v1","updated":"2023-12-07T04:23:36Z","published":"2023-12-07T04:23:36Z","title":"RoAST: Robustifying Language Models via Adversarial Perturbation with\n Selective Training","summary":" Fine-tuning pre-trained language models (LMs) has become the de facto\nstandard in many NLP tasks. Nevertheless, fine-tuned LMs are still prone to\nrobustness issues, such as adversarial robustness and model calibration.\nSeveral perspectives of robustness for LMs have been studied independently, but\nlacking a unified consideration in multiple perspectives. In this paper, we\npropose Robustifying LMs via Adversarial perturbation with Selective Training\n(RoAST), a simple yet effective fine-tuning technique to enhance the\nmulti-perspective robustness of LMs in a unified way. RoAST effectively\nincorporates two important sources for the model robustness, robustness on the\nperturbed inputs and generalizable knowledge in pre-trained LMs. To be\nspecific, RoAST introduces adversarial perturbation during fine-tuning while\nthe model parameters are selectively updated upon their relative importance to\nminimize unnecessary deviation. 
Under a unified evaluation of fine-tuned LMs by\nincorporating four representative perspectives of model robustness, we\ndemonstrate the effectiveness of RoAST compared to state-of-the-art fine-tuning\nmethods on six different types of LMs, which indicates its usefulness in\npractice.\n","authors":["Jaehyung Kim","Yuning Mao","Rui Hou","Hanchao Yu","Davis Liang","Pascale Fung","Qifan Wang","Fuli Feng","Lifu Huang","Madian Khabsa"],"pdf_url":"https://arxiv.org/pdf/2312.04032v1.pdf","comment":"33 pages, accepted at EMNLP 2023 Findings"},{"id":"http://arxiv.org/abs/2312.04030v1","updated":"2023-12-07T03:55:51Z","published":"2023-12-07T03:55:51Z","title":"Modeling Boundedly Rational Agents with Latent Inference Budgets","summary":" We study the problem of modeling a population of agents pursuing unknown\ngoals subject to unknown computational constraints. In standard models of\nbounded rationality, sub-optimal decision-making is simulated by adding\nhomoscedastic noise to optimal decisions rather than explicitly simulating\nconstrained inference. In this work, we introduce a latent inference budget\nmodel (L-IBM) that models agents' computational constraints explicitly, via a\nlatent variable (inferred jointly with a model of agents' goals) that controls\nthe runtime of an iterative inference algorithm. L-IBMs make it possible to\nlearn agent models using data from diverse populations of suboptimal actors. In\nthree modeling tasks -- inferring navigation goals from routes, inferring\ncommunicative intents from human utterances, and predicting next moves in human\nchess games -- we show that L-IBMs match or outperform Boltzmann models of\ndecision-making under uncertainty. Inferred inference budgets are themselves\nmeaningful, efficient to compute, and correlated with measures of player skill,\npartner skill and task difficulty.\n","authors":["Athul Paul Jacob","Abhishek Gupta","Jacob Andreas"],"pdf_url":"https://arxiv.org/pdf/2312.04030v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04027v1","updated":"2023-12-07T03:53:17Z","published":"2023-12-07T03:53:17Z","title":"The sample complexity of multi-distribution learning","summary":" Multi-distribution learning generalizes the classic PAC learning to handle\ndata coming from multiple distributions. Given a set of $k$ data distributions\nand a hypothesis class of VC dimension $d$, the goal is to learn a hypothesis\nthat minimizes the maximum population loss over $k$ distributions, up to\n$\\epsilon$ additive error. In this paper, we settle the sample complexity of\nmulti-distribution learning by giving an algorithm of sample complexity\n$\\widetilde{O}((d+k)\\epsilon^{-2}) \\cdot (k/\\epsilon)^{o(1)}$. This matches the\nlower bound up to sub-polynomial factor and resolves the COLT 2023 open problem\nof Awasthi, Haghtalab and Zhao [AHZ23].\n","authors":["Binghui Peng"],"pdf_url":"https://arxiv.org/pdf/2312.04027v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2211.08586v2","updated":"2023-12-07T03:44:37Z","published":"2022-11-16T00:10:35Z","title":"Bandit Algorithms for Prophet Inequality and Pandora's Box","summary":" The Prophet Inequality and Pandora's Box problems are fundamental stochastic\nproblem with applications in Mechanism Design, Online Algorithms, Stochastic\nOptimization, Optimal Stopping, and Operations Research. A usual assumption in\nthese works is that the probability distributions of the $n$ underlying random\nvariables are given as input to the algorithm. 
Since in practice these\ndistributions need to be learned, we initiate the study of such stochastic\nproblems in the Multi-Armed Bandits model.\n In the Multi-Armed Bandits model we interact with $n$ unknown distributions\nover $T$ rounds: in round $t$ we play a policy $x^{(t)}$ and receive a partial\n(bandit) feedback on the performance of $x^{(t)}$. The goal is to minimize the\nregret, which is the difference over $T$ rounds in the total value of the\noptimal algorithm that knows the distributions vs. the total value of our\nalgorithm that learns the distributions from the partial feedback. Our main\nresults give near-optimal $\\tilde{O}(\\mathsf{poly}(n)\\sqrt{T})$ total regret\nalgorithms for both Prophet Inequality and Pandora's Box.\n Our proofs proceed by maintaining confidence intervals on the unknown indices\nof the optimal policy. The exploration-exploitation tradeoff prevents us from\ndirectly refining these confidence intervals, so the main technique is to\ndesign a regret upper bound that is learnable while playing low-regret Bandit\npolicies.\n","authors":["Khashayar Gatmiry","Thomas Kesselheim","Sahil Singla","Yifan Wang"],"pdf_url":"https://arxiv.org/pdf/2211.08586v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04024v1","updated":"2023-12-07T03:42:48Z","published":"2023-12-07T03:42:48Z","title":"k* Distribution: Evaluating the Latent Space of Deep Neural Networks\n using Local Neighborhood Analysis","summary":" Most examinations of neural networks' learned latent spaces typically employ\ndimensionality reduction techniques such as t-SNE or UMAP. While these methods\neffectively capture the overall sample distribution in the entire learned\nlatent space, they tend to distort the structure of sample distributions within\nspecific classes in the subset of the latent space. This distortion complicates\nthe task of easily distinguishing classes identifiable by neural networks. In\nresponse to this challenge, we introduce the k* Distribution methodology. This\napproach focuses on capturing the characteristics and structure of sample\ndistributions for individual classes within the subset of the learned latent\nspace using local neighborhood analysis. The key concept is to facilitate easy\ncomparison of different k* distributions, enabling analysis of how various\nclasses are processed by the same neural network. This provides a more profound\nunderstanding of existing contemporary visualizations. Our study reveals three\ndistinct distributions of samples within the learned latent space subset: a)\nFractured, b) Overlapped, and c) Clustered. We note and demonstrate that the\ndistribution of samples within the network's learned latent space significantly\nvaries depending on the class. 
Furthermore, we illustrate that our analysis can\nbe applied to explore the latent space of diverse neural network architectures,\nvarious layers within neural networks, transformations applied to input\nsamples, and the distribution of training and testing data for neural networks.\nWe anticipate that our approach will facilitate more targeted investigations\ninto neural networks by collectively examining the distribution of different\nsamples within the learned latent space.\n","authors":["Shashank Kotyan","Ueda Tatsuya","Danilo Vasconcellos Vargas"],"pdf_url":"https://arxiv.org/pdf/2312.04024v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.09200v2","updated":"2023-12-07T03:39:42Z","published":"2023-11-15T18:43:29Z","title":"ExpM+NF Tractable Exponential Mechanism via Normalizing Flow, A Path\n through the Accuracy-Privacy Ceiling Constraining Differentially Private ML","summary":" The Exponential Mechanism (ExpM), a differentially private optimization\nmethod, promises many advantages over Differentially Private Stochastic\nGradient Descent (DPSGD), the state-of-the-art (SOTA) and de facto method for\ndifferentially private machine learning (ML). Yet, ExpM has been historically\nstymied from differentially private training of modern ML algorithms by two\nobstructions: ExpM requires a sensitivity bound for the given loss function;\nExpM requires sampling from a historically intractable density. We prove a\nsensitivity bound for $\\ell(2)$ loss, and investigate using Normalizing Flows\n(NFs), deep networks furnishing approximate sampling from the otherwise\nintractable ExpM distribution. We prove that as the NF output converges to ExpM\ndistribution, the privacy ($\\varepsilon$) of an NF sample converges to that of\nthe ExpM distribution. Under the assumption that the NF output distribution is\nthe ExpM distribution, we empirically test ExpM+NF against DPSGD using the SOTA\nimplementation (Opacus \\cite{opacus} with PRV accounting) in multiple\nclassification tasks on the Adult Dataset (census data) and MIMIC-III Dataset\n(healthcare records) using Logistic Regression and GRU-D, a deep learning\nrecurrent neural network with \\smallsim 20K-100K parameters. In all experiments\nwe find ExpM+NF achieves greater than 94\\% of the non-private training accuracy\n(AUC) with $\\varepsilon$-DP for $\\varepsilon$ a low as $1\\mathrm{e}{-3}$ --\nthree orders of magnitude stronger privacy with similar accuracy. Further,\nperformance results show ExpM+NF training time is comparable to (slightly less)\nthan DPSGD. Limitations and future directions are provided; notably, research\non NF approximation accuracy and its effect on privacy are a promising avenue\nto substantially advancing the field. Code for these experiments \\hl{will be\nprovided after review}.\n","authors":["Robert A. Bridges","Vandy J. Tombs","Christopher B. Stanley"],"pdf_url":"https://arxiv.org/pdf/2311.09200v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04021v1","updated":"2023-12-07T03:37:39Z","published":"2023-12-07T03:37:39Z","title":"A Study on the Calibration of In-context Learning","summary":" Modern auto-regressive language models are trained to minimize log loss on\nbroad data by predicting the next token so they are expected to get calibrated\nanswers when framing a problem as a next-token prediction task. 
We study this\nfor in-context learning (ICL), a widely used way to adapt frozen large language\nmodels (LLMs) via crafting prompts, and investigate the trade-offs between\nperformance and calibration on a wide range of natural language understanding\nand reasoning tasks. We conduct extensive experiments to show that such\ntrade-offs may get worse as we increase model size, incorporate more ICL\nexamples, and fine-tune models using instruction, dialog, or reinforcement\nlearning from human feedback (RLHF) on carefully curated datasets. Furthermore,\nwe find that common recalibration techniques that are widely effective such as\ntemperature scaling provide limited gains in calibration errors, suggesting\nthat new methods may be required for settings where models are expected to be\nreliable.\n","authors":["Hanlin Zhang","Yi-Fan Zhang","Yaodong Yu","Dhruv Madeka","Dean Foster","Eric Xing","Hima Lakkaraju","Sham Kakade"],"pdf_url":"https://arxiv.org/pdf/2312.04021v1.pdf","comment":"Spotlight Talk at NeurIPS 2023 Workshop on Failure Modes in the Age\n of Foundation Models"},{"id":"http://arxiv.org/abs/2306.05566v2","updated":"2023-12-07T03:27:14Z","published":"2023-06-08T21:18:25Z","title":"Data-Adaptive Probabilistic Likelihood Approximation for Ordinary\n Differential Equations","summary":" Estimating the parameters of ordinary differential equations (ODEs) is of\nfundamental importance in many scientific applications. While ODEs are\ntypically approximated with deterministic algorithms, new research on\nprobabilistic solvers indicates that they produce more reliable parameter\nestimates by better accounting for numerical errors. However, many ODE systems\nare highly sensitive to their parameter values. This produces deep local maxima\nin the likelihood function -- a problem which existing probabilistic solvers\nhave yet to resolve. Here we present a novel probabilistic ODE likelihood\napproximation, DALTON, which can dramatically reduce parameter sensitivity by\nlearning from noisy ODE measurements in a data-adaptive manner. Our\napproximation scales linearly in both ODE variables and time discretization\npoints, and is applicable to ODEs with both partially-unobserved components and\nnon-Gaussian measurement models. Several examples demonstrate that DALTON\nproduces more accurate parameter estimates via numerical optimization than\nexisting probabilistic ODE solvers, and even in some cases than the exact ODE\nlikelihood itself.\n","authors":["Mohan Wu","Martin Lysy"],"pdf_url":"https://arxiv.org/pdf/2306.05566v2.pdf","comment":"8 pages, 7 figures"},{"id":"http://arxiv.org/abs/2311.15327v2","updated":"2023-12-07T03:21:06Z","published":"2023-11-26T15:11:17Z","title":"FRAC-Q-Learning: A Reinforcement Learning with Boredom Avoidance\n Processes for Social Robots","summary":" The reinforcement learning algorithms have often been applied to social\nrobots. However, most reinforcement learning algorithms were not optimized for\nthe use of social robots, and consequently they may bore users. We proposed a\nnew reinforcement learning method specialized for the social robot, the\nFRAC-Q-learning, that can avoid user boredom. The proposed algorithm consists\nof a forgetting process in addition to randomizing and categorizing processes.\nThis study evaluated interest and boredom hardness scores of the\nFRAC-Q-learning by a comparison with the traditional Q-learning. 
The\nFRAC-Q-learning showed significantly higher trend of interest score, and\nindicated significantly harder to bore users compared to the traditional\nQ-learning. Therefore, the FRAC-Q-learning can contribute to develop a social\nrobot that will not bore users. The proposed algorithm can also find\napplications in Web-based communication and educational systems. This paper\npresents the entire process, detailed implementation and a detailed evaluation\nmethod of the of the FRAC-Q-learning for the first time.\n","authors":["Akinari Onishi"],"pdf_url":"https://arxiv.org/pdf/2311.15327v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2310.02563v2","updated":"2023-12-07T02:50:42Z","published":"2023-10-04T03:47:21Z","title":"Practical, Private Assurance of the Value of Collaboration","summary":" Two parties wish to collaborate on their datasets. However, before they\nreveal their datasets to each other, the parties want to have the guarantee\nthat the collaboration would be fruitful. We look at this problem from the\npoint of view of machine learning, where one party is promised an improvement\non its prediction model by incorporating data from the other party. The parties\nwould only wish to collaborate further if the updated model shows an\nimprovement in accuracy. Before this is ascertained, the two parties would not\nwant to disclose their models and datasets. In this work, we construct an\ninteractive protocol for this problem based on the fully homomorphic encryption\nscheme over the Torus (TFHE) and label differential privacy, where the\nunderlying machine learning model is a neural network. Label differential\nprivacy is used to ensure that computations are not done entirely in the\nencrypted domain, which is a significant bottleneck for neural network training\naccording to the current state-of-the-art FHE implementations. We prove the\nsecurity of our scheme in the universal composability framework assuming\nhonest-but-curious parties, but where one party may not have any expertise in\nlabelling its initial dataset. Experiments show that we can obtain the output,\ni.e., the accuracy of the updated model, with time many orders of magnitude\nfaster than a protocol using entirely FHE operations.\n","authors":["Hassan Jameel Asghar","Zhigang Lu","Zhongrui Zhao","Dali Kaafar"],"pdf_url":"https://arxiv.org/pdf/2310.02563v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.00271v2","updated":"2023-12-07T02:49:11Z","published":"2023-12-01T01:11:16Z","title":"Towards Clinical Prediction with Transparency: An Explainable AI\n Approach to Survival Modelling in Residential Aged Care","summary":" Background: Accurate survival time estimates aid end-of-life medical\ndecision-making. Objectives: Develop an interpretable survival model for\nelderly residential aged care residents using advanced machine learning.\nSetting: A major Australasian residential aged care provider. Participants:\nResidents aged 65+ admitted for long-term care from July 2017 to August 2023.\nSample size: 11,944 residents across 40 facilities. Predictors: Factors include\nage, gender, health status, co-morbidities, cognitive function, mood,\nnutrition, mobility, smoking, sleep, skin integrity, and continence. Outcome:\nProbability of survival post-admission, specifically calibrated for 6-month\nsurvival estimates. Statistical Analysis: Tested CoxPH, EN, RR, Lasso, GB, XGB,\nand RF models in 20 experiments with a 90/10 train/test split. 
Evaluated\naccuracy using C-index, Harrell's C-index, dynamic AUROC, IBS, and calibrated\nROC. Chose XGB for its performance and calibrated it for 1, 3, 6, and 12-month\npredictions using Platt scaling. Employed SHAP values to analyze predictor\nimpacts. Results: GB, XGB, and RF models showed the highest C-Index values\n(0.714, 0.712, 0.712). The optimal XGB model demonstrated a 6-month survival\nprediction AUROC of 0.746 (95% CI 0.744-0.749). Key mortality predictors\ninclude age, male gender, mobility, health status, pressure ulcer risk, and\nappetite. Conclusions: The study successfully applies machine learning to\ncreate a survival model for aged care, aligning with clinical insights on\nmortality risk factors and enhancing model interpretability and clinical\nutility through explainable AI.\n","authors":["Teo Susnjak","Elise Griffin","Mitchell McCutcheon","Kathleen Potter"],"pdf_url":"https://arxiv.org/pdf/2312.00271v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.04000v1","updated":"2023-12-07T02:31:28Z","published":"2023-12-07T02:31:28Z","title":"LiDAR: Sensing Linear Probing Performance in Joint Embedding SSL\n Architectures","summary":" Joint embedding (JE) architectures have emerged as a promising avenue for\nacquiring transferable data representations. A key obstacle to using JE\nmethods, however, is the inherent challenge of evaluating learned\nrepresentations without access to a downstream task, and an annotated dataset.\nWithout efficient and reliable evaluation, it is difficult to iterate on\narchitectural and training choices for JE methods. In this paper, we introduce\nLiDAR (Linear Discriminant Analysis Rank), a metric designed to measure the\nquality of representations within JE architectures. Our metric addresses\nseveral shortcomings of recent approaches based on feature covariance rank by\ndiscriminating between informative and uninformative features. In essence,\nLiDAR quantifies the rank of the Linear Discriminant Analysis (LDA) matrix\nassociated with the surrogate SSL task -- a measure that intuitively captures\nthe information content as it pertains to solving the SSL task. We empirically\ndemonstrate that LiDAR significantly surpasses naive rank based approaches in\nits predictive power of optimal hyperparameters. Our proposed criterion\npresents a more robust and intuitive means of assessing the quality of\nrepresentations within JE architectures, which we hope facilitates broader\nadoption of these powerful techniques in various domains.\n","authors":["Vimal Thilak","Chen Huang","Omid Saremi","Laurent Dinh","Hanlin Goh","Preetum Nakkiran","Joshua M. Susskind","Etai Littwin"],"pdf_url":"https://arxiv.org/pdf/2312.04000v1.pdf","comment":"Technical report"},{"id":"http://arxiv.org/abs/2312.03998v1","updated":"2023-12-07T02:30:40Z","published":"2023-12-07T02:30:40Z","title":"Series2Vec: Similarity-based Self-supervised Representation Learning for\n Time Series Classification","summary":" We argue that time series analysis is fundamentally different in nature to\neither vision or natural language processing with respect to the forms of\nmeaningful self-supervised learning tasks that can be defined. Motivated by\nthis insight, we introduce a novel approach called \\textit{Series2Vec} for\nself-supervised representation learning. 
Unlike other self-supervised methods\nin time series, which carry the risk of positive sample variants being less\nsimilar to the anchor sample than series in the negative set, Series2Vec is\ntrained to predict the similarity between two series in both temporal and\nspectral domains through a self-supervised task. Series2Vec relies primarily on\nthe consistency of the unsupervised similarity step, rather than the intrinsic\nquality of the similarity measurement, without the need for hand-crafted data\naugmentation. To further enforce the network to learn similar representations\nfor similar time series, we propose a novel approach that applies\norder-invariant attention to each representation within the batch during\ntraining. Our evaluation of Series2Vec on nine large real-world datasets, along\nwith the UCR/UEA archive, shows enhanced performance compared to current\nstate-of-the-art self-supervised techniques for time series. Additionally, our\nextensive experiments show that Series2Vec performs comparably with fully\nsupervised training and offers high efficiency in datasets with limited-labeled\ndata. Finally, we show that the fusion of Series2Vec with other representation\nlearning models leads to enhanced performance for time series classification.\nCode and models are open-source at\n\\url{https://github.com/Navidfoumani/Series2Vec.}\n","authors":["Navid Mohammadi Foumani","Chang Wei Tan","Geoffrey I. Webb","Mahsa Salehi"],"pdf_url":"https://arxiv.org/pdf/2312.03998v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03651v2","updated":"2023-12-07T02:26:52Z","published":"2023-12-06T18:13:21Z","title":"MIRACLE: Inverse Reinforcement and Curriculum Learning Model for\n Human-inspired Mobile Robot Navigation","summary":" In emergency scenarios, mobile robots must navigate like humans, interpreting\nstimuli to locate potential victims rapidly without interfering with first\nresponders. Existing socially-aware navigation algorithms face computational\nand adaptability challenges. To overcome these, we propose a solution, MIRACLE\n-- an inverse reinforcement and curriculum learning model, that employs\ngamified learning to gather stimuli-driven human navigational data. This data\nis then used to train a Deep Inverse Maximum Entropy Reinforcement Learning\nmodel, reducing reliance on demonstrator abilities. Testing reveals a low loss\nof 2.7717 within a 400-sized environment, signifying human-like response\nreplication. Current databases lack comprehensive stimuli-driven data,\nnecessitating our approach. By doing so, we enable robots to navigate emergency\nsituations with human-like perception, enhancing their life-saving\ncapabilities.\n","authors":["Nihal Gunukula","Kshitij Tiwari","Aniket Bera"],"pdf_url":"https://arxiv.org/pdf/2312.03651v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03991v1","updated":"2023-12-07T02:17:45Z","published":"2023-12-07T02:17:45Z","title":"MICRO: Model-Based Offline Reinforcement Learning with a Conservative\n Bellman Operator","summary":" Offline reinforcement learning (RL) faces a significant challenge of\ndistribution shift. Model-free offline RL penalizes the Q value for\nout-of-distribution (OOD) data or constrains the policy closed to the behavior\npolicy to tackle this problem, but this inhibits the exploration of the OOD\nregion. Model-based offline RL, which uses the trained environment model to\ngenerate more OOD data and performs conservative policy optimization within\nthat model, has become an effective method for this problem. 
However, the\ncurrent model-based algorithms rarely consider agent robustness when\nincorporating conservatism into policy. Therefore, the new model-based offline\nalgorithm with a conservative Bellman operator (MICRO) is proposed. This method\ntrades off performance and robustness via introducing the robust Bellman\noperator into the algorithm. Compared with previous model-based algorithms with\nrobust adversarial models, MICRO can significantly reduce the computation cost\nby only choosing the minimal Q value in the state uncertainty set. Extensive\nexperiments demonstrate that MICRO outperforms prior RL algorithms in offline\nRL benchmark and is considerably robust to adversarial perturbations.\n","authors":["Xiao-Yin Liu","Xiao-Hu Zhou","Guo-Tao Li","Hao Li","Mei-Jiang Gui","Tian-Yu Xiang","De-Xing Huang","Zeng-Guang Hou"],"pdf_url":"https://arxiv.org/pdf/2312.03991v1.pdf","comment":"13 pages, 5 figures"},{"id":"http://arxiv.org/abs/2312.03989v1","updated":"2023-12-07T02:14:39Z","published":"2023-12-07T02:14:39Z","title":"Rapid detection of rare events from in situ X-ray diffraction data using\n machine learning","summary":" High-energy X-ray diffraction methods can non-destructively map the 3D\nmicrostructure and associated attributes of metallic polycrystalline\nengineering materials in their bulk form. These methods are often combined with\nexternal stimuli such as thermo-mechanical loading to take snapshots over time\nof the evolving microstructure and attributes. However, the extreme data\nvolumes and the high costs of traditional data acquisition and reduction\napproaches pose a barrier to quickly extracting actionable insights and\nimproving the temporal resolution of these snapshots. Here we present a fully\nautomated technique capable of rapidly detecting the onset of plasticity in\nhigh-energy X-ray microscopy data. Our technique is computationally faster by\nat least 50 times than the traditional approaches and works for data sets that\nare up to 9 times sparser than a full data set. This new technique leverages\nself-supervised image representation learning and clustering to transform\nmassive data into compact, semantic-rich representations of visually salient\ncharacteristics (e.g., peak shapes). These characteristics can be a rapid\nindicator of anomalous events such as changes in diffraction peak shapes. We\nanticipate that this technique will provide just-in-time actionable information\nto drive smarter experiments that effectively deploy multi-modal X-ray\ndiffraction methods that span many decades of length scales.\n","authors":["Weijian Zheng","Jun-Sang Park","Peter Kenesei","Ahsan Ali","Zhengchun Liu","Ian T. Foster","Nicholas Schwarz","Rajkumar Kettimuthu","Antonino Miceli","Hemant Sharma"],"pdf_url":"https://arxiv.org/pdf/2312.03989v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.15549v2","updated":"2023-12-07T02:08:13Z","published":"2023-11-27T05:29:43Z","title":"From Prediction to Action: Critical Role of Performance Estimation for\n Machine-Learning-Driven Materials Discovery","summary":" Materials discovery driven by statistical property models is an iterative\ndecision process, during which an initial data collection is extended with new\ndata proposed by a model-informed acquisition function--with the goal to\nmaximize a certain \"reward\" over time, such as the maximum property value\ndiscovered so far. 
While the materials science community achieved much progress\nin developing property models that predict well on average with respect to the\ntraining distribution, this form of in-distribution performance measurement is\nnot directly coupled with the discovery reward. This is because an iterative\ndiscovery process has a shifting reward distribution that is\nover-proportionally determined by the model performance for exceptional\nmaterials. We demonstrate this problem using the example of bulk modulus\nmaximization among double perovskite oxides. We find that the in-distribution\npredictive performance suggests random forests as superior to Gaussian process\nregression, while the results are inverse in terms of the discovery rewards. We\nargue that the lack of proper performance estimation methods from pre-computed\ndata collections is a fundamental problem for improving data-driven materials\ndiscovery, and we propose a novel such estimator that, in contrast to na\\\"ive\nreward estimation, successfully predicts Gaussian processes with the \"expected\nimprovement\" acquisition function as the best out of four options in our\ndemonstrational study for double perovskites. Importantly, it does so without\nrequiring the over thousand ab initio computations that were needed to confirm\nthis prediction.\n","authors":["Mario Boley","Felix Luong","Simon Teshuva","Daniel F Schmidt","Lucas Foppa","Matthias Scheffler"],"pdf_url":"https://arxiv.org/pdf/2311.15549v2.pdf","comment":"Simplified notation"},{"id":"http://arxiv.org/abs/2206.04979v4","updated":"2023-12-07T02:03:25Z","published":"2022-06-10T10:28:55Z","title":"Convolutional layers are equivariant to discrete shifts but not\n continuous translations","summary":" The purpose of this short and simple note is to clarify a common\nmisconception about convolutional neural networks (CNNs). CNNs are made up of\nconvolutional layers which are shift equivariant due to weight sharing.\nHowever, convolutional layers are not translation equivariant, even when\nboundary effects are ignored and when pooling and subsampling are absent. This\nis because shift equivariance is a discrete symmetry while translation\nequivariance is a continuous symmetry. This fact is well known among\nresearchers in equivariant machine learning, but is usually overlooked among\nnon-experts. To minimize confusion, we suggest using the term `shift\nequivariance' to refer to discrete shifts in pixels and `translation\nequivariance' to refer to continuous translations.\n","authors":["Nick McGreivy","Ammar Hakim"],"pdf_url":"https://arxiv.org/pdf/2206.04979v4.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03196v2","updated":"2023-12-07T01:58:54Z","published":"2023-12-06T00:28:08Z","title":"Domain Invariant Representation Learning and Sleep Dynamics Modeling for\n Automatic Sleep Staging","summary":" Sleep staging has become a critical task in diagnosing and treating sleep\ndisorders to prevent sleep related diseases. With rapidly growing large scale\npublic sleep databases and advances in machine learning, significant progress\nhas been made toward automatic sleep staging. However, previous studies face\nsome critical problems in sleep studies; the heterogeneity of subjects'\nphysiological signals, the inability to extract meaningful information from\nunlabeled sleep signal data to improve predictive performances, the difficulty\nin modeling correlations between sleep stages, and the lack of an effective\nmechanism to quantify predictive uncertainty. 
In this study, we propose a\nneural network based automatic sleep staging model, named DREAM, to learn\ndomain generalized representations from physiological signals and models sleep\ndynamics. DREAM learns sleep related and subject invariant representations from\ndiverse subjects' sleep signal segments and models sleep dynamics by capturing\ninteractions between sequential signal segments and between sleep stages. In\nthe experiments, we demonstrate that DREAM outperforms the existing sleep\nstaging methods on three datasets. The case study demonstrates that our model\ncan learn the generalized decision function resulting in good prediction\nperformances for the new subjects, especially in case there are differences\nbetween testing and training subjects. The usage of unlabeled data shows the\nbenefit of leveraging unlabeled EEG data. Further, uncertainty quantification\ndemonstrates that DREAM provides prediction uncertainty, making the model\nreliable and helping sleep experts in real world applications.\n","authors":["Seungyeon Lee","Thai-Hoang Pham","Zhao Cheng","Ping Zhang"],"pdf_url":"https://arxiv.org/pdf/2312.03196v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03406v2","updated":"2023-12-07T01:24:54Z","published":"2023-12-06T10:42:40Z","title":"SVQ: Sparse Vector Quantization for Spatiotemporal Forecasting","summary":" Spatiotemporal forecasting tasks, such as weather forecasting and traffic\nprediction, offer significant societal benefits. These tasks can be effectively\napproached as image forecasting problems using computer vision models. Vector\nquantization (VQ) is a well-known method for discrete representation that\nimproves the latent space, leading to enhanced generalization and transfer\nlearning capabilities. One of the main challenges in using VQ for\nspatiotemporal forecasting is how to balance between keeping enough details and\nremoving noises from the original patterns for better generalization. We\naddress this challenge by developing sparse vector quantization, or {\\bf SVQ}\nfor short, that leverages sparse regression to make better trade-off between\nthe two objectives. The main innovation of this work is to approximate sparse\nregression by a two-layer MLP and a randomly fixed or learnable matrix,\ndramatically improving its computational efficiency. Through experiments\nconducted on diverse datasets in multiple fields including weather forecasting,\ntraffic flow prediction, and video forecasting, we unequivocally demonstrate\nthat our proposed method consistently enhances the performance of base models\nand achieves state-of-the-art results across all benchmarks.\n","authors":["Chao Chen","Tian Zhou","Yanjun Zhao","Hui Liu","Liang Sun","Rong Jin"],"pdf_url":"https://arxiv.org/pdf/2312.03406v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2312.03979v1","updated":"2023-12-07T01:24:48Z","published":"2023-12-07T01:24:48Z","title":"Node-aware Bi-smoothing: Certified Robustness against Graph Injection\n Attacks","summary":" Deep Graph Learning (DGL) has emerged as a crucial technique across various\ndomains. However, recent studies have exposed vulnerabilities in DGL models,\nsuch as susceptibility to evasion and poisoning attacks. While empirical and\nprovable robustness techniques have been developed to defend against graph\nmodification attacks (GMAs), the problem of certified robustness against graph\ninjection attacks (GIAs) remains largely unexplored. 
To bridge this gap, we\nintroduce the node-aware bi-smoothing framework, which is the first certifiably\nrobust approach for general node classification tasks against GIAs. Notably,\nthe proposed node-aware bi-smoothing scheme is model-agnostic and is applicable\nfor both evasion and poisoning attacks. Through rigorous theoretical analysis,\nwe establish the certifiable conditions of our smoothing scheme. We also\nexplore the practical implications of our node-aware bi-smoothing schemes in\ntwo contexts: as an empirical defense approach against real-world GIAs and in\nthe context of recommendation systems. Furthermore, we extend two\nstate-of-the-art certified robustness frameworks to address node injection\nattacks and compare our approach against them. Extensive evaluations\ndemonstrate the effectiveness of our proposed certificates.\n","authors":["Yuni Lai","Yulin Zhu","Bailin Pan","Kai Zhou"],"pdf_url":"https://arxiv.org/pdf/2312.03979v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2202.09517v2","updated":"2023-12-07T01:07:13Z","published":"2022-02-19T03:48:20Z","title":"Deep Learning for Hate Speech Detection: A Comparative Study","summary":" Automated hate speech detection is an important tool in combating the spread\nof hate speech, particularly in social media. Numerous methods have been\ndeveloped for the task, including a recent proliferation of deep-learning based\napproaches. A variety of datasets have also been developed, exemplifying\nvarious manifestations of the hate-speech detection problem. We present here a\nlarge-scale empirical comparison of deep and shallow hate-speech detection\nmethods, mediated through the three most commonly used datasets. Our goal is to\nilluminate progress in the area, and identify strengths and weaknesses in the\ncurrent state-of-the-art. We particularly focus our analysis on measures of\npractical performance, including detection accuracy, computational efficiency,\ncapability in using pre-trained models, and domain generalization. In doing so\nwe aim to provide guidance as to the use of hate-speech detection in practice,\nquantify the state-of-the-art, and identify future research directions. Code\nand dataset are available at\nhttps://github.com/jmjmalik22/Hate-Speech-Detection.\n","authors":["Jitendra Singh Malik","Hezhe Qiao","Guansong Pang","Anton van den Hengel"],"pdf_url":"https://arxiv.org/pdf/2202.09517v2.pdf","comment":"18 pages, 4 figures, and 6 tables"},{"id":"http://arxiv.org/abs/2312.03957v1","updated":"2023-12-07T00:07:35Z","published":"2023-12-07T00:07:35Z","title":"PerSival: Neural-network-based visualisation for pervasive\n continuum-mechanical simulations in musculoskeletal biomechanics","summary":" This paper presents a novel neural network architecture for the purpose of\npervasive visualisation of a 3D human upper limb musculoskeletal system model.\nBringing simulation capabilities to resource-poor systems like mobile devices\nis of growing interest across many research fields, to widen applicability of\nmethods and results. Until recently, this goal was thought to be out of reach\nfor realistic continuum-mechanical simulations of musculoskeletal systems, due\nto prohibitive computational cost. Within this work we use a sparse grid\nsurrogate to capture the surface deformation of the m.~biceps brachii in order\nto train a deep learning model, used for real-time visualisation of the same\nmuscle. 
Both these surrogate models take 5 muscle activation levels as input\nand output Cartesian coordinate vectors for each mesh node on the muscle's\nsurface. Thus, the neural network architecture features a significantly lower\ninput than output dimension. 5 muscle activation levels were sufficient to\nachieve an average error of 0.97 +/- 0.16 mm, or 0.57 +/- 0.10 % for the 2809\nmesh node positions of the biceps. The model achieved evaluation times of 9.88\nms per predicted deformation state on CPU only and 3.48 ms with GPU-support,\nleading to theoretical frame rates of 101 fps and 287 fps respectively. Deep\nlearning surrogates thus provide a way to make continuum-mechanical simulations\naccessible for visual real-time applications.\n","authors":["David Rosin","Johannes Kässinger","Xingyao Yu","Okan Avci","Christian Bleiler","Oliver Röhrle"],"pdf_url":"https://arxiv.org/pdf/2312.03957v1.pdf","comment":"10 pages, 4 figures, 5 tables, to be submitted to Medical Image\n Analysis"}],"Multimedia":[{"id":"http://arxiv.org/abs/2312.04552v1","updated":"2023-12-07T18:59:20Z","published":"2023-12-07T18:59:20Z","title":"Generating Illustrated Instructions","summary":" We introduce the new task of generating Illustrated Instructions, i.e.,\nvisual instructions customized to a user's needs. We identify desiderata unique\nto this task, and formalize it through a suite of automatic and human\nevaluation metrics, designed to measure the validity, consistency, and efficacy\nof the generations. We combine the power of large language models (LLMs)\ntogether with strong text-to-image generation diffusion models to propose a\nsimple approach called StackedDiffusion, which generates such illustrated\ninstructions given text as input. The resulting model strongly outperforms\nbaseline approaches and state-of-the-art multimodal LLMs; and in 30% of cases,\nusers even prefer it to human-generated articles. Most notably, it enables\nvarious new and exciting applications far beyond what static articles on the\nweb can provide, such as personalized instructions complete with intermediate\nsteps and pictures in response to a user's individual situation.\n","authors":["Sachit Menon","Ishan Misra","Rohit Girdhar"],"pdf_url":"https://arxiv.org/pdf/2312.04552v1.pdf","comment":"Project website:\n http://facebookresearch.github.io/IllustratedInstructions"},{"id":"http://arxiv.org/abs/2312.04461v1","updated":"2023-12-07T17:32:29Z","published":"2023-12-07T17:32:29Z","title":"PhotoMaker: Customizing Realistic Human Photos via Stacked ID Embedding","summary":" Recent advances in text-to-image generation have made remarkable progress in\nsynthesizing realistic human photos conditioned on given text prompts. However,\nexisting personalized generation methods cannot simultaneously satisfy the\nrequirements of high efficiency, promising identity (ID) fidelity, and flexible\ntext controllability. In this work, we introduce PhotoMaker, an efficient\npersonalized text-to-image generation method, which mainly encodes an arbitrary\nnumber of input ID images into a stack ID embedding for preserving ID\ninformation. Such an embedding, serving as a unified ID representation, can not\nonly encapsulate the characteristics of the same input ID comprehensively, but\nalso accommodate the characteristics of different IDs for subsequent\nintegration. This paves the way for more intriguing and practically valuable\napplications. 
Besides, to drive the training of our PhotoMaker, we propose an\nID-oriented data construction pipeline to assemble the training data. Under the\nnourishment of the dataset constructed through the proposed pipeline, our\nPhotoMaker demonstrates better ID preservation ability than test-time\nfine-tuning based methods, yet provides significant speed improvements,\nhigh-quality generation results, strong generalization capabilities, and a wide\nrange of applications. Our project page is available at\nhttps://photo-maker.github.io/\n","authors":["Zhen Li","Mingdeng Cao","Xintao Wang","Zhongang Qi","Ming-Ming Cheng","Ying Shan"],"pdf_url":"https://arxiv.org/pdf/2312.04461v1.pdf","comment":"Tech report; Project page: https://photo-maker.github.io/"},{"id":"http://arxiv.org/abs/2312.04435v1","updated":"2023-12-07T16:57:38Z","published":"2023-12-07T16:57:38Z","title":"Deep3DSketch: 3D modeling from Free-hand Sketches with View- and\n Structural-Aware Adversarial Training","summary":" This work aims to investigate the problem of 3D modeling using single\nfree-hand sketches, which is one of the most natural ways we humans express\nideas. Although sketch-based 3D modeling can drastically make the 3D modeling\nprocess more accessible, the sparsity and ambiguity of sketches bring\nsignificant challenges for creating high-fidelity 3D models that reflect the\ncreators' ideas. In this work, we propose a view- and structural-aware deep\nlearning approach, \\textit{Deep3DSketch}, which tackles the ambiguity and fully\nuses sparse information of sketches, emphasizing the structural information.\nSpecifically, we introduced random pose sampling on both 3D shapes and 2D\nsilhouettes, and an adversarial training scheme with an effective progressive\ndiscriminator to facilitate learning of the shape structures. Extensive\nexperiments demonstrated the effectiveness of our approach, which outperforms\nexisting methods -- with state-of-the-art (SOTA) performance on both synthetic\nand real datasets.\n","authors":["Tianrun Chen","Chenglong Fu","Lanyun Zhu","Papa Mao","Jia Zhang","Ying Zang","Lingyun Sun"],"pdf_url":"https://arxiv.org/pdf/2312.04435v1.pdf","comment":"ICASSP 2023. arXiv admin note: substantial text overlap with\n arXiv:2310.18148"},{"id":"http://arxiv.org/abs/2312.04293v1","updated":"2023-12-07T13:27:37Z","published":"2023-12-07T13:27:37Z","title":"GPT-4V with Emotion: A Zero-shot Benchmark for Multimodal Emotion\n Understanding","summary":" Recently, GPT-4 with Vision (GPT-4V) has shown remarkable performance across\nvarious multimodal tasks. However, its efficacy in emotion recognition remains\na question. This paper quantitatively evaluates GPT-4V's capabilities in\nmultimodal emotion understanding, encompassing tasks such as facial emotion\nrecognition, visual sentiment analysis, micro-expression recognition, dynamic\nfacial emotion recognition, and multimodal emotion recognition. Our experiments\nshow that GPT-4V exhibits impressive multimodal and temporal understanding\ncapabilities, even surpassing supervised systems in some tasks. Despite these\nachievements, GPT-4V is currently tailored for general domains. It performs\npoorly in micro-expression recognition that requires specialized expertise. 
The\nmain purpose of this paper is to present quantitative results of GPT-4V on\nemotion understanding and establish a zero-shot benchmark for future research.\nCode and evaluation results are available at:\nhttps://github.com/zeroQiaoba/gpt4v-emotion.\n","authors":["Zheng Lian","Licai Sun","Haiyang Sun","Kang Chen","Zhuofan Wen","Hao Gu","Shun Chen","Bin Liu","Jianhua Tao"],"pdf_url":"https://arxiv.org/pdf/2312.04293v1.pdf","comment":null},{"id":"http://arxiv.org/abs/2205.08324v3","updated":"2023-12-07T10:08:04Z","published":"2022-05-17T13:20:30Z","title":"Exploring the Interactive Guidance for Unified and Effective Image\n Matting","summary":" Recent image matting studies are developing towards proposing trimap-free or\ninteractive methods for complete complex image matting tasks. Although avoiding\nthe extensive labors of trimap annotation, existing methods still suffer from\ntwo limitations: (1) For the single image with multiple objects, it is\nessential to provide extra interaction information to help determining the\nmatting target; (2) For transparent objects, the accurate regression of alpha\nmatte from RGB image is much more difficult compared with the opaque ones. In\nthis work, we propose a Unified Interactive image Matting method, named UIM,\nwhich solves the limitations and achieves satisfying matting results for any\nscenario. Specifically, UIM leverages multiple types of user interaction to\navoid the ambiguity of multiple matting targets, and we compare the pros and\ncons of different annotation types in detail. To unify the matting performance\nfor transparent and opaque objects, we decouple image matting into two stages,\ni.e., foreground segmentation and transparency prediction. Moreover, we design\na multi-scale attentive fusion module to alleviate the vagueness in the\nboundary region. Experimental results demonstrate that UIM achieves\nstate-of-the-art performance on the Composition-1K test set and a synthetic\nunified dataset.\n","authors":["Dinghao Yang","Bin Wang","Weijia Li","Yiqi Lin","Conghui He"],"pdf_url":"https://arxiv.org/pdf/2205.08324v3.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.02183v2","updated":"2023-12-07T08:48:37Z","published":"2023-11-03T18:27:43Z","title":"A New Fine-grained Alignment Method for Image-text Matching","summary":" Image-text retrieval is a widely studied topic in the field of computer\nvision due to the exponential growth of multimedia data, whose core concept is\nto measure the similarity between images and text. However, most existing\nretrieval methods heavily rely on cross-attention mechanisms for cross-modal\nfine-grained alignment, which takes into account excessive irrelevant regions\nand treats prominent and non-significant words equally, thereby limiting\nretrieval accuracy. This paper aims to investigate an alignment approach that\nreduces the involvement of non-significant fragments in images and text while\nenhancing the alignment of prominent segments. For this purpose, we introduce\nthe Cross-Modal Prominent Fragments Enhancement Aligning Network(CPFEAN), which\nachieves improved retrieval accuracy by diminishing the participation of\nirrelevant regions during alignment and relatively increasing the alignment\nsimilarity of prominent words. Additionally, we incorporate prior textual\ninformation into image regions to reduce misalignment occurrences. 
In practice,\nwe first design a novel intra-modal fragments relationship reasoning method,\nand subsequently employ our proposed alignment mechanism to compute the\nsimilarity between images and text. Extensive quantitative comparative\nexperiments on MS-COCO and Flickr30K datasets demonstrate that our approach\noutperforms state-of-the-art methods by about 5% to 10% in the rSum metric.\n","authors":["Yang Zhang"],"pdf_url":"https://arxiv.org/pdf/2311.02183v2.pdf","comment":null},{"id":"http://arxiv.org/abs/2311.11642v2","updated":"2023-12-07T05:47:50Z","published":"2023-11-20T10:01:13Z","title":"Video Face Re-Aging: Toward Temporally Consistent Face Re-Aging","summary":" Video face re-aging deals with altering the apparent age of a person to the\ntarget age in videos. This problem is challenging due to the lack of paired\nvideo datasets maintaining temporal consistency in identity and age. Most\nre-aging methods process each image individually without considering the\ntemporal consistency of videos. While some existing works address the issue of\ntemporal coherence through video facial attribute manipulation in latent space,\nthey often fail to deliver satisfactory performance in age transformation. To\ntackle the issues, we propose (1) a novel synthetic video dataset that features\nsubjects across a diverse range of age groups; (2) a baseline architecture\ndesigned to validate the effectiveness of our proposed dataset, and (3) the\ndevelopment of three novel metrics tailored explicitly for evaluating the\ntemporal consistency of video re-aging techniques. Our comprehensive\nexperiments on public datasets, such as VFHQ and CelebV-HQ, show that our\nmethod outperforms the existing approaches in terms of both age transformation\nand temporal consistency.\n","authors":["Abdul Muqeet","Kyuchul Lee","Bumsoo Kim","Yohan Hong","Hyungrae Lee","Woonggon Kim","KwangHee Lee"],"pdf_url":"https://arxiv.org/pdf/2311.11642v2.pdf","comment":"8 pages, 6 figures, 4 tables, Project page:\n https://video-reaging.github.io/"}]}} \ No newline at end of file diff --git a/favicon.ico b/favicon.ico new file mode 100644 index 00000000..7f5166c7 Binary files /dev/null and b/favicon.ico differ diff --git a/index.css b/index.css new file mode 100644 index 00000000..9ded9d94 --- /dev/null +++ b/index.css @@ -0,0 +1,355 @@ +:root { + /* Palette: Nord (https://www.nordtheme.com)*/ + --nord00: #2e3440; + --nord01: #3b4252; + --nord02: #434c5e; + --nord03: #4c566a; + --nord04: #d8dee9; + --nord05: #e5e9f0; + --nord06: #eceff4; + --nord07: #8fbcbb; + --nord08: #88c0d0; + --nord09: #81a1c1; + --nord0A: #5e81ac; + --nord0B: #bf616a; + --nord0C: #d08770; + --nord0D: #ebcb8b; + --nord0E: #a3be8c; + --nord0F: #b48ead; + + + /* Typograph */ + --font-family-default: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Oxygen-Sans, Ubuntu, Cantarell, "Helvetica Neue", + sans-serif; + --font-size-scaler: 62.5%; + --font-size-m: 1.6rem; + --font-size-s: 1.4rem; + + /* Components */ + --body-color: var(--nord06); + --body-bg: var(--nord00); + + --header-title: var(--nord06); + --header-container: var(--nord00); + --header-title-preffix: var(--nord0F); + + --chip-font: var(--nord08); + --chip-color: var(--nord0B); + + --icons: var(--nord06); + --icons-hover: var(--nord0F); + + --day-container: var(--nord01); + --date: var(--nord09); + + --summary: var(--nord0E); + --summary-hover: var(--nord0F); + + --details-open: var(--nord02); + --details-content: var(--nord05); + --details-a: var(--nord07); + --details-a-hover: var(--nord0F); + 
+ --highlight-title: var(--nord0B); + --highlight-author: var(--nord0B); + + --article-summary-hover-color: var(--nord0D); + --article-summary-color: var(--nord04); + + --article-title-color: var(--nord05); + --article-title-hover-color: var(--nord0E); + + --accordion-content-rail-color: var(--nord01); + --accordion-content-hover-rail-color: var(--nord0D); + --accordion-title-marker-color: var(--nord01); + --accordion-title-hover-marker-color: var(--nord0E); + + --footer-color: var(--nord04); + --footer-link-hover-color: var(--nord0D); +} + +[data-theme="light"] { + /* Theme design */ + + --color-primary: var(--nord07); + --color-primary-second: var(--nord00); + --color-info: var(--nord0A); + --color-success: var(--nord0E); + --color-warning: var(--nord0C); + --color-danger: var(--nord0B); + + --color-text: var(--nord00); + --color-hover: var(--nord0D); + --color-shadow: var(--nord03); + + --color-primary-h: var(--nord09); + --color-primary-s: var(--nord08); + --color-primary-l: var(--nord07); + + --color-contrast-higher-h: var(--nord01); + --color-contrast-higher-l: var(--nord02); + --color-contrast-higher-s: var(--nord03); + + --color-content: white; + + --background: var(--nord06); + --background-content: var(--nord05); + --background-color: var(--nord04); + + /* Components */ + + --chip-font: var(--nord06); + --chip-color: var(--nord09); + + --body-color: var(--background-color); + --body-bg: var(--background); + + --header-title: var(--color-shadow); + --header-container: var(--background); + --header-title-preffix: var(--color-primary-h); + + --icons: var(--color-shadow); + --icons-hover: var(--color-hover); + + --day-container: var(--background-content); + --date: var(--color-primary-l); + + --summary: var(--color-info); + --summary-hover: var(--color-success); + + --details-open: var(--color-content); + --details-content: var(--color-text); + --details-a: var(--color-primary-h); + --details-a-hover: var(--color-hover); + + --highlight-title: var(--color-danger); + --highlight-author: var(--color-warning); + + --article-summary-color: var(--color-text); + --article-summary-hover-color: var(--color-primary-s); + + --article-title-color: var(--color-primary); + --article-title-hover-color: var(--color-success); + + --accordion-content-rail-color: var(--color-warning); + --accordion-content-hover-rail-color: var(--color-warning); + --accordion-title-marker-color: var(--color-success); + --accordion-title-hover-marker-color: var(--color-success); + + --footer-color: var(--color-text); + --footer-link-hover-color: var(--color-hover); +} + +html { + font-size: var(--font-size-scaler); +} + +body { + background-color: var(--body-bg); + font-family: var(--font-family-default); + color: var(--body-color); + margin: 0; + padding-top: 16px; + display: grid; +} + +.header-container { + width: 90%; + max-width: 1200px; + background: var(--header-container); + margin: 0 auto; +} + +.header-title { + font-size: 32px; + font-weight: bold; + color: var(--header-title); + margin: 0; + padding-bottom: 14px; +} + +.header-title-preffix { + color: var(--header-title-preffix); +} + +.icons { + color: var(--icons); + padding-bottom: 16px; +} + +.icons a { + color: var(--icons); + text-decoration: none; +} + +.icons a:hover { + color: var(--icons-hover); +} + +.day-container { + padding: 16px 16px 16px 16px; + background: var(--day-container); + width: 90%; + max-width: 1200px; + margin: 0 auto; + margin-bottom: 8px; + border-radius: 10px; +} + +.date { + font-size: 24px; + font-weight: 700; + margin: 0; + 
color: var(--date); +} + +p { + margin: 0; +} + +summary { + font-weight: 600; + color: var(--summary); +} + +summary:hover { + text-decoration: underline; + cursor: pointer; + color: var(--summary-hover); +} + +details { + --border-color: transparent; + + padding: 2px 4px; + font-size: 20px; + border: 1px solid var(--border-color); + border-radius: 4px; +} + +details[open] { + background-color: var(--details-open); + margin-bottom: 8px; +} + +.details-content { + padding: 12px 3px; + gap: 16px; + color: var(--details-content); +} + +details a { + color: var(--details-a); +} + +details a:hover { + color: var(--details-a-hover); +} + +footer { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + justify-content: space-between; +} + +.description { + margin: 0 auto; + color: var(--footer-color); + font-size: var(--font-size-s); + display: flex; + padding: 0 16px; + text-align: center; +} + +.highlight-author { + color: var(--highlight-author); + font-weight: bold; +} + +.highlight-title { + color: var(--highlight-title); + font-weight: bold; +} + +.channel-description { + text-align: center; + font-size: var(--font-size-scaler); +} + +.article-summary-link { + color: var(--article-summary-color); + font-size: var(--font-size-s); + text-decoration: none; +} + +.article-summary-link:hover { + color: var(--article-summary-hover-color); + --accordion-content-rail-color: var(--accordion-content-hover-rail-color); +} + +.article-summary-box-outer { + display: block; + padding: 4px 8px 8px 4px; +} + +.article-summary-box-inner { + padding-left: 8px; + border-left: 1px solid var(--accordion-content-rail-color); + font-size: var(--font-size-m); +} + +.article-expander { + padding: 10px 4px; + border-radius: 4px; +} + +.article-authors { + font-size: var(--font-size-m); + padding: 0.25em 1em; +} + +.article-authors a { + text-decoration: none; +} + +.article-expander-title { + font-size: var(--font-size-m); + font-weight: 600; +} + +.article-expander-title:hover { + cursor: pointer; +} + +.article-expander-title::marker { + color: var(--accordion-title-marker-color); +} + +.article-expander-title:hover::marker { + color: var(--accordion-title-hover-marker-color); +} + +/* for switcher */ +.theme-switch { + display: inline-block; + position: relative; +} + +.theme-switch input { + display: none; +} + +/* chip */ +.chip { + font-size: 90%; + align-items: center; + color: var(--chip-font); + background: var(--chip-color); + border-radius: 5rem; + display: inline-flex; + padding: .2rem .4rem; + vertical-align: middle; +} \ No newline at end of file diff --git a/index.html b/index.html new file mode 100644 index 00000000..2ddcf8dd --- /dev/null +++ b/index.html @@ -0,0 +1,75948 @@ + + + + + MyArxiv + + + + + + + + + + + + + + + +
+
+
+
+ MyArxiv +
+
+ +
+ +
+
+
+ +
+
+ +
+
+
+ + Computation and Language 53 + +
+
+
+ + ☆ Large Language Models for Mathematicians + + +
+ Large language models (LLMs) such as ChatGPT have received immense interest
+for their general-purpose language understanding and, in particular, their
+ability to generate high-quality text or computer code. For many professions,
+LLMs represent an invaluable tool that can speed up and improve the quality of
+work. In this note, we discuss to what extent they can aid professional
+mathematicians. We first provide a mathematical description of the transformer
+model used in all modern language models. Based on recent studies, we then
+outline best practices and potential issues and report on the mathematical
+abilities of language models. Finally, we shed light on the potential of LLMs
+to change how mathematicians work.
+
+
+
+
+
+ + ☆ Improved Visual Grounding through Self-Consistent Explanations + + +
+ Vision-and-language models trained to match images with text can be combined +with visual explanation methods to point to the locations of specific objects +in an image. Our work shows that the localization --"grounding"-- abilities of +these models can be further improved by finetuning for self-consistent visual +explanations. We propose a strategy for augmenting existing text-image datasets +with paraphrases using a large language model, and SelfEQ, a weakly-supervised +strategy on visual explanation maps for paraphrases that encourages +self-consistency. Specifically, for an input textual phrase, we attempt to +generate a paraphrase and finetune the model so that the phrase and paraphrase +map to the same region in the image. We posit that this both expands the +vocabulary that the model is able to handle, and improves the quality of the +object locations highlighted by gradient-based visual explanation methods (e.g. +GradCAM). We demonstrate that SelfEQ improves performance on Flickr30k, +ReferIt, and RefCOCO+ over a strong baseline method and several prior works. +Particularly, comparing to other methods that do not use any type of box +annotations, we obtain 84.07% on Flickr30k (an absolute improvement of 4.69%), +67.40% on ReferIt (an absolute improvement of 7.68%), and 75.10%, 55.49% on +RefCOCO+ test sets A and B respectively (an absolute improvement of 3.74% on +average). + +
+
+ comment: Project Page: https://catherine-r-he.github.io/SelfEQ/ +
+
+
+
+
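+ A minimal, hypothetical sketch of the self-consistency idea described above,
+ assuming GradCAM-style saliency maps for a phrase and for its paraphrase; the
+ L1 objective and the map shapes are illustrative, not the paper's exact SelfEQ loss.
+
+     import numpy as np
+
+     def selfeq_style_loss(map_phrase, map_paraphrase):
+         # Encourage the saliency map for a phrase and the map for its paraphrase
+         # to point at the same image region (illustrative L1 distance on
+         # normalized maps).
+         a = map_phrase / (map_phrase.sum() + 1e-8)
+         b = map_paraphrase / (map_paraphrase.sum() + 1e-8)
+         return np.abs(a - b).mean()
+
+     m1 = np.random.rand(7, 7)   # stand-in GradCAM map for the original phrase
+     m2 = np.random.rand(7, 7)   # stand-in map for an LLM-generated paraphrase
+     print(selfeq_style_loss(m1, m2))
+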
+ + ☆ Efficient Monotonic Multihead Attention + + +
+ We introduce the Efficient Monotonic Multihead Attention (EMMA), a +state-of-the-art simultaneous translation model with numerically-stable and +unbiased monotonic alignment estimation. In addition, we present improved +training and inference strategies, including simultaneous fine-tuning from an +offline translation model and reduction of monotonic alignment variance. The +experimental results demonstrate that the proposed model attains +state-of-the-art performance in simultaneous speech-to-text translation on the +Spanish and English translation task. + +
+
+
+
+
+ + ☆ An LLM Compiler for Parallel Function Calling + + +
+ Large Language Models (LLMs) have shown remarkable results on various complex +reasoning benchmarks. The reasoning capabilities of LLMs enable them to execute +function calls, using user-provided functions to overcome their inherent +limitations, such as knowledge cutoffs, poor arithmetic skills, or lack of +access to private data. This development has expanded LLMs' scope to include +multi-function calling, where LLMs are equipped with a variety of functions and +select the proper functions based on the context. Multi-function calling +abilities of LLMs have catalyzed LLM-based software development, allowing them +to tackle more complex problems. However, current methods for multi-function +calling often require sequential reasoning and acting for each function which +can result in high latency, cost, and sometimes inaccurate behavior. To address +this, we introduce LLMCompiler, which executes functions in parallel to +efficiently orchestrate multi-function calling. Drawing from the principles of +classical compilers, LLMCompiler streamlines parallel function calling with +three components: (i) an LLM Planner, formulating execution strategies and +dependencies; (ii) a Task Fetching Unit, dispatching function calling tasks; +and (iii) an Executor, executing these tasks in parallel. LLMCompiler +automatically computes an optimized orchestration for the function calls and +can be used with open-source models such as LLaMA-2. We have benchmarked +LLMCompiler on a range of tasks including cases with non-trivial +inter-dependency between function calls, as well as cases that require dynamic +replanning based on intermediate results. We observe consistent latency speedup +of up to 3.7x, cost savings of up to 6.7x, and accuracy improvement of up to +~9% as compared to ReAct. Additionally, LLMCompiler achieves up to 1.35x +latency gain over OpenAI's recent parallel function calling, while achieving +similar accuracy. + +
+
+
+
+
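+ A hedged sketch of the dependency-aware parallel dispatch described above: a
+ planner-style task graph is executed so that independent tool calls run
+ concurrently. The task names, toy functions, and thread-pool executor are
+ illustrative assumptions, not the LLMCompiler implementation.
+
+     from concurrent.futures import ThreadPoolExecutor
+
+     results = {}
+
+     # A toy "plan" such as a planner might emit: each task lists its dependencies.
+     plan = {
+         "search_a": {"fn": lambda: 2 + 3,  "deps": []},
+         "search_b": {"fn": lambda: 10 * 4, "deps": []},
+         "combine":  {"fn": lambda: results["search_a"] + results["search_b"],
+                      "deps": ["search_a", "search_b"]},
+     }
+
+     def run_plan(plan):
+         done = set()
+         with ThreadPoolExecutor() as pool:
+             while len(done) < len(plan):
+                 # fetch every task whose dependencies are already satisfied
+                 ready = [name for name, task in plan.items()
+                          if name not in done and all(d in done for d in task["deps"])]
+                 futures = {name: pool.submit(plan[name]["fn"]) for name in ready}
+                 for name, fut in futures.items():
+                     results[name] = fut.result()
+                     done.add(name)
+         return results
+
+     print(run_plan(plan))  # search_a and search_b run concurrently; combine runs last
+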
+ + ☆ A Block Metropolis-Hastings Sampler for Controllable Energy-based Text + Generation + + +
+ Recent work has shown that energy-based language modeling is an effective +framework for controllable text generation because it enables flexible +integration of arbitrary discriminators. However, because energy-based LMs are +globally normalized, approximate techniques like Metropolis-Hastings (MH) are +required for inference. Past work has largely explored simple proposal +distributions that modify a single token at a time, like in Gibbs sampling. In +this paper, we develop a novel MH sampler that, in contrast, proposes re-writes +of the entire sequence in each step via iterative prompting of a large language +model. Our new sampler (a) allows for more efficient and accurate sampling from +a target distribution and (b) allows generation length to be determined through +the sampling procedure rather than fixed in advance, as past work has required. +We perform experiments on two controlled generation tasks, showing both +downstream performance gains and more accurate target distribution sampling in +comparison with single-token proposal techniques. + +
+
+
+
+
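+ A generic sketch of the Metropolis-Hastings accept/reject step that the block
+ sampler above builds on, with the entire sequence re-proposed at every step;
+ the toy energy and the shuffle proposal are stand-ins for the paper's
+ discriminator-based energy and LLM-prompted rewriter.
+
+     import math, random
+
+     def energy(seq):
+         # Toy energy: lower when characters are in sorted order (a stand-in for
+         # an energy combining an LM score with discriminator constraints).
+         return sum(1 for a, b in zip(seq, seq[1:]) if a > b)
+
+     def propose(seq):
+         # Toy block proposal: rewrite the *whole* sequence at once (a stand-in
+         # for re-writing the full text via iterative prompting of a large LM).
+         chars = list(seq)
+         random.shuffle(chars)
+         return "".join(chars)
+
+     def mh_sample(seq, steps=2000):
+         for _ in range(steps):
+             cand = propose(seq)
+             # Target p(x) ~ exp(-energy(x)); the shuffle proposal is symmetric,
+             # so the q(x|x')/q(x'|x) correction cancels (with an LM proposal it would not).
+             if random.random() < min(1.0, math.exp(energy(seq) - energy(cand))):
+                 seq = cand
+         return seq
+
+     print(mh_sample("sampler"))   # tends toward low-energy (sorted) arrangements
+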
+ + ☆ Chain of Code: Reasoning with a Language Model-Augmented Code Emulator + + +
+ Code provides a general syntactic structure to build complex programs and
+perform precise computations when paired with a code interpreter -- we
+hypothesize that language models (LMs) can leverage code-writing to improve
+Chain of Thought reasoning not only for logic and arithmetic tasks, but also
+for linguistic ones (and in particular, those that are a mix of both). For
+example, consider prompting an LM to write code that counts the number of times
+it detects sarcasm in an essay: the LM may struggle to write an implementation
+for "detect_sarcasm(string)" that can be executed by the interpreter (handling
+the edge cases would be insurmountable). However, LMs may still produce a valid
+solution if they are used not only to write the code, but also to selectively
+"emulate" the interpreter by generating the expected output of
+"detect_sarcasm(string)" and other lines of code (e.g., that the interpreter
+could not compile). In this work, we propose Chain of Code (CoC), a simple yet
+surprisingly effective extension that improves LM code-driven reasoning. The
+key idea is to encourage LMs to format linguistic sub-tasks in a program as
+flexible pseudocode that the compiler can explicitly catch undefined behaviors
+and hand off to simulate with an LM (as an "LMulator"). Experiments demonstrate
+that Chain of Code outperforms Chain of Thought and other baselines across a
+variety of benchmarks; on BIG-Bench Hard, Chain of Code achieves 84%, a gain of
+12% over Chain of Thought. CoC scales well with large and small models alike,
+and broadens the scope of reasoning questions that LMs can correctly answer by
+"thinking in code". Project webpage: https://chain-of-code.github.io/.
+
+
+
+
+
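+ A minimal sketch of the execute-or-emulate loop the abstract describes: each
+ generated line is run by the real Python interpreter when possible and handed
+ to an "LMulator" stub when execution fails; lm_emulate is a hypothetical
+ placeholder standing in for a language-model call.
+
+     def lm_emulate(line, state):
+         # Placeholder for asking an LM what effect `line` should have on the
+         # program state; here one answer is hard-coded for illustration.
+         if "detect_sarcasm" in line:
+             state["is_sarcastic"] = True
+
+     def run_chain_of_code(lines):
+         state = {}
+         for line in lines:
+             try:
+                 exec(line, {}, state)        # try the real interpreter first
+             except Exception:
+                 lm_emulate(line, state)      # fall back to the "LMulator"
+         return state
+
+     program = [
+         "essay = 'Oh great, another meeting.'",
+         "is_sarcastic = detect_sarcasm(essay)",   # undefined, so it gets emulated
+         "count = 1 if is_sarcastic else 0",
+     ]
+     print(run_chain_of_code(program))   # {'essay': ..., 'is_sarcastic': True, 'count': 1}
+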
+ + ☆ On the Learnability of Watermarks for Language Models + + +
+ Watermarking of language model outputs enables statistical detection of +model-generated text, which has many applications in the responsible deployment +of language models. Existing watermarking strategies operate by altering the +decoder of an existing language model, and the ability for a language model to +directly learn to generate the watermark would have significant implications +for the real-world deployment of watermarks. First, learned watermarks could be +used to build open models that naturally generate watermarked text, allowing +for open models to benefit from watermarking. Second, if watermarking is used +to determine the provenance of generated text, an adversary can hurt the +reputation of a victim model by spoofing its watermark and generating damaging +watermarked text. To investigate the learnability of watermarks, we propose +watermark distillation, which trains a student model to behave like a teacher +model that uses decoding-based watermarking. We test our approach on three +distinct decoding-based watermarking strategies and various hyperparameter +settings, finding that models can learn to generate watermarked text with high +detectability. We also find limitations to learnability, including the loss of +watermarking capabilities under fine-tuning on normal text and high sample +complexity when learning low-distortion watermarks. + +
+
+
+
+
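+ For context on the "decoding-based watermarking" and "detectability" mentioned
+ above, a hedged sketch of one widely used scheme in this family: a pseudorandom
+ green-list watermark detected with a z-score. The hash-based seeding below is
+ an assumption for illustration, not the paper's specific strategy.
+
+     import hashlib, math
+
+     def in_green_list(prev_token, token, gamma=0.5):
+         # Pseudorandom membership test seeded by the previous token; at
+         # generation time, green-list logits would be boosted by a small bias.
+         h = hashlib.sha256(f"{prev_token}:{token}".encode()).hexdigest()
+         return (int(h, 16) % 1000) < gamma * 1000
+
+     def detection_z_score(tokens, gamma=0.5):
+         # Watermarked text should contain far more green tokens than the
+         # gamma * T expected without a watermark; z counts the excess in
+         # standard deviations.
+         hits = sum(in_green_list(p, t, gamma) for p, t in zip(tokens, tokens[1:]))
+         T = len(tokens) - 1
+         return (hits - gamma * T) / math.sqrt(gamma * (1 - gamma) * T)
+
+     print(detection_z_score([3, 17, 42, 8, 99, 5, 23, 64, 12, 7]))
+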
+ + ☆ Fortify the Shortest Stave in Attention: Enhancing Context Awareness of + Large Language Models for Effective Tool Use + + +
+ Recent advancements in large language models (LLMs) have significantly +expanded their functionality and skills as tool agents. In this paper, we argue +that a waveform pattern in the model's attention allocation has an impact on +the tool use performance, which degrades when the position of essential +information hits the trough zone. To address this issue, we propose a novel +inference method named Attention Buckets. This approach enables LLMs to handle +context by conducting parallel processes, each featuring a unique RoPE angle +base that shapes the attention waveform. Attention Buckets ensures that an +attention trough of a particular process can be compensated with an attention +peak of another run, reducing the risk of the LLM missing essential information +residing within the attention trough. Our extensive experiments on the widely +recognized tool use benchmark demonstrate the efficacy of our approach, where a +7B-parameter open-source model enhanced by Attention Buckets achieves SOTA +performance on par with GPT-4. + +
+
+
+
+
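+ To make the "unique RoPE angle base" per parallel process concrete, a small
+ sketch of rotary position embeddings with a configurable base; running
+ otherwise identical forward passes that differ only in this base, and then
+ combining their predictions, is the assumed reading of the approach above.
+
+     import numpy as np
+
+     def rope(x, positions, base=10000.0):
+         # x: (seq_len, dim) with even dim; rotate feature pairs by
+         # position-dependent angles theta_i = base ** (-2i / dim).
+         seq_len, dim = x.shape
+         inv_freq = base ** (-np.arange(0, dim, 2) / dim)          # (dim/2,)
+         angles = np.outer(positions, inv_freq)                    # (seq_len, dim/2)
+         cos, sin = np.cos(angles), np.sin(angles)
+         x1, x2 = x[:, 0::2], x[:, 1::2]
+         out = np.empty_like(x)
+         out[:, 0::2] = x1 * cos - x2 * sin
+         out[:, 1::2] = x1 * sin + x2 * cos
+         return out
+
+     x = np.random.randn(8, 16)
+     pos = np.arange(8)
+     # Each parallel "bucket" uses a different angle base, so the attention
+     # waveforms of the runs peak in different position ranges.
+     buckets = [rope(x, pos, base=b) for b in (10000.0, 15000.0, 20000.0)]
+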
+ + ☆ OpenAsp: A Benchmark for Multi-document Open Aspect-based Summarization EMNLP 2023 + + +
+ The performance of automatic summarization models has improved dramatically +in recent years. Yet, there is still a gap in meeting specific information +needs of users in real-world scenarios, particularly when a targeted summary is +sought, such as in the useful aspect-based summarization setting targeted in +this paper. Previous datasets and studies for this setting have predominantly +concentrated on a limited set of pre-defined aspects, focused solely on single +document inputs, or relied on synthetic data. To advance research on more +realistic scenarios, we introduce OpenAsp, a benchmark for multi-document +\textit{open} aspect-based summarization. This benchmark is created using a +novel and cost-effective annotation protocol, by which an open aspect dataset +is derived from existing generic multi-document summarization datasets. We +analyze the properties of OpenAsp showcasing its high-quality content. Further, +we show that the realistic open-aspect setting realized in OpenAsp poses a +challenge for current state-of-the-art summarization models, as well as for +large language models. + +
+
+ comment: EMNLP 2023 +
+
+
+
+
+ + ☆ LaMPilot: An Open Benchmark Dataset for Autonomous Driving with Language + Model Programs + + +
+ We present LaMPilot, a novel framework for planning in the field of +autonomous driving, rethinking the task as a code-generation process that +leverages established behavioral primitives. This approach aims to address the +challenge of interpreting and executing spontaneous user instructions such as +"overtake the car ahead," which have typically posed difficulties for existing +frameworks. We introduce the LaMPilot benchmark specifically designed to +quantitatively evaluate the efficacy of Large Language Models (LLMs) in +translating human directives into actionable driving policies. We then evaluate +a wide range of state-of-the-art code generation language models on tasks from +the LaMPilot Benchmark. The results of the experiments showed that GPT-4, with +human feedback, achieved an impressive task completion rate of 92.7% and a +minimal collision rate of 0.9%. To encourage further investigation in this +area, our code and dataset will be made available. + +
+
+
+
+
+ + ☆ PCoQA: Persian Conversational Question Answering Dataset + + +
+ Humans seek information regarding a specific topic through performing a +conversation containing a series of questions and answers. In the pursuit of +conversational question answering research, we introduce the PCoQA, the first +\textbf{P}ersian \textbf{Co}nversational \textbf{Q}uestion \textbf{A}nswering +dataset, a resource comprising information-seeking dialogs encompassing a total +of 9,026 contextually-driven questions. Each dialog involves a questioner, a +responder, and a document from the Wikipedia; The questioner asks several +inter-connected questions from the text and the responder provides a span of +the document as the answer for each question. PCoQA is designed to present +novel challenges compared to previous question answering datasets including +having more open-ended non-factual answers, longer answers, and fewer lexical +overlaps. This paper not only presents the comprehensive PCoQA dataset but also +reports the performance of various benchmark models. Our models include +baseline models and pre-trained models, which are leveraged to boost the +performance of the model. The dataset and benchmarks are available at our +Github page. + +
+
+
+
+
+ + ☆ CLadder: A Benchmark to Assess Causal Reasoning Capabilities of Language + Models NeurIPS 2023 + + +
+ The ability to perform causal reasoning is widely considered a core feature +of intelligence. In this work, we investigate whether large language models +(LLMs) can coherently reason about causality. Much of the existing work in +natural language processing (NLP) focuses on evaluating commonsense causal +reasoning in LLMs, thus failing to assess whether a model can perform causal +inference in accordance with a set of well-defined formal rules. To address +this, we propose a new NLP task, causal inference in natural language, inspired +by the "causal inference engine" postulated by Judea Pearl et al. We compose a +large dataset, CLadder, with 10K samples: based on a collection of causal +graphs and queries (associational, interventional, and counterfactual), we +obtain symbolic questions and ground-truth answers, through an oracle causal +inference engine. These are then translated into natural language. We evaluate +multiple LLMs on our dataset, and we introduce and evaluate a bespoke +chain-of-thought prompting strategy, CausalCoT. We show that our task is highly +challenging for LLMs, and we conduct an in-depth analysis to gain deeper +insight into the causal reasoning abilities of LLMs. Our data is open-sourced +at https://huggingface.co/datasets/causalNLP/cladder, and our code can be found +at https://github.com/causalNLP/cladder. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ☆ When Input Integers are Given in the Unary Numeral Representation + + +
+ Many NP-complete problems take integers as part of their input instances. +These input integers are generally binarized, that is, provided in the form of +the "binary" numeral representation, and the lengths of such binary forms are +used as a basis unit to measure the computational complexity of the problems. +In sharp contrast, the "unarization" (or the "unary" numeral representation) of +numbers has been known to bring a remarkably different effect onto the +computational complexity of the problems. When no computational-complexity +difference is observed between binarization and unarization of instances, on +the contrary, the problems are said to be strong NP-complete. This work +attempts to spotlight an issue of how the unarization of instances affects the +computational complexity of various combinatorial problems. We present numerous +NP-complete (or even NP-hard) problems, which turn out to be easily solvable +when input integers are represented in unary. We then discuss the computational +complexities of such problems when taking unary-form integer inputs. We hope +that a list of such problems signifies the structural differences between +strong NP-completeness and non-strong NP-completeness. + +
+
+ comment: (A4, 10pt, 12 pages, 1 figure) This is a preliminary report of the + current work, which has appeared in the Proceedings of the 24th Italian + Conference on Theoretical Computer Science (ICTCS 2023), Palermo, Italy, + September 13--15, 2023, CEUR Workshop Proceedings (CEUR-WS.org) +
+
+
+
+
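+ A concrete instance of the effect discussed above: Subset Sum is NP-complete
+ when its integers are written in binary, yet the classic dynamic program below
+ takes O(n * target) steps, which is polynomial in the input length once the
+ target is written in unary. The example numbers are illustrative.
+
+     def subset_sum(values, target):
+         # Classic pseudo-polynomial DP: reachable[s] is True iff some subset sums to s.
+         reachable = [False] * (target + 1)
+         reachable[0] = True
+         for v in values:
+             for s in range(target, v - 1, -1):
+                 if reachable[s - v]:
+                     reachable[s] = True
+         return reachable[target]
+
+     # O(len(values) * target) steps: exponential in the binary length of `target`,
+     # but polynomial when `target` is given in unary (its encoding already has
+     # about `target` symbols).
+     print(subset_sum([3, 34, 4, 12, 5, 2], 9))   # True (4 + 5)
+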
+ + ☆ Enhancing Medical Task Performance in GPT-4V: A Comprehensive Study on + Prompt Engineering Strategies + + +
+ OpenAI's latest large vision-language model (LVLM), GPT-4V(ision), has piqued +considerable interest for its potential in medical applications. Despite its +promise, recent studies and internal reviews highlight its underperformance in +specialized medical tasks. This paper explores the boundary of GPT-4V's +capabilities in medicine, particularly in processing complex imaging data from +endoscopies, CT scans, and MRIs etc. Leveraging open-source datasets, we +assessed its foundational competencies, identifying substantial areas for +enhancement. Our research emphasizes prompt engineering, an often-underutilized +strategy for improving AI responsiveness. Through iterative testing, we refined +the model's prompts, significantly improving its interpretative accuracy and +relevance in medical imaging. From our comprehensive evaluations, we distilled +10 effective prompt engineering techniques, each fortifying GPT-4V's medical +acumen. These methodical enhancements facilitate more reliable, precise, and +clinically valuable insights from GPT-4V, advancing its operability in critical +healthcare environments. Our findings are pivotal for those employing AI in +medicine, providing clear, actionable guidance on harnessing GPT-4V's full +diagnostic potential. + +
+
+
+
+
+ + ☆ Merging by Matching Models in Task Subspaces + + +
+ Model merging aims to cheaply combine individual task-specific models into a +single multitask model. In this work, we view past merging methods as +leveraging different notions of a ''task subspace'' in which models are matched +before being merged. We connect the task subspace of a given model to its loss +landscape and formalize how this approach to model merging can be seen as +solving a linear system of equations. While past work has generally been +limited to linear systems that have a closed-form solution, we consider using +the conjugate gradient method to find a solution. We show that using the +conjugate gradient method can outperform closed-form solutions, enables merging +via linear systems that are otherwise intractable to solve, and flexibly allows +choosing from a wide variety of initializations and estimates for the ''task +subspace''. We ultimately demonstrate that our merging framework called +''Matching Models in their Task Subspace'' (MaTS) achieves state-of-the-art +results in multitask and intermediate-task model merging. We release all of the +code and checkpoints used in our work at https://github.com/r-three/mats. + +
+
+
+
+
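+ A hedged sketch of the linear-system view described above: a merged parameter
+ vector is obtained by solving (A1 + A2) theta = A1 theta1 + A2 theta2 with the
+ conjugate gradient method. The diagonal matrices standing in for each model's
+ "task subspace" are synthetic, and this is an illustration, not the MaTS code.
+
+     import numpy as np
+
+     def conjugate_gradient(matvec, b, iters=50, tol=1e-10):
+         # Solve A x = b for symmetric positive-definite A given only x -> A @ x.
+         x = np.zeros_like(b)
+         r = b - matvec(x)
+         p = r.copy()
+         rs = r @ r
+         for _ in range(iters):
+             Ap = matvec(p)
+             alpha = rs / (p @ Ap)
+             x += alpha * p
+             r -= alpha * Ap
+             rs_new = r @ r
+             if np.sqrt(rs_new) < tol:
+                 break
+             p = r + (rs_new / rs) * p
+             rs = rs_new
+         return x
+
+     # Two "task models" and diagonal matrices standing in for their task subspaces.
+     theta1, theta2 = np.array([1.0, 2.0, 3.0]), np.array([3.0, 0.0, 1.0])
+     A1, A2 = np.diag([2.0, 1.0, 1.0]), np.diag([1.0, 1.0, 4.0])
+
+     A_sum = A1 + A2
+     b = A1 @ theta1 + A2 @ theta2
+     merged = conjugate_gradient(lambda v: A_sum @ v, b)
+     print(merged)   # a weighted compromise between theta1 and theta2
+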
+ + ☆ Beyond Surface: Probing LLaMA Across Scales and Layers + + +
+ This paper presents an in-depth analysis of Large Language Models (LLMs), +focusing on LLaMA, a prominent open-source foundational model in natural +language processing. Instead of assessing LLaMA through its generative output, +we design multiple-choice tasks to probe its intrinsic understanding in +high-order tasks such as reasoning and computation. We examine the model +horizontally, comparing different sizes, and vertically, assessing different +layers. We unveil several key and uncommon findings based on the designed +probing tasks: (1) Horizontally, enlarging model sizes almost could not +automatically impart additional knowledge or computational prowess. Instead, it +can enhance reasoning abilities, especially in math problem solving, and helps +reduce hallucinations, but only beyond certain size thresholds; (2) In vertical +analysis, the lower layers of LLaMA lack substantial arithmetic and factual +knowledge, showcasing logical thinking, multilingual and recognitive abilities, +with top layers housing most computational power and real-world knowledge. + +
+
+ comment: 15 pages +
+
+
+
+
+ + ☆ nerblackbox: A High-level Library for Named Entity Recognition in Python + + +
+ We present nerblackbox, a python library to facilitate the use of +state-of-the-art transformer-based models for named entity recognition. It +provides simple-to-use yet powerful methods to access data and models from a +wide range of sources, for fully automated model training and evaluation as +well as versatile model inference. While many technical challenges are solved +and hidden from the user by default, nerblackbox also offers fine-grained +control and a rich set of customizable features. It is thus targeted both at +application-oriented developers as well as machine learning experts and +researchers. + +
+
+ comment: Published in Proceedings of the 3rd Workshop for Natural Language + Processing Open Source Software (NLP-OSS 2023) +
+
+
+
+
+ + ☆ Prompt Highlighter: Interactive Control for Multi-Modal LLMs + + +
+ This study targets a critical aspect of multi-modal LLMs' (LLMs&VLMs) +inference: explicit controllable text generation. Multi-modal LLMs empower +multi-modality understanding with the capability of semantic generation yet +bring less explainability and heavier reliance on prompt contents due to their +autoregressive generative nature. While manipulating prompt formats could +improve outputs, designing specific and precise prompts per task can be +challenging and ineffective. To tackle this issue, we introduce a novel +inference method, Prompt Highlighter, which enables users to highlight specific +prompt spans to interactively control the focus during generation. Motivated by +the classifier-free diffusion guidance, we form regular and unconditional +context pairs based on highlighted tokens, demonstrating that the +autoregressive generation in models can be guided in a classifier-free way. +Notably, we find that, during inference, guiding the models with highlighted +tokens through the attention weights leads to more desired outputs. Our +approach is compatible with current LLMs and VLMs, achieving impressive +customized generation results without training. Experiments confirm its +effectiveness in focusing on input contexts and generating reliable content. +Without tuning on LLaVA-v1.5, our method secured 69.5 in the MMBench test and +1552.5 in MME-perception. The code is available at: +https://github.com/dvlab-research/Prompt-Highlighter/ + +
+
+ comment: 19 pages. Project Page: + https://julianjuaner.github.io/projects/PromptHighlighter +
+
+
+
+
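+ The classifier-free-guidance-style combination referenced above, sketched on
+ raw next-token logits: a regular pass and an "unconditional" pass (with the
+ highlighted span attenuated) are mixed with a guidance weight. The attenuation
+ mechanism, example numbers, and weight are assumptions for illustration.
+
+     import numpy as np
+
+     def softmax(z):
+         z = z - z.max()
+         e = np.exp(z)
+         return e / e.sum()
+
+     def guided_distribution(cond_logits, uncond_logits, guidance=1.5):
+         # Push the prediction toward what the highlighted (conditional) context
+         # adds on top of the unconditional pass.
+         mixed = uncond_logits + guidance * (cond_logits - uncond_logits)
+         return softmax(mixed)
+
+     cond = np.array([2.0, 0.5, 0.1])    # pass with the highlighted span emphasized
+     uncond = np.array([1.0, 0.9, 0.4])  # pass with the highlighted span attenuated
+     print(guided_distribution(cond, uncond))
+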
+ + ☆ PsyChat: A Client-Centric Dialogue System for Mental Health Support + + +
+ Dialogue systems are increasingly integrated into mental health support to +help clients facilitate exploration, gain insight, take action, and ultimately +heal themselves. For a dialogue system to be practical and user-friendly, it +should be client-centric, focusing on the client's behaviors. However, existing +dialogue systems publicly available for mental health support often concentrate +solely on the counselor's strategies rather than the behaviors expressed by +clients. This can lead to the implementation of unreasonable or inappropriate +counseling strategies and corresponding responses from the dialogue system. To +address this issue, we propose PsyChat, a client-centric dialogue system that +provides psychological support through online chat. The client-centric dialogue +system comprises five modules: client behavior recognition, counselor strategy +selection, input packer, response generator intentionally fine-tuned to produce +responses, and response selection. Both automatic and human evaluations +demonstrate the effectiveness and practicality of our proposed dialogue system +for real-life mental health support. Furthermore, we employ our proposed +dialogue system to simulate a real-world client-virtual-counselor interaction +scenario. The system is capable of predicting the client's behaviors, selecting +appropriate counselor strategies, and generating accurate and suitable +responses, as demonstrated in the scenario. + +
+
+
+
+
+ + ☆ Swap distance minimization in SOV languages. Cognitive and mathematical + foundations + + +
+ Distance minimization is a general principle of language. A special case of
+this principle in the domain of word order is swap distance minimization. This
+principle predicts that variations from a canonical order that are reached by
+fewer swaps of adjacent constituents are less costly and thus more likely. Here
+we investigate the principle in the context of the triple formed by subject
+(S), object (O) and verb (V). We introduce the concept of word order rotation
+as a cognitive underpinning of that prediction. When the canonical order of a
+language is SOV, the principle predicts SOV < SVO, OSV < VSO, OVS < VOS, in
+order of increasing cognitive cost. We test the prediction in three flexible
+order SOV languages: Korean (Koreanic), Malayalam (Dravidian), and Sinhalese
+(Indo-European). Evidence of swap distance minimization is found in all three
+languages, but it is weaker in Sinhalese. Swap distance minimization is
+stronger than a preference for the canonical order in Korean and especially
+Malayalam.
+
+
+
+
+
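+ The predicted ordering above can be reproduced by counting the adjacent swaps
+ (inversions) needed to reach each word order from canonical SOV; a small sketch:
+
+     def swap_distance(order, canonical="SOV"):
+         # Number of adjacent transpositions needed to turn `canonical` into
+         # `order`, i.e. the number of inversions of `order` read as a
+         # permutation of `canonical`.
+         rank = {c: i for i, c in enumerate(canonical)}
+         perm = [rank[c] for c in order]
+         return sum(1 for i in range(len(perm)) for j in range(i + 1, len(perm))
+                    if perm[i] > perm[j])
+
+     for order in ["SOV", "SVO", "OSV", "VSO", "OVS", "VOS"]:
+         print(order, swap_distance(order))
+     # SOV 0 < SVO 1, OSV 1 < VSO 2, OVS 2 < VOS 3 (the predicted cost ranking)
+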
+ + ☆ Language Model Knowledge Distillation for Efficient Question Answering + in Spanish + + +
+ Recent advances in the development of pre-trained Spanish language models
+have led to significant progress in many Natural Language Processing (NLP)
+tasks, such as question answering. However, the lack of efficient models
+imposes a barrier to the adoption of such models in resource-constrained
+environments. Therefore, smaller distilled models for the Spanish language
+could prove highly scalable and facilitate their adoption on a variety of
+tasks and scenarios. In this work, we take one step in this direction by
+developing SpanishTinyRoBERTa, a compressed language model based on RoBERTa
+for efficient question answering in Spanish. To achieve this, we employ
+knowledge distillation from a large model onto a lighter model that allows
+for a wider implementation, even in areas with limited computational
+resources, with a negligible sacrifice in performance. Our experiments show
+that the dense distilled model can still preserve the performance of its
+larger counterpart, while offering a significant inference speedup. This work
+serves as a starting point for further research and investigation of model
+compression efforts for Spanish language models across various NLP tasks.
+
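+ For readers unfamiliar with the technique, knowledge distillation of this
+kind usually trains the student to match the teacher's softened output
+distribution. The following is only a generic sketch; the loss weights and
+temperature are assumptions, not the paper's recipe.
+
+import torch
+import torch.nn.functional as F
+
+def distillation_loss(student_logits, teacher_logits, labels, T=2.0, alpha=0.5):
+    """Blend hard-label cross-entropy with a soft KL term at temperature T."""
+    soft = F.kl_div(
+        F.log_softmax(student_logits / T, dim=-1),
+        F.softmax(teacher_logits / T, dim=-1),
+        reduction="batchmean",
+    ) * (T * T)
+    hard = F.cross_entropy(student_logits, labels)
+    return alpha * soft + (1 - alpha) * hard
+
+student = torch.randn(4, 10, requires_grad=True)  # toy QA-head logits
+teacher = torch.randn(4, 10)
+labels = torch.randint(0, 10, (4,))
+print(distillation_loss(student, teacher, labels).item())
+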
+
+ comment: 6 pages, 2 tables +
+
+
+
+
+ + ☆ Using a Large Language Model to generate a Design Structure Matrix + + +
+ The Design Structure Matrix (DSM) is an established method used in
+dependency modelling, especially in the design of complex engineering
+systems. The generation of DSM is traditionally carried out through manual
+means and can involve interviewing experts to elicit critical system elements
+and the relationships between them. Such manual approaches can be
+time-consuming and costly. This paper presents a workflow that uses a Large
+Language Model (LLM) to support the generation of DSM and improve
+productivity. A prototype of the workflow was developed in this work and
+applied to a previously published diesel engine DSM. It was found that the
+prototype could reproduce 357 of the 462 published DSM entries (i.e. 77.3%),
+suggesting that the work can aid DSM generation. A no-code version of the
+prototype is made available online to support future research.
+
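+ To make the headline number concrete: a DSM is a square binary dependency
+matrix over system elements, and 357/462 is simply the share of reference
+entries the workflow reproduced. The toy check below is illustrative only and
+is not the paper's code.
+
+import numpy as np
+
+def dsm_agreement(reference: np.ndarray, generated: np.ndarray) -> float:
+    """Share of dependency entries in the reference DSM that also appear in
+    the generated DSM (off-diagonal cells only)."""
+    off_diag = ~np.eye(reference.shape[0], dtype=bool)
+    ref = reference[off_diag].astype(bool)
+    gen = generated[off_diag].astype(bool)
+    return float((ref & gen).sum() / ref.sum())
+
+ref = np.zeros((4, 4), dtype=int)
+ref[0, 1] = ref[1, 2] = ref[2, 3] = ref[3, 0] = 1   # four reference dependencies
+gen = ref.copy()
+gen[3, 0] = 0                                       # the generated DSM misses one
+print(f"{dsm_agreement(ref, gen):.1%}")  # 75.0%, cf. the reported 357/462 = 77.3%
+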
+
+ comment: 16 pages, 7 Figures, 6 Tables +
+
+
+
+
+ + ☆ Analyzing the Inherent Response Tendency of LLMs: Real-World + Instructions-Driven Jailbreak + + +
+ Extensive work has been devoted to improving the safety mechanism of Large
+Language Models (LLMs). However, in specific scenarios, LLMs still generate
+harmful responses when faced with malicious instructions, a phenomenon
+referred to as "Jailbreak Attack". In our research, we introduce a novel
+jailbreak attack method (\textbf{RADIAL}), which consists of two steps: 1)
+Inherent Response Tendency Analysis: we analyze the inherent affirmation and
+rejection tendency of LLMs to react to real-world instructions. 2) Real-World
+Instructions-Driven Jailbreak: based on our analysis, we strategically choose
+several real-world instructions and embed malicious instructions into them to
+amplify the LLM's potential to generate harmful responses. On three
+open-source human-aligned LLMs, our method achieves excellent jailbreak
+attack performance for both Chinese and English malicious instructions. In
+addition, we conducted detailed ablation experiments that verify the
+effectiveness of our core idea "Inherent Response Tendency Analysis". Our
+exploration also exposes the vulnerability of LLMs to being induced into
+generating more detailed harmful responses in subsequent rounds of dialogue.
+
+
+
+
+
+ + ☆ Enhancing the Rationale-Input Alignment for Self-explaining + Rationalization ICDE 2024 + + +
+ Rationalization empowers deep learning models with self-explaining +capabilities through a cooperative game, where a generator selects a +semantically consistent subset of the input as a rationale, and a subsequent +predictor makes predictions based on the selected rationale. In this paper, we +discover that rationalization is prone to a problem named \emph{rationale +shift}, which arises from the algorithmic bias of the cooperative game. +Rationale shift refers to a situation where the semantics of the selected +rationale may deviate from the original input, but the predictor still produces +accurate predictions based on the deviation, resulting in a compromised +generator with misleading feedback. + To address this issue, we first demonstrate the importance of the alignment +between the rationale and the full input through both empirical observations +and theoretical analysis. Subsequently, we introduce a novel approach called +DAR (\textbf{D}iscriminatively \textbf{A}ligned \textbf{R}ationalization), +which utilizes an auxiliary module pretrained on the full input to +discriminatively align the selected rationale and the original input. We +theoretically illustrate how DAR accomplishes the desired alignment, thereby +overcoming the rationale shift problem. The experiments on two widely used +real-world benchmarks show that the proposed method significantly improves the +explanation quality (measured by the overlap between the model-selected +explanation and the human-annotated rationale) as compared to state-of-the-art +techniques. Additionally, results on two synthetic settings further validate +the effectiveness of DAR in addressing the rationale shift problem. + +
+
+ comment: Accept at ICDE 2024 +
+
+
+
+
+ + ☆ Making Translators Privacy-aware on the User's Side + + +
+ We propose PRISM to enable users of machine translation systems to preserve
+the privacy of data on their own initiative. There is a growing demand to
+apply machine translation systems to data that require privacy protection.
+While several machine translation engines claim to prioritize privacy, the
+extent and specifics of such protection are largely ambiguous. First, there
+is often a lack of clarity on how and to what degree the data is protected.
+Even if service providers believe they have sufficient safeguards in place,
+sophisticated adversaries might still extract sensitive information. Second,
+vulnerabilities may exist outside of these protective measures, such as
+within communication channels, potentially leading to data leakage. As a
+result, users are hesitant to utilize machine translation engines for data
+demanding high levels of privacy protection, thereby missing out on their
+benefits. PRISM resolves this problem. Instead of relying on the translation
+service to keep data safe, PRISM provides the means to protect data on the
+user's side. This approach ensures that even machine translation engines with
+inadequate privacy measures can be used securely. For platforms already
+equipped with privacy safeguards, PRISM acts as an additional protection
+layer, further reinforcing their security. PRISM adds these privacy features
+without significantly compromising translation accuracy. Our experiments
+demonstrate the effectiveness of PRISM using real-world translators, T5 and
+ChatGPT (GPT-3.5-turbo), and datasets in two languages. PRISM effectively
+balances privacy protection with translation accuracy.
+
+
+
+
+
+ + ☆ Comparing Large Language Model AI and Human-Generated Coaching Messages + for Behavioral Weight Loss + + +
+ Automated coaching messages for weight control can save time and costs, but
+their repetitive, generic nature may limit their effectiveness compared to
+human coaching. Large language model (LLM) based artificial intelligence (AI)
+chatbots, like ChatGPT, could offer more personalized and novel messages to
+address repetition with their data-processing abilities. While LLM AI
+demonstrates promise to encourage healthier lifestyles, studies have yet to
+examine the feasibility and acceptability of LLM-based behavioral weight loss
+(BWL) coaching. 87 adults in a weight-loss trial rated ten coaching messages'
+helpfulness (five human-written, five ChatGPT-generated) using a 5-point
+Likert scale, providing additional open-ended feedback to justify their
+ratings. Participants also identified which messages they believed were
+AI-generated. The evaluation occurred in two phases: messages in Phase 1 were
+perceived as impersonal and negative, prompting revisions for Phase 2
+messages. In Phase 1, AI-generated messages were rated less helpful than
+human-written ones, with 66% receiving a helpfulness rating of 3 or higher.
+However, in Phase 2, the AI messages matched the human-written ones in
+helpfulness, with 82% scoring 3 or above. Additionally, 50% were
+misidentified as human-written, suggesting AI's sophistication in mimicking
+human-generated content. A thematic analysis of open-ended feedback revealed
+that participants appreciated AI's empathy and personalized suggestions but
+found the messages more formulaic, less authentic, and too data-focused. This
+study reveals the preliminary feasibility and acceptability of LLM AIs, like
+ChatGPT, in crafting potentially effective weight control coaching messages.
+Our findings also underscore areas for future enhancement.
+
+
+ comment: 29 pages, 5 figures +
+
+
+
+
+ + ☆ Multimodal Misinformation Detection in a South African Social Media + Environment + + +
+ With the constant spread of misinformation on social media networks, a need
+has arisen to continuously assess the veracity of digital content. This need
+has inspired numerous research efforts on the development of misinformation
+detection (MD) models. However, many models do not use all information
+available to them, and existing research lacks relevant datasets for training
+the models, specifically within the South African social media environment.
+The aim of this paper is to investigate the transferability of knowledge of
+an MD model between different contextual environments. This research
+contributes a multimodal MD model capable of functioning in the South African
+social media environment, and introduces a South African misinformation
+dataset. The model makes use of multiple sources of information for
+misinformation detection, namely: textual and visual elements. It uses
+bidirectional encoder representations from transformers (BERT) as the textual
+encoder and a residual network (ResNet) as the visual encoder. The model is
+trained and evaluated on the Fakeddit dataset and a South African
+misinformation dataset. Results show that using South African samples in the
+training of the model increases model performance, in a South African
+contextual environment, and that a multimodal model retains significantly
+more knowledge than both the textual and visual unimodal models. Our study
+suggests that the performance of a misinformation detection model is
+influenced by the cultural nuances of its operating environment and that
+multimodal models assist in the transferability of knowledge between
+different contextual environments. Therefore, local data should be
+incorporated into the training process of a misinformation detection model in
+order to optimize model performance.
+
+
+ comment: Artificial Intelligence Research. SACAIR 2023 +
+
+
+
+
+ + ☆ RoAST: Robustifying Language Models via Adversarial Perturbation with + Selective Training EMNLP 2023 + + +
+ Fine-tuning pre-trained language models (LMs) has become the de facto
+standard in many NLP tasks. Nevertheless, fine-tuned LMs are still prone to
+robustness issues, such as adversarial robustness and model calibration.
+Several perspectives of LM robustness have been studied independently, but a
+unified consideration across multiple perspectives has been lacking. In this
+paper, we propose Robustifying LMs via Adversarial perturbation with
+Selective Training (RoAST), a simple yet effective fine-tuning technique to
+enhance the multi-perspective robustness of LMs in a unified way. RoAST
+effectively incorporates two important sources of model robustness:
+robustness to perturbed inputs and the generalizable knowledge in pre-trained
+LMs. To be specific, RoAST introduces adversarial perturbation during
+fine-tuning while the model parameters are selectively updated according to
+their relative importance to minimize unnecessary deviation. Under a unified
+evaluation of fine-tuned LMs incorporating four representative perspectives
+of model robustness, we demonstrate the effectiveness of RoAST compared to
+state-of-the-art fine-tuning methods on six different types of LMs, which
+indicates its usefulness in practice.
+
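+ As a rough, illustrative sketch of the selective-update idea (not the
+authors' code): score each parameter's relative importance, here by the
+assumed proxy |grad * weight|, and apply the gradient step only to the
+top-scoring coordinates so that the rest stay close to the pre-trained
+values.
+
+import torch
+
+def selective_step(params, lr=1e-3, keep_ratio=0.5):
+    """Plain SGD step applied only to the most 'important' coordinates."""
+    with torch.no_grad():
+        for p in params:
+            if p.grad is None:
+                continue
+            score = (p.grad * p).abs().flatten()
+            k = max(1, int(keep_ratio * score.numel()))
+            threshold = score.kthvalue(score.numel() - k + 1).values
+            mask = (score >= threshold).reshape(p.shape).float()
+            p -= lr * p.grad * mask  # unimportant coordinates are left untouched
+
+w = torch.randn(4, 4, requires_grad=True)
+loss = (w ** 2).sum()
+loss.backward()
+selective_step([w])
+print(w.grad.abs().mean().item())
+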
+
+ comment: 33 pages, accepted at EMNLP 2023 Findings +
+
+
+
+
+ + ☆ A Study on the Calibration of In-context Learning NeurIPS 2023 + + +
+ Modern auto-regressive language models are trained to minimize log loss on
+broad data by predicting the next token, so they are expected to give
+calibrated answers when a problem is framed as a next-token prediction task.
+We study this for in-context learning (ICL), a widely used way to adapt
+frozen large language models (LLMs) via crafting prompts, and investigate the
+trade-offs between performance and calibration on a wide range of natural
+language understanding and reasoning tasks. We conduct extensive experiments
+to show that such trade-offs may get worse as we increase model size,
+incorporate more ICL examples, and fine-tune models using instruction,
+dialog, or reinforcement learning from human feedback (RLHF) on carefully
+curated datasets. Furthermore, we find that common recalibration techniques
+such as temperature scaling, which are widely effective elsewhere, provide
+limited gains in calibration errors, suggesting that new methods may be
+required for settings where models are expected to be reliable.
+
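+ The calibration in question is typically summarized with the expected
+calibration error (ECE); a small, generic implementation, not tied to the
+paper's exact protocol:
+
+import numpy as np
+
+def expected_calibration_error(confidences, correct, n_bins=10):
+    """Average |accuracy - confidence| over equal-width confidence bins,
+    weighted by the fraction of predictions falling in each bin."""
+    confidences = np.asarray(confidences, dtype=float)
+    correct = np.asarray(correct, dtype=float)
+    edges = np.linspace(0.0, 1.0, n_bins + 1)
+    ece = 0.0
+    for lo, hi in zip(edges[:-1], edges[1:]):
+        in_bin = (confidences > lo) & (confidences <= hi)
+        if in_bin.any():
+            gap = abs(correct[in_bin].mean() - confidences[in_bin].mean())
+            ece += in_bin.mean() * gap
+    return ece
+
+conf = [0.9, 0.8, 0.95, 0.6, 0.55]   # model confidence in its answer
+hits = [1, 1, 0, 1, 0]               # whether the answer was correct
+print(round(expected_calibration_error(conf, hits), 3))
+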
+
+ comment: Spotlight Talk at NeurIPS 2023 Workshop on Failure Modes in the Age + of Foundation Models +
+
+
+
+
+ + ☆ Cost-Effective In-Context Learning for Entity Resolution: A Design Space + Exploration + + +
+ Entity resolution (ER) is an important data integration task with a wide
+spectrum of applications. The state-of-the-art solutions on ER rely on
+pre-trained language models (PLMs), which require fine-tuning on a lot of
+labeled matching/non-matching entity pairs. Recently, large language models
+(LLMs), such as GPT-4, have shown the ability to perform many tasks without
+tuning model parameters, a capability known as in-context learning (ICL),
+which facilitates effective learning from a few labeled demonstrations in the
+input context. However, existing ICL approaches to ER typically necessitate
+providing a task description and a set of demonstrations for each entity pair
+and thus incur a high monetary cost when interfacing with LLMs. To address
+the problem, in this paper, we provide a comprehensive study to investigate
+how to develop a cost-effective batch prompting approach to ER. We introduce
+a framework BATCHER consisting of demonstration selection and question
+batching and explore different design choices that support batch prompting
+for ER. We also devise a covering-based demonstration selection strategy that
+achieves an effective balance between matching accuracy and monetary cost. We
+conduct a thorough evaluation to explore the design space and evaluate our
+proposed strategies. Through extensive experiments, we find that batch
+prompting is very cost-effective for ER, compared with not only PLM-based
+methods fine-tuned with extensive labeled data but also LLM-based methods
+with manually designed prompting. We also provide guidance for selecting
+appropriate design choices for batch prompting.
+
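+ The cost saving of batch prompting comes from amortizing one task
+description and one set of demonstrations over many entity pairs. A schematic
+prompt builder follows; the template wording is an assumption, not BATCHER's.
+
+def build_batch_prompt(task_description, demonstrations, entity_pairs):
+    """Build one prompt sharing instructions and demos across a batch of
+    entity-pair questions, instead of one prompt per pair."""
+    lines = [task_description, ""]
+    for left, right, label in demonstrations:
+        lines += [f"Record A: {left}", f"Record B: {right}", f"Match: {label}", ""]
+    for i, (left, right) in enumerate(entity_pairs, 1):
+        lines += [f"Q{i}. Record A: {left}", f"    Record B: {right}"]
+    lines.append("Answer each question with Yes or No.")
+    return "\n".join(lines)
+
+demos = [("iPhone 13 128GB", "Apple iPhone 13 (128 GB)", "Yes")]
+pairs = [("Canon EOS R6", "Canon EOS R6 Mark II"),
+         ("ThinkPad X1 Carbon Gen 9", "Lenovo ThinkPad X1 Carbon G9")]
+print(build_batch_prompt(
+    "Decide whether the two product records refer to the same entity.",
+    demos, pairs))
+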
+
+ comment: 14 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ Can Large Language Models Transform Computational Social Science? + + +
+ Large Language Models (LLMs) are capable of successfully performing many
+language processing tasks zero-shot (without training data). If zero-shot
+LLMs can also reliably classify and explain social phenomena like
+persuasiveness and political ideology, then LLMs could augment the
+Computational Social Science (CSS) pipeline in important ways. This work
+provides a road map for using LLMs as CSS tools. Towards this end, we
+contribute a set of prompting best practices and an extensive evaluation
+pipeline to measure the zero-shot performance of 13 language models on 25
+representative English CSS benchmarks. On taxonomic labeling tasks
+(classification), LLMs fail to outperform the best fine-tuned models but
+still achieve fair levels of agreement with humans. On free-form coding tasks
+(generation), LLMs produce explanations that often exceed the quality of
+crowdworkers' gold references. We conclude that the performance of today's
+LLMs can augment the CSS research pipeline in two ways: (1) serving as
+zero-shot data annotators on human annotation teams, and (2) bootstrapping
+challenging creative generation tasks (e.g., explaining the underlying
+attributes of a text). In summary, LLMs are poised to meaningfully
+participate in social science analysis in partnership with humans.
+
+
+ comment: To appear in "Computational Linguistics" (CL) +
+
+
+
+
+ + ♻ ☆ A Stability Analysis of Fine-Tuning a Pre-Trained Model + + +
+ Fine-tuning a pre-trained model (such as BERT, ALBERT, RoBERTa, T5, GPT, +etc.) has proven to be one of the most promising paradigms in recent NLP +research. However, numerous recent works indicate that fine-tuning suffers from +the instability problem, i.e., tuning the same model under the same setting +results in significantly different performance. Many recent works have proposed +different methods to solve this problem, but there is no theoretical +understanding of why and how these methods work. In this paper, we propose a +novel theoretical stability analysis of fine-tuning that focuses on two +commonly used settings, namely, full fine-tuning and head tuning. We define the +stability under each setting and prove the corresponding stability bounds. The +theoretical bounds explain why and how several existing methods can stabilize +the fine-tuning procedure. In addition to being able to explain most of the +observed empirical discoveries, our proposed theoretical analysis framework can +also help in the design of effective and provable methods. Based on our theory, +we propose three novel strategies to stabilize the fine-tuning procedure, +namely, Maximal Margin Regularizer (MMR), Multi-Head Loss (MHLoss), and Self +Unsupervised Re-Training (SURT). We extensively evaluate our proposed +approaches on 11 widely used real-world benchmark datasets, as well as hundreds +of synthetic classification datasets. The experiment results show that our +proposed methods significantly stabilize the fine-tuning procedure and also +corroborate our theoretical analysis. + +
+
+
+
+
+ + ♻ ☆ Conversational Semantic Parsing using Dynamic Context Graphs + + +
+ In this paper we consider the task of conversational semantic parsing over +general purpose knowledge graphs (KGs) with millions of entities, and thousands +of relation-types. We focus on models which are capable of interactively +mapping user utterances into executable logical forms (e.g., Sparql) in the +context of the conversational history. Our key idea is to represent information +about an utterance and its context via a subgraph which is created dynamically, +i.e., the number of nodes varies per utterance. Rather than treating the +subgraph as a sequence, we exploit its underlying structure and encode it with +a graph neural network which further allows us to represent a large number of +(unseen) nodes. Experimental results show that dynamic context modeling is +superior to static approaches, delivering performance improvements across the +board (i.e., for simple and complex questions). Our results further confirm +that modeling the structure of context is better at processing discourse +information, (i.e., at handling ellipsis and resolving coreference) and longer +interactions. + +
+
+ comment: camera ready +
+
+
+
+
+ + ♻ ☆ LLMs for Science: Usage for Code Generation and Data Analysis + + +
+ Large language models (LLMs) have been touted to enable increased +productivity in many areas of today's work life. Scientific research as an area +of work is no exception: the potential of LLM-based tools to assist in the +daily work of scientists has become a highly discussed topic across +disciplines. However, we are only at the very onset of this subject of study. +It is still unclear how the potential of LLMs will materialise in research +practice. With this study, we give first empirical evidence on the use of LLMs +in the research process. We have investigated a set of use cases for LLM-based +tools in scientific research, and conducted a first study to assess to which +degree current tools are helpful. In this paper we report specifically on use +cases related to software engineering, such as generating application code and +developing scripts for data analytics. While we studied seemingly simple use +cases, results across tools differ significantly. Our results highlight the +promise of LLM-based tools in general, yet we also observe various issues, +particularly regarding the integrity of the output these tools provide. + +
+
+ comment: Preprint; In Submission +
+
+
+
+
+ + ♻ ☆ Sudden Drops in the Loss: Syntax Acquisition, Phase Transitions, and + Simplicity Bias in MLMs + + +
+ Most interpretability research in NLP focuses on understanding the behavior +and features of a fully trained model. However, certain insights into model +behavior may only be accessible by observing the trajectory of the training +process. We present a case study of syntax acquisition in masked language +models (MLMs) that demonstrates how analyzing the evolution of interpretable +artifacts throughout training deepens our understanding of emergent behavior. +In particular, we study Syntactic Attention Structure (SAS), a naturally +emerging property of MLMs wherein specific Transformer heads tend to focus on +specific syntactic relations. We identify a brief window in pretraining when +models abruptly acquire SAS, concurrent with a steep drop in loss. This +breakthrough precipitates the subsequent acquisition of linguistic +capabilities. We then examine the causal role of SAS by manipulating SAS during +training, and demonstrate that SAS is necessary for the development of +grammatical capabilities. We further find that SAS competes with other +beneficial traits during training, and that briefly suppressing SAS improves +model quality. These findings offer an interpretation of a real-world example +of both simplicity bias and breakthrough training dynamics. + +
+
+
+
+
+ + ♻ ☆ Navigating News Narratives: A Media Bias Analysis Dataset + + +
+ The proliferation of biased news narratives across various media platforms +has become a prominent challenge, influencing public opinion on critical topics +like politics, health, and climate change. This paper introduces the +"Navigating News Narratives: A Media Bias Analysis Dataset", a comprehensive +dataset to address the urgent need for tools to detect and analyze media bias. +This dataset encompasses a broad spectrum of biases, making it a unique and +valuable asset in the field of media studies and artificial intelligence. The +dataset is available at +https://huggingface.co/datasets/newsmediabias/news-bias-full-data. + +
+
+
+
+
+ + ♻ ☆ PromptAgent: Strategic Planning with Language Models Enables + Expert-level Prompt Optimization + + +
+ Highly effective, task-specific prompts are often heavily engineered by
+experts to integrate detailed instructions and domain insights based on a
+deep understanding of both the instincts of large language models (LLMs) and
+the intricacies of the target task. However, automating the generation of
+such expert-level prompts remains elusive. Existing prompt optimization
+methods tend to overlook the depth of domain knowledge and struggle to
+efficiently explore the vast space of expert-level prompts. Addressing this,
+we present PromptAgent, an optimization method that autonomously crafts
+prompts equivalent in quality to those handcrafted by experts. At its core,
+PromptAgent views prompt optimization as a strategic planning problem and
+employs a principled planning algorithm, rooted in Monte Carlo tree search,
+to strategically navigate the expert-level prompt space. Inspired by
+human-like trial-and-error exploration, PromptAgent induces precise
+expert-level insights and in-depth instructions by reflecting on model errors
+and generating constructive error feedback. Such a novel framework allows the
+agent to iteratively examine intermediate prompts (states), refine them based
+on error feedback (actions), simulate future rewards, and search for
+high-reward paths leading to expert prompts. We apply PromptAgent to 12 tasks
+spanning three practical domains: BIG-Bench Hard (BBH), as well as
+domain-specific and general NLP tasks, showing it significantly outperforms
+strong Chain-of-Thought and recent prompt optimization baselines. Extensive
+analyses emphasize its capability to craft expert-level, detailed, and
+domain-insightful prompts with great efficiency and generalizability.
+
+
+ comment: 34 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ Post Hoc Explanations of Language Models Can Improve Language Models + + +
+ Large Language Models (LLMs) have demonstrated remarkable capabilities in +performing complex tasks. Moreover, recent research has shown that +incorporating human-annotated rationales (e.g., Chain-of-Thought prompting) +during in-context learning can significantly enhance the performance of these +models, particularly on tasks that require reasoning capabilities. However, +incorporating such rationales poses challenges in terms of scalability as this +requires a high degree of human involvement. In this work, we present a novel +framework, Amplifying Model Performance by Leveraging In-Context Learning with +Post Hoc Explanations (AMPLIFY), which addresses the aforementioned challenges +by automating the process of rationale generation. To this end, we leverage +post hoc explanation methods which output attribution scores (explanations) +capturing the influence of each of the input features on model predictions. +More specifically, we construct automated natural language rationales that +embed insights from post hoc explanations to provide corrective signals to +LLMs. Extensive experimentation with real-world datasets demonstrates that our +framework, AMPLIFY, leads to prediction accuracy improvements of about 10-25% +over a wide range of tasks, including those where prior approaches which rely +on human-annotated rationales such as Chain-of-Thought prompting fall short. +Our work makes one of the first attempts at highlighting the potential of post +hoc explanations as valuable tools for enhancing the effectiveness of LLMs. +Furthermore, we conduct additional empirical analyses and ablation studies to +demonstrate the impact of each of the components of AMPLIFY, which, in turn, +leads to critical insights for refining in-context learning. + +
+
+
+
+
+ + ♻ ☆ Trends in Integration of Knowledge and Large Language Models: A Survey + and Taxonomy of Methods, Benchmarks, and Applications + + +
+ Large language models (LLMs) exhibit superior performance on various natural +language tasks, but they are susceptible to issues stemming from outdated data +and domain-specific limitations. In order to address these challenges, +researchers have pursued two primary strategies, knowledge editing and +retrieval augmentation, to enhance LLMs by incorporating external information +from different aspects. Nevertheless, there is still a notable absence of a +comprehensive survey. In this paper, we propose a review to discuss the trends +in integration of knowledge and large language models, including taxonomy of +methods, benchmarks, and applications. In addition, we conduct an in-depth +analysis of different methods and point out potential research directions in +the future. We hope this survey offers the community quick access and a +comprehensive overview of this research area, with the intention of inspiring +future research endeavors. + +
+
+ comment: Work in progress; 22 pages. This work has been submitted to the IEEE + for possible publication. Copyright may be transferred without notice, after + which this version may no longer be accessible +
+
+
+
+
+ + ♻ ☆ Efficient LLM Inference on CPUs NeurIPS'2023 + + +
+ Large language models (LLMs) have demonstrated remarkable performance and
+tremendous potential across a wide range of tasks. However, deploying these
+models has been challenging due to the astronomical number of model
+parameters, which demands large memory capacity and high memory bandwidth. In
+this paper, we propose an effective approach that can make the deployment of
+LLMs more efficient. We support an automatic INT4 weight-only quantization
+flow and design a special LLM runtime with highly-optimized kernels to
+accelerate LLM inference on CPUs. We demonstrate the general applicability of
+our approach on popular LLMs including Llama2, Llama, and GPT-NeoX, and
+showcase the extreme inference efficiency on CPUs. The code is publicly
+available at: https://github.com/intel/intel-extension-for-transformers.
+
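+ For background, weight-only INT4 quantization of the kind mentioned above
+stores 4-bit integers plus one scale per small group of weights and
+dequantizes them on the fly. A toy NumPy version, not the library's actual
+kernels:
+
+import numpy as np
+
+def quantize_int4(w, group_size=32):
+    """Symmetric group-wise INT4 quantization of a flat weight vector."""
+    groups = w.reshape(-1, group_size)
+    scale = np.abs(groups).max(axis=1, keepdims=True) / 7.0  # int4 range -8..7
+    q = np.clip(np.round(groups / scale), -8, 7).astype(np.int8)
+    return q, scale
+
+def dequantize_int4(q, scale):
+    return (q.astype(np.float32) * scale).reshape(-1)
+
+w = np.random.randn(128).astype(np.float32)
+q, s = quantize_int4(w)
+print("max abs error:", float(np.abs(w - dequantize_int4(q, s)).max()))
+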
+
+ comment: NeurIPS'2023 on Efficient Natural Language and Speech Processing +
+
+
+
+
+ + ♻ ☆ PROMISE: A Framework for Model-Driven Stateful Prompt Orchestration + + +
+ The advent of increasingly powerful language models has raised expectations +for language-based interactions. However, controlling these models is a +challenge, emphasizing the need to be able to investigate the feasibility and +value of their application. We present PROMISE, a framework that facilitates +the development of complex language-based interactions with information +systems. Its use of state machine modeling concepts enables model-driven, +dynamic prompt orchestration across hierarchically nested states and +transitions. This improves the control of the behavior of language models and +thus enables their effective and efficient use. We show the benefits of PROMISE +in the context of application scenarios within health information systems and +demonstrate its ability to handle complex interactions. + +
+
+ comment: Minor revision regards wording +
+
+
+
+
+ + ♻ ☆ Holmes: Towards Distributed Training Across Clusters with Heterogeneous + NIC Environment + + +
+ Large language models (LLMs) such as GPT-3, OPT, and LLaMA have demonstrated +remarkable accuracy in a wide range of tasks. However, training these models +can incur significant expenses, often requiring tens of thousands of GPUs for +months of continuous operation. Typically, this training is carried out in +specialized GPU clusters equipped with homogeneous high-speed Remote Direct +Memory Access (RDMA) network interface cards (NICs). The acquisition and +maintenance of such dedicated clusters is challenging. Current LLM training +frameworks, like Megatron-LM and Megatron-DeepSpeed, focus primarily on +optimizing training within homogeneous cluster settings. In this paper, we +introduce Holmes, a training framework for LLMs that employs thoughtfully +crafted data and model parallelism strategies over the heterogeneous NIC +environment. Our primary technical contribution lies in a novel scheduling +method that intelligently allocates distinct computational tasklets in LLM +training to specific groups of GPU devices based on the characteristics of +their connected NICs. Furthermore, our proposed framework, utilizing pipeline +parallel techniques, demonstrates scalability to multiple GPU clusters, even in +scenarios without high-speed interconnects between nodes in distinct clusters. +We conducted comprehensive experiments that involved various scenarios in the +heterogeneous NIC environment. In most cases, our framework achieves +performance levels close to those achievable with homogeneous RDMA-capable +networks (InfiniBand or RoCE), significantly exceeding training efficiency +within the pure Ethernet environment. Additionally, we verified that our +framework outperforms other mainstream LLM frameworks under heterogeneous NIC +environment in terms of training efficiency and can be seamlessly integrated +with them. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ♻ ☆ Generative Judge for Evaluating Alignment + + +
+ The rapid development of Large Language Models (LLMs) has substantially +expanded the range of tasks they can address. In the field of Natural Language +Processing (NLP), researchers have shifted their focus from conventional NLP +tasks (e.g., sequence tagging and parsing) towards tasks that revolve around +aligning with human needs (e.g., brainstorming and email writing). This shift +in task distribution imposes new requirements on evaluating these aligned +models regarding generality (i.e., assessing performance across diverse +scenarios), flexibility (i.e., examining under different protocols), and +interpretability (i.e., scrutinizing models with explanations). In this paper, +we propose a generative judge with 13B parameters, Auto-J, designed to address +these challenges. Our model is trained on user queries and LLM-generated +responses under massive real-world scenarios and accommodates diverse +evaluation protocols (e.g., pairwise response comparison and single-response +evaluation) with well-structured natural language critiques. To demonstrate the +efficacy of our approach, we construct a new testbed covering 58 different +scenarios. Experimentally, Auto-J outperforms a series of strong competitors, +including both open-source and closed-source models, by a large margin. We also +provide detailed analysis and case studies to further reveal the potential of +our method and make a variety of resources public at +https://github.com/GAIR-NLP/auto-j. + +
+
+ comment: Fix typos in Table 1 +
+
+
+
+
+ + ♻ ☆ Constrained Few-Shot Learning: Human-Like Low Sample Complexity Learning + and Non-Episodic Text Classification + + +
+ Few-shot learning (FSL) is an emergent paradigm of learning that attempts to
+learn to reason with low sample complexity to mimic the way humans learn,
+generalise and extrapolate from only a few seen examples. While FSL attempts
+to mimic these human characteristics, fundamentally, the task of FSL as
+conventionally formulated using meta-learning with episodic-based training
+does not in actuality align with how humans acquire and reason with
+knowledge. FSL with episodic training, while requiring only $K$ instances of
+each test class, still requires a large number of labelled training instances
+from disjoint classes. In this paper, we introduce the novel task of
+constrained few-shot learning (CFSL), a special case of FSL where $M$, the
+number of instances of each training class, is constrained such that $M \leq
+K$, thus applying a similar restriction during FSL training and testing. We
+propose a method for CFSL leveraging Cat2Vec using a novel categorical
+contrastive loss inspired by cognitive theories such as fuzzy trace theory
+and prototype theory.
+
+
+ comment: Add additional references; update various sections for clarity
+
+
+
+
+ + ♻ ☆ Harnessing the Power of Large Language Models for Empathetic Response + Generation: Empirical Investigations and Improvements EMNLP 2023 + + +
+ Empathetic dialogue is an indispensable part of building harmonious social
+relationships and contributes to the development of a helpful AI. Previous
+approaches are mainly based on fine-tuning small-scale language models. With
+the advent of ChatGPT, the application effect of large language models (LLMs)
+in this field has attracted great attention. This work empirically
+investigates the performance of LLMs in generating empathetic responses and
+proposes three improvement methods: semantically similar in-context learning,
+two-stage interactive generation, and combination with a knowledge base.
+Extensive experiments show that LLMs can significantly benefit from our
+proposed methods and are able to achieve state-of-the-art performance in both
+automatic and human evaluations. Additionally, we explore the possibility of
+GPT-4 simulating human evaluators.
+
+
+ comment: the Findings of EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ MAUVE Scores for Generative Models: Theory and Practice + + +
+ Generative artificial intelligence has made significant strides, producing +text indistinguishable from human prose and remarkably photorealistic images. +Automatically measuring how close the generated data distribution is to the +target distribution is central to diagnosing existing models and developing +better ones. We present MAUVE, a family of comparison measures between pairs of +distributions such as those encountered in the generative modeling of text or +images. These scores are statistical summaries of divergence frontiers +capturing two types of errors in generative modeling. We explore three +approaches to statistically estimate these scores: vector quantization, +non-parametric estimation, and classifier-based estimation. We provide +statistical bounds for the vector quantization approach. + Empirically, we find that the proposed scores paired with a range of +$f$-divergences and statistical estimation methods can quantify the gaps +between the distributions of human-written text and those of modern neural +language models by correlating with human judgments and identifying known +properties of the generated texts. We demonstrate in the vision domain that +MAUVE can identify known properties of generated images on par with or better +than existing metrics. In conclusion, we present practical recommendations for +using MAUVE effectively with language and image modalities. + +
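+ To give a concrete sense of the divergence-frontier construction behind
+these scores, the sketch below evaluates scaled KL divergences against
+mixtures of two discrete (e.g., vector-quantized) distributions. It is a
+didactic toy, not the released MAUVE implementation.
+
+import numpy as np
+
+def scaled_kl(a, b, c=5.0):
+    """exp(-c * KL(a || b)) for discrete distributions with matching support."""
+    m = a > 0
+    return float(np.exp(-c * np.sum(a[m] * np.log(a[m] / b[m]))))
+
+def divergence_frontier(p, q, n_mix=25, c=5.0):
+    """Frontier points (exp(-c*KL(Q,R)), exp(-c*KL(P,R))) for mixtures R of P
+    and Q; MAUVE-style scores summarize such a frontier."""
+    points = []
+    for lam in np.linspace(0.0, 1.0, n_mix):
+        r = lam * p + (1.0 - lam) * q
+        points.append((scaled_kl(q, r, c), scaled_kl(p, r, c)))
+    return points
+
+human = np.array([0.4, 0.3, 0.2, 0.1])      # e.g., cluster frequencies of human text
+model = np.array([0.25, 0.25, 0.25, 0.25])  # cluster frequencies of model samples
+pts = divergence_frontier(human, model)
+print(min(x for x, _ in pts), min(y for _, y in pts))
+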
+
+ comment: Published in Journal of Machine Learning Research +
+
+
+
+
+ + ♻ ☆ Interactive Text-to-SQL Generation via Editable Step-by-Step + Explanations + + +
+ Relational databases play an important role in this Big Data era. However,
+it is challenging for non-experts to fully unleash the analytical power of
+relational databases, since they are not familiar with database languages
+such as SQL. Many techniques have been proposed to automatically generate SQL
+from natural language, but they suffer from two issues: (1) they still make
+many mistakes, particularly for complex queries, and (2) they do not provide
+a flexible way for non-expert users to validate and refine incorrect queries.
+To address these issues, we introduce a new interaction mechanism that allows
+users to directly edit a step-by-step explanation of an incorrect SQL query
+to fix SQL errors. Experiments on the Spider benchmark show that our approach
+outperforms three SOTA approaches by at least 31.6% in terms of execution
+accuracy. A user study with 24 participants further shows that our approach
+helped users solve significantly more SQL tasks with less time and higher
+confidence, demonstrating its potential to expand access to databases,
+particularly for non-experts.
+
+
+ comment: ARR AE score of 4 +
+
+
+
+
+ + ♻ ☆ Large Language Models as Optimizers + + +
+ Optimization is ubiquitous. While derivative-based algorithms have been +powerful tools for various problems, the absence of gradient imposes challenges +on many real-world applications. In this work, we propose Optimization by +PROmpting (OPRO), a simple and effective approach to leverage large language +models (LLMs) as optimizers, where the optimization task is described in +natural language. In each optimization step, the LLM generates new solutions +from the prompt that contains previously generated solutions with their values, +then the new solutions are evaluated and added to the prompt for the next +optimization step. We first showcase OPRO on linear regression and traveling +salesman problems, then move on to prompt optimization where the goal is to +find instructions that maximize the task accuracy. With a variety of LLMs, we +demonstrate that the best prompts optimized by OPRO outperform human-designed +prompts by up to 8% on GSM8K, and by up to 50% on Big-Bench Hard tasks. Code at +https://github.com/google-deepmind/opro. + +
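+ The optimization-by-prompting loop itself is easy to sketch; here both the
+LLM proposal call and the scoring function are stubbed out with placeholders,
+so this is only a schematic of the procedure described above.
+
+import random
+
+def llm_propose(scored_history):
+    """Stub for an LLM call that reads (solution, score) pairs from the
+    meta-prompt and returns a new candidate instruction."""
+    return "Let's think step by step. Variant " + str(random.randint(0, 10 ** 6))
+
+def score(instruction):
+    """Stub for task accuracy of a prompt; a real run evaluates on a dev set."""
+    return random.random()
+
+def opro(steps=8, keep=4):
+    history = [("Solve the problem.", score("Solve the problem."))]
+    for _ in range(steps):
+        top = sorted(history, key=lambda x: x[1])[-keep:]  # best solutions so far
+        candidate = llm_propose(top)
+        history.append((candidate, score(candidate)))      # evaluate, add to prompt
+    return max(history, key=lambda x: x[1])
+
+print(opro())
+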
+
+ comment: 42 pages, 26 figures, 15 tables. Code at + https://github.com/google-deepmind/opro +
+
+
+
+
+ + ♻ ☆ Mark My Words: Analyzing and Evaluating Language Model Watermarks + + +
+ The capabilities of large language models have grown significantly in recent
+years, and so too have concerns about their misuse. In this context, the
+ability to distinguish machine-generated text from human-authored content
+becomes important. Prior works have proposed numerous schemes to watermark
+text, which would benefit from a systematic evaluation framework. This work
+focuses on text watermarking techniques - as opposed to image watermarks -
+and proposes MARKMYWORDS, a comprehensive benchmark covering different tasks
+as well as practical attacks. We focus on three main metrics: quality, size
+(e.g. the number of tokens needed to detect a watermark), and
+tamper-resistance. Current watermarking techniques are good enough to be
+deployed: Kirchenbauer et al. [1] can watermark Llama2-7B-chat with no
+perceivable loss in quality, the watermark can be detected with fewer than
+100 tokens, and the scheme offers good tamper-resistance to simple attacks.
+We argue that watermark indistinguishability, a criterion emphasized in some
+prior works, is too strong a requirement: schemes that slightly modify logit
+distributions outperform their indistinguishable counterparts with no
+noticeable loss in generation quality. We publicly release our benchmark
+(https://github.com/wagner-group/MarkMyWords)
+
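+ For context, the Kirchenbauer et al. scheme cited above is a 'green-list'
+watermark: generation softly favors a pseudo-random subset of the vocabulary,
+and detection checks whether the fraction of green tokens is improbably high.
+A simplified detector, illustrative only and not the benchmark's code:
+
+import hashlib
+import math
+
+GREEN_FRACTION = 0.5
+
+def is_green(prev_token: int, token: int) -> bool:
+    """Pseudo-randomly assign token to the green list, seeded by the previous
+    token (a simplification of the published scheme)."""
+    digest = hashlib.sha256(f"{prev_token}:{token}".encode()).digest()
+    return digest[0] < 256 * GREEN_FRACTION
+
+def detection_z_score(tokens):
+    """z-score of the observed green count against the null hypothesis that
+    tokens are green with probability GREEN_FRACTION."""
+    n = len(tokens) - 1
+    greens = sum(is_green(a, b) for a, b in zip(tokens, tokens[1:]))
+    return (greens - GREEN_FRACTION * n) / math.sqrt(
+        n * GREEN_FRACTION * (1 - GREEN_FRACTION))
+
+print(detection_z_score(list(range(200))))  # unwatermarked tokens: z near 0
+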
+
+ comment: 18 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ UPAR: A Kantian-Inspired Prompting Framework for Enhancing Large + Language Model Capabilities + + +
+ Large Language Models (LLMs) have demonstrated impressive inferential +capabilities, with numerous research endeavors devoted to enhancing this +capacity through prompting. Despite these efforts, a unified epistemological +foundation is still conspicuously absent. Drawing inspiration from Kant's a +priori philosophy, we propose the UPAR prompting framework, designed to emulate +the structure of human cognition within LLMs. The UPAR framework is delineated +into four phases: "Understand", "Plan", "Act", and "Reflect", enabling the +extraction of structured information from complex contexts, prior planning of +solutions, execution according to plan, and self-reflection. This structure +significantly augments the explainability and accuracy of LLM inference, +producing a human-understandable and inspectable inferential trajectory. +Furthermore, our work offers an epistemological foundation for existing +prompting techniques, allowing for a possible systematic integration of these +methods. With GPT-4, our approach elevates the accuracy from COT baseline of +22.92% to 58.33% in a challenging subset of GSM8K, and from 67.91% to 75.40% in +the causal judgment task. Without using few-shot examples or external tools, +UPAR significantly outperforms existing prompting methods on SCIBENCH, a +challenging dataset containing collegiate-level mathematics, chemistry, and +physics scientific problems. + +
+
+
+
+
+ + ♻ ☆ ZeroNLG: Aligning and Autoencoding Domains for Zero-Shot Multimodal and + Multilingual Natural Language Generation + + +
+ Natural Language Generation (NLG) accepts input data in the form of images, +videos, or text and generates corresponding natural language text as output. +Existing NLG methods mainly adopt a supervised approach and rely heavily on +coupled data-to-text pairs. However, for many targeted scenarios and for +non-English languages, sufficient quantities of labeled data are often not +available. To relax the dependency on labeled data of downstream tasks, we +propose an intuitive and effective zero-shot learning framework, ZeroNLG, which +can deal with multiple NLG tasks, including image-to-text (image captioning), +video-to-text (video captioning), and text-to-text (neural machine +translation), across English, Chinese, German, and French within a unified +framework. ZeroNLG does not require any labeled downstream pairs for training. +During training, ZeroNLG (i) projects different domains (across modalities and +languages) to corresponding coordinates in a shared common latent space; (ii) +bridges different domains by aligning their corresponding coordinates in this +space; and (iii) builds an unsupervised multilingual auto-encoder to learn to +generate text by reconstructing the input text given its coordinate in shared +latent space. Consequently, during inference, based on the data-to-text +pipeline, ZeroNLG can generate target sentences across different languages +given the coordinate of input data in the common space. Within this unified +framework, given visual (imaging or video) data as input, ZeroNLG can perform +zero-shot visual captioning; given textual sentences as input, ZeroNLG can +perform zero-shot machine translation. We present the results of extensive +experiments on twelve NLG tasks, showing that, without using any labeled +downstream pairs for training, ZeroNLG generates high-quality and believable +outputs and significantly outperforms existing zero-shot methods. + +
+
+ comment: Our code and data are available at + https://github.com/yangbang18/ZeroNLG +
+
+
+
+
+ + ♻ ☆ Deep Learning for Hate Speech Detection: A Comparative Study + + +
+ Automated hate speech detection is an important tool in combating the spread +of hate speech, particularly in social media. Numerous methods have been +developed for the task, including a recent proliferation of deep-learning based +approaches. A variety of datasets have also been developed, exemplifying +various manifestations of the hate-speech detection problem. We present here a +large-scale empirical comparison of deep and shallow hate-speech detection +methods, mediated through the three most commonly used datasets. Our goal is to +illuminate progress in the area, and identify strengths and weaknesses in the +current state-of-the-art. We particularly focus our analysis on measures of +practical performance, including detection accuracy, computational efficiency, +capability in using pre-trained models, and domain generalization. In doing so +we aim to provide guidance as to the use of hate-speech detection in practice, +quantify the state-of-the-art, and identify future research directions. Code +and dataset are available at +https://github.com/jmjmalik22/Hate-Speech-Detection. + +
+
+ comment: 18 pages, 4 figures, and 6 tables +
+
+
+
+
+ + ♻ ☆ WhisBERT: Multimodal Text-Audio Language Modeling on 100M Words CoNLL 2023 + + +
+ Training on multiple modalities of input can augment the capabilities of a +language model. Here, we ask whether such a training regime can improve the +quality and efficiency of these systems as well. We focus on text--audio and +introduce Whisbert, which is inspired by the text--image approach of FLAVA +(Singh et al., 2022). In accordance with Babylm guidelines (Warstadt et al., +2023), we pretrain Whisbert on a dataset comprising only 100 million words plus +their corresponding speech from the word-aligned version of the People's Speech +dataset (Galvez et al., 2021). To assess the impact of multimodality, we +compare versions of the model that are trained on text only and on both audio +and text simultaneously. We find that while Whisbert is able to perform well on +multimodal masked modeling and surpasses the Babylm baselines in most benchmark +tasks, it struggles to optimize its complex objective and outperform its +text-only Whisbert baseline. + +
+
+ comment: Published at the BabyLM Challenge, a shared task co-sponsored by CMCL + 2023 and CoNLL 2023, hosted by EMNLP 2023 +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 150 + +
+
+
+ + ☆ Scaling Laws of Synthetic Images for Model Training ... for Now + + +
+ Recent significant advances in text-to-image models unlock the possibility of +training vision systems using synthetic images, potentially overcoming the +difficulty of collecting curated data at scale. It is unclear, however, how +these models behave at scale, as more synthetic data is added to the training +set. In this paper we study the scaling laws of synthetic images generated by +state of the art text-to-image models, for the training of supervised models: +image classifiers with label supervision, and CLIP with language supervision. +We identify several factors, including text prompts, classifier-free guidance +scale, and types of text-to-image models, that significantly affect scaling +behavior. After tuning these factors, we observe that synthetic images +demonstrate a scaling trend similar to, but slightly less effective than, real +images in CLIP training, while they significantly underperform in scaling when +training supervised image classifiers. Our analysis indicates that the main +reason for this underperformance is the inability of off-the-shelf +text-to-image models to generate certain concepts, a limitation that +significantly impairs the training of image classifiers. Our findings also +suggest that scaling synthetic data can be particularly effective in scenarios +such as: (1) when there is a limited supply of real images for a supervised +problem (e.g., fewer than 0.5 million images in ImageNet), (2) when the +evaluation dataset diverges significantly from the training data, indicating +the out-of-distribution scenario, or (3) when synthetic data is used in +conjunction with real images, as demonstrated in the training of CLIP models. + +
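+ Scaling behavior of this kind is usually summarized by fitting a power law,
+i.e. a straight line in log-log space, to error versus training-set size. The
+numbers below are made up purely to illustrate the fitting methodology.
+
+import numpy as np
+
+# Hypothetical (dataset size, error) pairs -- not the paper's measurements.
+sizes = np.array([1e5, 3e5, 1e6, 3e6, 1e7])
+errors = np.array([0.52, 0.44, 0.37, 0.32, 0.27])
+
+# Fit log(error) = slope * log(N) + intercept, i.e. error ~ a * N^slope.
+slope, intercept = np.polyfit(np.log(sizes), np.log(errors), 1)
+a = np.exp(intercept)
+print(f"error ~ {a:.2f} * N^{slope:.3f}")
+
+# Extrapolate to a larger synthetic-data budget under the fitted law.
+print("predicted error at 1e8 images:", round(a * (1e8) ** slope, 3))
+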
+
+
+
+
+ + ☆ Gen2Det: Generate to Detect + + +
+ Recently, diffusion models have shown improvements in synthetic image
+quality as well as better control in generation. We motivate and present
+Gen2Det, a simple modular pipeline to create synthetic training data for
+object detection for free by leveraging state-of-the-art grounded image
+generation methods. Unlike existing works, which generate individual object
+instances and require identifying the foreground and then pasting it onto
+other images, we simplify the pipeline by directly generating scene-centric
+images. In addition to the synthetic data, Gen2Det also proposes a suite of
+techniques to best utilize the generated data, including image-level
+filtering, instance-level filtering, and a better training recipe to account
+for imperfections in the generation. Using Gen2Det, we show healthy
+improvements on object detection and segmentation tasks under various
+settings and agnostic to detection methods. In the long-tailed detection
+setting on LVIS, Gen2Det improves the performance on rare categories by a
+large margin while also significantly improving the performance on other
+categories, e.g. we see an improvement of 2.13 Box AP and 1.84 Mask AP over
+just training on real data on LVIS with Mask R-CNN. In the low-data regime
+setting on COCO, Gen2Det consistently improves both Box and Mask AP by 2.27
+and 1.85 points. In the most general detection setting, Gen2Det still
+demonstrates robust performance gains, e.g. it improves the Box and Mask AP
+on COCO by 0.45 and 0.32 points.
+
+
+
+
+
+ + ☆ MuRF: Multi-Baseline Radiance Fields + + +
+ We present Multi-Baseline Radiance Fields (MuRF), a general feed-forward
+approach to solving sparse view synthesis under multiple different baseline
+settings (small and large baselines, and different numbers of input views).
+To render a target novel view, we discretize the 3D space into planes
+parallel to the target image plane, and accordingly construct a target view
+frustum volume. Such a target volume representation is spatially aligned with
+the target view, which effectively aggregates relevant information from the
+input views for high-quality rendering. It also facilitates subsequent
+radiance field regression with a convolutional network thanks to its
+axis-aligned nature. The 3D context modeled by the convolutional network
+enables our method to synthesize sharper scene structures than prior works.
+Our MuRF achieves state-of-the-art performance across multiple different
+baseline settings and diverse scenarios ranging from simple objects (DTU) to
+complex indoor and outdoor scenes (RealEstate10K and LLFF). We also show
+promising zero-shot generalization abilities on the Mip-NeRF 360 dataset,
+demonstrating the general applicability of MuRF.
+
+
+ comment: Project page: https://haofeixu.github.io/murf/ +
+
+
+
+
+ + ☆ EAGLES: Efficient Accelerated 3D Gaussians with Lightweight EncodingS + + +
+ Recently, 3D Gaussian splatting (3D-GS) has gained popularity in novel-view
+scene synthesis. It addresses the challenges of lengthy training times and
+slow rendering speeds associated with Neural Radiance Fields (NeRFs). Through
+rapid, differentiable rasterization of 3D Gaussians, 3D-GS achieves real-time
+rendering and accelerated training. These representations, however, demand
+substantial memory resources for both training and storage, as they require
+millions of Gaussians in their point cloud representation for each scene. We
+present a technique utilizing quantized embeddings to significantly reduce
+memory storage requirements and a coarse-to-fine training strategy for a
+faster and more stable optimization of the Gaussian point clouds. Our
+approach results in scene representations with fewer Gaussians and quantized
+representations, leading to faster training times and rendering speeds for
+real-time rendering of high resolution scenes. We reduce memory by more than
+an order of magnitude all while maintaining the reconstruction quality. We
+validate the effectiveness of our approach on a variety of datasets and
+scenes, preserving the visual quality while consuming 10-20x less memory and
+offering faster training and inference. The project page and code are
+available at https://efficientgaussian.github.io
+
+
+ comment: Website: https://efficientgaussian.github.io Code: + https://github.com/Sharath-girish/efficientgaussian +
+
+
+
+
+ + ☆ Visual Geometry Grounded Deep Structure From Motion + + +
+ Structure-from-motion (SfM) is a long-standing problem in the computer vision +community, which aims to reconstruct the camera poses and 3D structure of a +scene from a set of unconstrained 2D images. Classical frameworks solve this +problem in an incremental manner by detecting and matching keypoints, +registering images, triangulating 3D points, and conducting bundle adjustment. +Recent research efforts have predominantly revolved around harnessing the power +of deep learning techniques to enhance specific elements (e.g., keypoint +matching), but are still based on the original, non-differentiable pipeline. +Instead, we propose a new deep pipeline VGGSfM, where each component is fully +differentiable and thus can be trained in an end-to-end manner. To this end, we +introduce new mechanisms and simplifications. First, we build on recent +advances in deep 2D point tracking to extract reliable pixel-accurate tracks, +which eliminates the need for chaining pairwise matches. Furthermore, we +recover all cameras simultaneously based on the image and track features +instead of gradually registering cameras. Finally, we optimise the cameras and +triangulate 3D points via a differentiable bundle adjustment layer. We attain +state-of-the-art performance on three popular datasets, CO3D, IMC Phototourism, +and ETH3D. + +
+
+ comment: 8 figures. Project page: https://vggsfm.github.io/ +
+
+
+
+
+ + ☆ NeRFiller: Completing Scenes via Generative 3D Inpainting + + +
+ We propose NeRFiller, an approach that completes missing portions of a 3D +capture via generative 3D inpainting using off-the-shelf 2D visual generative +models. Often parts of a captured 3D scene or object are missing due to mesh +reconstruction failures or a lack of observations (e.g., contact regions, such +as the bottom of objects, or hard-to-reach areas). We approach this challenging +3D inpainting problem by leveraging a 2D inpainting diffusion model. We +identify a surprising behavior of these models, where they generate more 3D +consistent inpaints when images form a 2$\times$2 grid, and show how to +generalize this behavior to more than four images. We then present an iterative +framework to distill these inpainted regions into a single consistent 3D scene. +In contrast to related works, we focus on completing scenes rather than +deleting foreground objects, and our approach does not require tight 2D object +masks or text. We compare our approach to relevant baselines adapted to our +setting on a variety of scenes, where NeRFiller creates the most 3D consistent +and plausible scene completions. Our project page is at +https://ethanweber.me/nerfiller. + +
+
+ comment: Project page: https://ethanweber.me/nerfiller +
+
+
+
+
+ + ☆ GenDeF: Learning Generative Deformation Field for Video Generation + + +
+ We offer a new perspective on approaching the task of video generation. +Instead of directly synthesizing a sequence of frames, we propose to render a +video by warping one static image with a generative deformation field (GenDeF). +Such a pipeline enjoys three appealing advantages. First, we can sufficiently +reuse a well-trained image generator to synthesize the static image (also +called canonical image), alleviating the difficulty in producing a video and +thereby resulting in better visual quality. Second, we can easily convert a +deformation field to optical flows, making it possible to apply explicit +structural regularizations for motion modeling, leading to temporally +consistent results. Third, the disentanglement between content and motion +allows users to process a synthesized video through processing its +corresponding static image without any tuning, facilitating many applications +like video editing, keypoint tracking, and video segmentation. Both qualitative +and quantitative results on three common video generation benchmarks +demonstrate the superiority of our GenDeF method. + +
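+ The 'warp one static image with a deformation field' step maps every output
+pixel to a sampling location in the canonical image; in PyTorch this is what
+grid_sample computes, and the per-pixel offsets play the role of optical
+flow. A minimal sketch (illustrative, not the authors' code):
+
+import torch
+import torch.nn.functional as F
+
+def warp_with_deformation(canonical, deformation):
+    """canonical: (B,3,H,W) static image; deformation: (B,H,W,2) offsets in
+    normalized [-1,1] coordinates. Returns the warped frame and the flow."""
+    B, _, H, W = canonical.shape
+    ys, xs = torch.meshgrid(torch.linspace(-1, 1, H), torch.linspace(-1, 1, W),
+                            indexing="ij")
+    identity = torch.stack([xs, ys], dim=-1).expand(B, H, W, 2)
+    grid = identity + deformation           # where each output pixel samples from
+    frame = F.grid_sample(canonical, grid, align_corners=True)
+    flow = deformation                      # normalized offsets act as the flow
+    return frame, flow
+
+canonical = torch.rand(1, 3, 8, 8)
+deformation = 0.1 * torch.randn(1, 8, 8, 2)
+frame, flow = warp_with_deformation(canonical, deformation)
+print(frame.shape, flow.shape)
+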
+
+ comment: Project page: https://aim-uofa.github.io/GenDeF/ +
+
+
+
+
+ + ☆ PrimDiffusion: Volumetric Primitives Diffusion for 3D Human Generation NeurIPS 2023 + + +
+ We present PrimDiffusion, the first diffusion-based framework for 3D human +generation. Devising diffusion models for 3D human generation is difficult due +to the intensive computational cost of 3D representations and the articulated +topology of 3D humans. To tackle these challenges, our key insight is operating +the denoising diffusion process directly on a set of volumetric primitives, +which models the human body as a number of small volumes with radiance and +kinematic information. This volumetric primitives representation marries the +capacity of volumetric representations with the efficiency of primitive-based +rendering. Our PrimDiffusion framework has three appealing properties: 1) +compact and expressive parameter space for the diffusion model, 2) flexible 3D +representation that incorporates human prior, and 3) decoder-free rendering for +efficient novel-view and novel-pose synthesis. Extensive experiments validate +that PrimDiffusion outperforms state-of-the-art methods in 3D human generation. +Notably, compared to GAN-based methods, our PrimDiffusion supports real-time +rendering of high-quality 3D humans at a resolution of $512\times512$ once the +denoising process is done. We also demonstrate the flexibility of our framework +on training-free conditional generation such as texture transfer and 3D +inpainting. + +
+
+ comment: NeurIPS 2023; Project page + https://frozenburning.github.io/projects/primdiffusion/ Code available at + https://github.com/FrozenBurning/PrimDiffusion +
+
+
+
+
+ + ☆ MonoGaussianAvatar: Monocular Gaussian Point-based Head Avatar + + +
+ The ability to animate photo-realistic head avatars reconstructed from +monocular portrait video sequences represents a crucial step in bridging the +gap between the virtual and real worlds. Recent advancements in head avatar +techniques, including explicit 3D morphable meshes (3DMM), point clouds, and +neural implicit representations, have been exploited for this ongoing research. +However, 3DMM-based methods are constrained by their fixed topologies, +point-based approaches suffer from a heavy training burden due to the extensive +quantity of points involved, and neural implicit representations suffer from limitations in +deformation flexibility and rendering efficiency. In response to these +challenges, we propose MonoGaussianAvatar (Monocular Gaussian Point-based Head +Avatar), a novel approach that harnesses 3D Gaussian point representation +coupled with a Gaussian deformation field to learn explicit head avatars from +monocular portrait videos. We define our head avatars with Gaussian points +characterized by adaptable shapes, enabling flexible topology. These points +exhibit movement with a Gaussian deformation field in alignment with the target +pose and expression of a person, facilitating efficient deformation. +Additionally, the Gaussian points have controllable shape, size, color, and +opacity combined with Gaussian splatting, allowing for efficient training and +rendering. Experiments demonstrate the superior performance of our method, +which achieves state-of-the-art results among previous methods. + +
+
+ + comment: The link to our project page is + https://yufan1012.github.io/MonoGaussianAvatar +
+
+
+
+
+ + ☆ GenTron: Delving Deep into Diffusion Transformers for Image and Video + Generation + + +
+ In this study, we explore Transformer-based diffusion models for image and +video generation. Despite the dominance of Transformer architectures in various +fields due to their flexibility and scalability, the visual generative domain +primarily utilizes CNN-based U-Net architectures, particularly in +diffusion-based models. We introduce GenTron, a family of Generative models +employing Transformer-based diffusion, to address this gap. Our initial step +was to adapt Diffusion Transformers (DiTs) from class to text conditioning, a +process involving thorough empirical exploration of the conditioning mechanism. +We then scale GenTron from approximately 900M to over 3B parameters, observing +significant improvements in visual quality. Furthermore, we extend GenTron to +text-to-video generation, incorporating novel motion-free guidance to enhance +video quality. In human evaluations against SDXL, GenTron achieves a 51.1% win +rate in visual quality (with a 19.8% draw rate), and a 42.3% win rate in text +alignment (with a 42.9% draw rate). GenTron also excels in the T2I-CompBench, +underscoring its strengths in compositional generation. We believe this work +will provide meaningful insights and serve as a valuable reference for future +research. + +
+
+ comment: Technical report. Website: + https://www.shoufachen.com/gentron_website/ +
+
+
+
+
+ + ☆ Improved Visual Grounding through Self-Consistent Explanations + + +
+ Vision-and-language models trained to match images with text can be combined +with visual explanation methods to point to the locations of specific objects +in an image. Our work shows that the localization --"grounding"-- abilities of +these models can be further improved by finetuning for self-consistent visual +explanations. We propose a strategy for augmenting existing text-image datasets +with paraphrases using a large language model, and SelfEQ, a weakly-supervised +strategy on visual explanation maps for paraphrases that encourages +self-consistency. Specifically, for an input textual phrase, we attempt to +generate a paraphrase and finetune the model so that the phrase and paraphrase +map to the same region in the image. We posit that this both expands the +vocabulary that the model is able to handle, and improves the quality of the +object locations highlighted by gradient-based visual explanation methods (e.g. +GradCAM). We demonstrate that SelfEQ improves performance on Flickr30k, +ReferIt, and RefCOCO+ over a strong baseline method and several prior works. +In particular, compared to other methods that do not use any type of box +annotations, we obtain 84.07% on Flickr30k (an absolute improvement of 4.69%), +67.40% on ReferIt (an absolute improvement of 7.68%), and 75.10%, 55.49% on +RefCOCO+ test sets A and B respectively (an absolute improvement of 3.74% on +average). + +
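+ A small sketch of the self-consistency idea, assuming a placeholder gradcam_map(model, image, text) that returns a gradient-based explanation heatmap (not the authors' implementation): the phrase and its paraphrase are pushed to produce matching maps.
+
+    import torch
+
+    def normalize_map(m, eps=1e-8):
+        m = m - m.amin()
+        return m / (m.amax() + eps)
+
+    def self_consistency_loss(model, image, phrase, paraphrase, gradcam_map):
+        a = normalize_map(gradcam_map(model, image, phrase))
+        b = normalize_map(gradcam_map(model, image, paraphrase))
+        return ((a - b) ** 2).mean()   # encourage the two explanation maps to agree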
+
+ comment: Project Page: https://catherine-r-he.github.io/SelfEQ/ +
+
+
+
+
+ + ☆ SPIDeRS: Structured Polarization for Invisible Depth and Reflectance + Sensing + + +
+ Can we capture shape and reflectance in stealth? Such capability would be +valuable for many application domains in vision, xR, robotics, and HCI. We +introduce Structured Polarization, the first depth and reflectance sensing +method using patterns of polarized light (SPIDeRS). The key idea is to modulate +the angle of linear polarization (AoLP) of projected light at each pixel. The +use of polarization makes it invisible and lets us recover not only depth but +also directly surface normals and even reflectance. We implement SPIDeRS with a +liquid crystal spatial light modulator (SLM) and a polarimetric camera. We +derive a novel method for robustly extracting the projected structured +polarization pattern from the polarimetric object appearance. We evaluate the +effectiveness of SPIDeRS by applying it to a number of real-world objects. The +results show that our method successfully reconstructs object shapes of various +materials and is robust to diffuse reflection and ambient light. We also +demonstrate relighting using recovered surface normals and reflectance. We +believe SPIDeRS opens a new avenue of polarization use in visual sensing. + +
+
+
+
+
+ + ☆ Generating Illustrated Instructions + + +
+ We introduce the new task of generating Illustrated Instructions, i.e., +visual instructions customized to a user's needs. We identify desiderata unique +to this task, and formalize it through a suite of automatic and human +evaluation metrics, designed to measure the validity, consistency, and efficacy +of the generations. We combine the power of large language models (LLMs) +together with strong text-to-image generation diffusion models to propose a +simple approach called StackedDiffusion, which generates such illustrated +instructions given text as input. The resulting model strongly outperforms +baseline approaches and state-of-the-art multimodal LLMs; and in 30% of cases, +users even prefer it to human-generated articles. Most notably, it enables +various new and exciting applications far beyond what static articles on the +web can provide, such as personalized instructions complete with intermediate +steps and pictures in response to a user's individual situation. + +
+
+ comment: Project website: + http://facebookresearch.github.io/IllustratedInstructions +
+
+
+
+
+ + ☆ Free3D: Consistent Novel View Synthesis without 3D Representation + + +
+ We introduce Free3D, a simple approach designed for open-set novel view +synthesis (NVS) from a single image. Similar to Zero-1-to-3, we start from a +pre-trained 2D image generator for generalization, and fine-tune it for NVS. +Compared to recent and concurrent works, we obtain significant improvements +without resorting to an explicit 3D representation (which is slow and +memory-consuming) or training an additional 3D network. We do so by better +encoding the target camera pose via a new per-pixel ray conditioning +normalization (RCN) layer. The latter injects pose information into the +underlying 2D image generator by telling each pixel its specific viewing +direction. We also improve multi-view consistency via a light-weight multi-view +attention layer and multi-view noise sharing. We train Free3D on the Objaverse +dataset and demonstrate excellent generalization to various new categories in +several new datasets, including OmniObject3D and GSO. We hope our simple and +effective approach will serve as a solid baseline and help future research in +NVS with more accurate poses. The project page is available at +https://chuanxiaz.com/free3d/. + +
+
+ comment: webpage: https://chuanxiaz.com/free3d/ +
+
+
+
+
+ + ☆ Multiview Aerial Visual Recognition (MAVREC): Can Multi-view Improve + Aerial Visual Perception? + + +
+ Despite the commercial abundance of UAVs, aerial data acquisition remains +challenging, and the existing Asia and North America-centric open-source UAV +datasets are small-scale or low-resolution and lack diversity in scene +contextuality. Additionally, the color content of the scenes, solar-zenith +angle, and population density of different geographies influence the data +diversity. These two factors conjointly render suboptimal aerial-visual +perception of the deep neural network (DNN) models trained primarily on the +ground-view data, including the open-world foundational models. + To pave the way for a transformative era of aerial detection, we present +Multiview Aerial Visual RECognition or MAVREC, a video dataset where we record +synchronized scenes from different perspectives -- ground camera and +drone-mounted camera. MAVREC consists of around 2.5 hours of industry-standard +2.7K resolution video sequences, more than 0.5 million frames, and 1.1 million +annotated bounding boxes. This makes MAVREC the largest ground and aerial-view +dataset, and the fourth largest among all drone-based datasets across all +modalities and tasks. Through our extensive benchmarking on MAVREC, we +recognize that augmenting object detectors with ground-view images from the +corresponding geographical location is a superior pre-training strategy for +aerial detection. Building on this strategy, we benchmark MAVREC with a +curriculum-based semi-supervised object detection approach that leverages +labeled (ground and aerial) and unlabeled (only aerial) images to enhance the +aerial detection. We publicly release the MAVREC dataset: +https://mavrec.github.io. + +
+
+
+
+
+ + ☆ PlayFusion: Skill Acquisition via Diffusion from Language-Annotated Play + + +
+ Learning from unstructured and uncurated data has become the dominant +paradigm for generative approaches in language and vision. Such unstructured +and unguided behavior data, commonly known as play, is also easier to collect +in robotics but much more difficult to learn from due to its inherently +multimodal, noisy, and suboptimal nature. In this paper, we study this problem +of learning goal-directed skill policies from unstructured play data which is +labeled with language in hindsight. Specifically, we leverage advances in +diffusion models to learn a multi-task diffusion model to extract robotic +skills from play data. Using a conditional denoising diffusion process in the +space of states and actions, we can gracefully handle the complexity and +multimodality of play data and generate diverse and interesting robot +behaviors. To make diffusion models more useful for skill learning, we +encourage robotic agents to acquire a vocabulary of skills by introducing +discrete bottlenecks into the conditional behavior generation process. In our +experiments, we demonstrate the effectiveness of our approach across a wide +variety of environments in both simulation and the real world. Results +visualizations and videos at https://play-fusion.github.io + +
+
+ comment: In CoRL 2023. Website at https://play-fusion.github.io +
+
+
+
+
+ + ☆ Digital Life Project: Autonomous 3D Characters with Social Intelligence + + +
+ In this work, we present Digital Life Project, a framework utilizing language +as the universal medium to build autonomous 3D characters, who are capable of +engaging in social interactions and expressing with articulated body motions, +thereby simulating life in a digital environment. Our framework comprises two +primary components: 1) SocioMind: a meticulously crafted digital brain that +models personalities with systematic few-shot exemplars, incorporates a +reflection process based on psychology principles, and emulates autonomy by +initiating dialogue topics; 2) MoMat-MoGen: a text-driven motion synthesis +paradigm for controlling the character's digital body. It integrates motion +matching, a proven industry technique to ensure motion quality, with +cutting-edge advancements in motion generation for diversity. Extensive +experiments demonstrate that each module achieves state-of-the-art performance +in its respective domain. Collectively, they enable virtual characters to +initiate and sustain dialogues autonomously, while evolving their +socio-psychological states. Concurrently, these characters can perform +contextually relevant bodily movements. Additionally, a motion captioning +module further allows the virtual character to recognize and appropriately +respond to human players' actions. Homepage: https://digital-life-project.com/ + +
+
+ comment: Homepage: https://digital-life-project.com/ +
+
+
+
+
+ + ☆ HyperDreamer: Hyper-Realistic 3D Content Generation and Editing from a + Single Image SIGGRAPH + + +
+ 3D content creation from a single image is a long-standing yet highly +desirable task. Recent advances introduce 2D diffusion priors, yielding +reasonable results. However, existing methods are not hyper-realistic enough +for post-generation usage, as users cannot view, render and edit the resulting +3D content from a full range. To address these challenges, we introduce +HyperDreamer with several key designs and appealing properties: 1) Viewable: +360 degree mesh modeling with high-resolution textures enables the creation of +visually compelling 3D models from a full range of observation points. 2) +Renderable: Fine-grained semantic segmentation and data-driven priors are +incorporated as guidance to learn reasonable albedo, roughness, and specular +properties of the materials, enabling semantic-aware arbitrary material +estimation. 3) Editable: For a generated model or their own data, users can +interactively select any region via a few clicks and efficiently edit the +texture with text-based guidance. Extensive experiments demonstrate the +effectiveness of HyperDreamer in modeling region-aware materials with +high-resolution textures and enabling user-friendly editing. We believe that +HyperDreamer holds promise for advancing 3D content creation and finding +applications in various domains. + +
+
+ comment: SIGGRAPH Asia 2023 (conference track). Project page: + https://ys-imtech.github.io/HyperDreamer/ +
+
+
+
+
+ + ☆ Sim-to-Real Causal Transfer: A Metric Learning Approach to + Causally-Aware Interaction Representations + + +
+ Modeling spatial-temporal interactions among neighboring agents is at the +heart of multi-agent problems such as motion forecasting and crowd navigation. +Despite notable progress, it remains unclear to which extent modern +representations can capture the causal relationships behind agent interactions. +In this work, we take an in-depth look at the causal awareness of these +representations, from computational formalism to real-world practice. First, we +cast doubt on the notion of non-causal robustness studied in the recent +CausalAgents benchmark. We show that recent representations are already +partially resilient to perturbations of non-causal agents, and yet modeling +indirect causal effects involving mediator agents remains challenging. To +address this challenge, we introduce a metric learning approach that +regularizes latent representations with causal annotations. Our controlled +experiments show that this approach not only leads to higher degrees of causal +awareness but also yields stronger out-of-distribution robustness. To further +operationalize it in practice, we propose a sim-to-real causal transfer method +via cross-domain multi-task learning. Experiments on pedestrian datasets show +that our method can substantially boost generalization, even in the absence of +real-world causal annotations. We hope our work provides a new perspective on +the challenges and potential pathways towards causally-aware representations of +multi-agent interactions. Our code is available at +https://github.com/socialcausality. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Self-Guided Open-Vocabulary Semantic Segmentation + + +
+ Vision-Language Models (VLMs) have emerged as promising tools for open-ended +image understanding tasks, including open vocabulary segmentation. Yet, direct +application of such VLMs to segmentation is non-trivial, since VLMs are trained +with image-text pairs and naturally lack pixel-level granularity. Recent works +have made advancements in bridging this gap, often by leveraging the shared +image-text space in which the image and a provided text prompt are represented. +In this paper, we challenge the capabilities of VLMs further and tackle +open-vocabulary segmentation without the need for any textual input. To this +end, we propose a novel Self-Guided Semantic Segmentation (Self-Seg) framework. +Self-Seg is capable of automatically detecting relevant class names from +clustered BLIP embeddings and using these for accurate semantic segmentation. +In addition, we propose an LLM-based Open-Vocabulary Evaluator (LOVE) to +effectively assess predicted open-vocabulary class names. We achieve +state-of-the-art results on Pascal VOC, ADE20K and CityScapes for +open-vocabulary segmentation without given class names, as well as competitive +performance with methods where class names are given. All code and data will be +released. + +
+
+
+
+
+ + ☆ PICTURE: PhotorealistIC virtual Try-on from UnconstRained dEsigns + + +
+ In this paper, we propose a novel virtual try-on from unconstrained designs +(ucVTON) task to enable photorealistic synthesis of personalized composite +clothing on input human images. Unlike prior arts constrained by specific input +types, our method allows flexible specification of style (text or image) and +texture (full garment, cropped sections, or texture patches) conditions. To +address the entanglement challenge when using full garment images as +conditions, we develop a two-stage pipeline with explicit disentanglement of +style and texture. In the first stage, we generate a human parsing map +reflecting the desired style conditioned on the input. In the second stage, we +composite textures onto the parsing map areas based on the texture input. To +represent complex and non-stationary textures that have never been achieved in +previous fashion editing works, we first propose extracting hierarchical and +balanced CLIP features and applying position encoding in VTON. Experiments +demonstrate superior synthesis quality and personalization enabled by our +method. The flexible control over style and texture mixing brings virtual +try-on to a new level of user experience for online shopping and fashion +design. + +
+
+ comment: Project page: https://ningshuliang.github.io/2023/Arxiv/index.html +
+
+
+
+
+ + ☆ Dream2Real: Zero-Shot 3D Object Rearrangement with Vision-Language + Models + + +
+ We introduce Dream2Real, a robotics framework which integrates +vision-language models (VLMs) trained on 2D data into a 3D object rearrangement +pipeline. This is achieved by the robot autonomously constructing a 3D +representation of the scene, where objects can be rearranged virtually and an +image of the resulting arrangement rendered. These renders are evaluated by a +VLM, so that the arrangement which best satisfies the user instruction is +selected and recreated in the real world with pick-and-place. This enables +language-conditioned rearrangement to be performed zero-shot, without needing +to collect a training dataset of example arrangements. Results on a series of +real-world tasks show that this framework is robust to distractors, +controllable by language, capable of understanding complex multi-object +relations, and readily applicable to both tabletop and 6-DoF rearrangement +tasks. + +
+
+ comment: Project webpage with videos: https://www.robot-learning.uk/dream2real +
+
+
+
+
+ + ☆ Camera Height Doesn't Change: Unsupervised Monocular Scale-Aware + Road-Scene Depth Estimation + + +
+ Monocular depth estimators either require explicit scale supervision through +auxiliary sensors or suffer from scale ambiguity, which renders them difficult +to deploy in downstream applications. A possible source of scale is the sizes +of objects found in the scene, but inaccurate localization makes them difficult +to exploit. In this paper, we introduce a novel scale-aware monocular depth +estimation method called StableCamH that does not require any auxiliary sensor +or supervision. The key idea is to exploit prior knowledge of object heights in +the scene but aggregate the height cues into a single invariant measure common +to all frames in a road video sequence, namely the camera height. By +formulating monocular depth estimation as camera height optimization, we +achieve robust and accurate unsupervised end-to-end training. To realize +StableCamH, we devise a novel learning-based size prior that can directly +convert car appearance into its dimensions. Extensive experiments on KITTI and +Cityscapes show the effectiveness of StableCamH, its state-of-the-art accuracy +compared with related methods, and its generalizability. The training framework +of StableCamH can be used for any monocular depth estimation method and will +hopefully become a fundamental building block for further work. + +
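+ A hypothetical sketch of the camera-height idea (names and the closed-form rescaling below are illustrative assumptions; the paper formulates this as an optimization): per-frame height estimates are aggregated into one robust sequence-level value, which then fixes the scale of an up-to-scale depth map.
+
+    import numpy as np
+
+    def aggregate_camera_height(per_frame_heights):
+        # per_frame_heights: iterable of arrays of noisy per-object height estimates
+        all_h = np.concatenate([np.atleast_1d(h) for h in per_frame_heights])
+        return float(np.median(all_h))            # one invariant value per sequence
+
+    def rescale_depth(depth_up_to_scale, estimated_height, prior_height):
+        # scale the relative depth so the implied camera height matches the prior
+        return depth_up_to_scale * (prior_height / estimated_height)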
+
+
+
+
+ + ☆ Diffusion Reflectance Map: Single-Image Stochastic Inverse Rendering of + Illumination and Reflectance + + +
+ Reflectance bounds the frequency spectrum of illumination in the object +appearance. In this paper, we introduce the first stochastic inverse rendering +method, which recovers the full frequency spectrum of an illumination jointly +with the object reflectance from a single image. Our key idea is to solve this +blind inverse problem in the reflectance map, an appearance representation +invariant to the underlying geometry, by learning to reverse the image +formation with a novel diffusion model which we refer to as the Diffusion +Reflectance Map Network (DRMNet). Given an observed reflectance map converted +and completed from the single input image, DRMNet generates a reflectance map +corresponding to a perfect mirror sphere while jointly estimating the +reflectance. The forward process can be understood as gradually filtering a +natural illumination with lower and lower frequency reflectance and additive +Gaussian noise. DRMNet learns to invert this process with two subnetworks, +IllNet and RefNet, which work in concert towards this joint estimation. The +network is trained on an extensive synthetic dataset and is demonstrated to +generalize to real images, showing state-of-the-art accuracy on established +datasets. + +
+
+
+
+
+ + ☆ Correspondences of the Third Kind: Camera Pose Estimation from Object + Reflection + + +
+ Computer vision has long relied on two kinds of correspondences: pixel +correspondences in images and 3D correspondences on object surfaces. Is there +another kind, and if there is, what can they do for us? In this paper, we +introduce correspondences of the third kind, which we call reflection correspondences, +and show that they can help estimate camera pose by just looking at objects +without relying on the background. Reflection correspondences are point +correspondences in the reflected world, i.e., the scene reflected by the object +surface. The object geometry and reflectance alter the scene geometrically and +radiometrically, respectively, causing incorrect pixel correspondences. +Geometry recovered from each image is also hampered by distortions, namely +generalized bas-relief ambiguity, leading to erroneous 3D correspondences. We +show that reflection correspondences can resolve the ambiguities arising from +these distortions. We introduce a neural correspondence estimator and a RANSAC +algorithm that fully leverages all three kinds of correspondences for robust +and accurate joint camera pose and object shape estimation just from the object +appearance. The method expands the horizon of numerous downstream tasks, +including camera pose estimation for appearance modeling (e.g., NeRF) and +motion estimation of reflective objects (e.g., cars on the road), to name a +few, as it relieves the requirement of overlapping background. + +
+
+
+
+
+ + ☆ RAVE: Randomized Noise Shuffling for Fast and Consistent Video Editing + with Diffusion Models + + +
+ Recent advancements in diffusion-based models have demonstrated significant +success in generating images from text. However, video editing models have not +yet reached the same level of visual quality and user control. To address this, +we introduce RAVE, a zero-shot video editing method that leverages pre-trained +text-to-image diffusion models without additional training. RAVE takes an input +video and a text prompt to produce high-quality videos while preserving the +original motion and semantic structure. It employs a novel noise shuffling +strategy, leveraging spatio-temporal interactions between frames, to produce +temporally consistent videos faster than existing methods. It is also efficient +in terms of memory requirements, allowing it to handle longer videos. RAVE is +capable of a wide range of edits, from local attribute modifications to shape +transformations. In order to demonstrate the versatility of RAVE, we create a +comprehensive video evaluation dataset ranging from object-focused scenes to +complex human activities like dancing and typing, and dynamic scenes featuring +swimming fish and boats. Our qualitative and quantitative experiments highlight +the effectiveness of RAVE in diverse video editing scenarios compared to +existing methods. Our code, dataset and videos can be found in +https://rave-video.github.io. + +
+
+ comment: Project webpage: https://rave-video.github.io , Github: + http://github.com/rehg-lab/RAVE +
+
+
+
+
+ + ☆ Multimodal Industrial Anomaly Detection by Crossmodal Feature Mapping + + +
+ The paper explores the industrial multimodal Anomaly Detection (AD) task, +which exploits point clouds and RGB images to localize anomalies. We introduce +a novel light and fast framework that learns to map features from one modality +to the other on nominal samples. At test time, anomalies are detected by +pinpointing inconsistencies between observed and mapped features. Extensive +experiments show that our approach achieves state-of-the-art detection and +segmentation performance in both the standard and few-shot settings on the +MVTec 3D-AD dataset while achieving faster inference and occupying less memory +than previous multimodal AD methods. Moreover, we propose a layer-pruning +technique to improve memory and time efficiency with a marginal sacrifice in +performance. + +
+
+
+
+
+ + ☆ Bootstrapping Autonomous Radars with Self-Supervised Learning + + +
+ The perception of autonomous vehicles using radars has attracted increased +research interest due to radar's ability to operate in fog and bad weather. However, +training radar models is hindered by the cost and difficulty of annotating +large-scale radar data. To overcome this bottleneck, we propose a +self-supervised learning framework to leverage the large amount of unlabeled +radar data to pre-train radar-only embeddings for self-driving perception +tasks. The proposed method combines radar-to-radar and radar-to-vision +contrastive losses to learn a general representation from unlabeled radar +heatmaps paired with their corresponding camera images. When used for +downstream object detection, we demonstrate that the proposed self-supervision +framework can improve the accuracy of state-of-the-art supervised baselines by +5.8% in mAP. + +
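+ In the simplest reading, the two contrastive terms mentioned above reduce to standard InfoNCE losses; a sketch assuming embeddings already produced by radar and camera encoders (placeholders, not the paper's code):
+
+    import torch
+    import torch.nn.functional as F
+
+    def info_nce(z_a, z_b, temperature=0.07):
+        # z_a, z_b: (B, D) paired embeddings; matching rows are positives
+        z_a, z_b = F.normalize(z_a, dim=1), F.normalize(z_b, dim=1)
+        logits = z_a @ z_b.t() / temperature
+        targets = torch.arange(z_a.size(0), device=z_a.device)
+        return F.cross_entropy(logits, targets)
+
+    def ssl_loss(radar_view1, radar_view2, radar_emb, image_emb, w=0.5):
+        # radar-to-radar term (two augmented radar views) + radar-to-vision term
+        return info_nce(radar_view1, radar_view2) + w * info_nce(radar_emb, image_emb)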
+
+
+
+
+ + ☆ AVA: Towards Autonomous Visualization Agents through Visual + Perception-Driven Decision-Making + + +
+ With recent advances in multi-modal foundation models, the previously +text-only large language models (LLM) have evolved to incorporate visual input, +opening up unprecedented opportunities for various applications in +visualization. Our work explores the utilization of the visual perception +ability of multi-modal LLMs to develop Autonomous Visualization Agents (AVAs) +that can interpret and accomplish user-defined visualization objectives through +natural language. We propose the first framework for the design of AVAs and +present several usage scenarios intended to demonstrate the general +applicability of the proposed paradigm. The addition of visual perception +allows AVAs to act as the virtual visualization assistant for domain experts +who may lack the knowledge or expertise in fine-tuning visualization outputs. +Our preliminary exploration and proof-of-concept agents suggest that this +approach can be widely applicable whenever the choices of appropriate +visualization parameters require the interpretation of previous visual output. +Feedback from unstructured interviews with experts in AI research, medical +visualization, and radiology has been incorporated, highlighting the +practicality and potential of AVAs. Our study indicates that AVAs represent a +general paradigm for designing intelligent visualization systems that can +achieve high-level visualization goals, which pave the way for developing +expert-level visualization agents in the future. + +
+
+
+
+
+ + ☆ FRNet: Frustum-Range Networks for Scalable LiDAR Segmentation + + +
+ LiDAR segmentation is crucial for autonomous driving systems. The recent +range-view approaches are promising for real-time processing. However, they +suffer inevitably from corrupted contextual information and rely heavily on +post-processing techniques for prediction refinement. In this work, we propose +a simple yet powerful FRNet that restores the contextual information of the +range image pixels with corresponding frustum LiDAR points. Firstly, a frustum +feature encoder module is used to extract per-point features within the frustum +region, which preserves scene consistency and is crucial for point-level +predictions. Next, a frustum-point fusion module is introduced to update +per-point features hierarchically, which enables each point to extract more +surrounding information via the frustum features. Finally, a head fusion module +is used to fuse features at different levels for final semantic prediction. +Extensive experiments on four popular LiDAR segmentation benchmarks under +various task setups demonstrate our superiority. FRNet achieves competitive +performance while maintaining high efficiency. The code is publicly available. + +
+
+ comment: Preprint; 20 pages, 9 figures, 9 tables; Code at + https://github.com/Xiangxu-0103/FRNet +
+
+
+
+
+ + ☆ Hierarchical Spatio-temporal Decoupling for Text-to-Video Generation + + +
+ Despite diffusion models having shown powerful abilities to generate +photorealistic images, generating videos that are realistic and diverse still +remains in its infancy. One of the key reasons is that current methods +intertwine spatial content and temporal dynamics together, leading to a notably +increased complexity of text-to-video generation (T2V). In this work, we +propose HiGen, a diffusion model-based method that improves performance by +decoupling the spatial and temporal factors of videos from two perspectives, +i.e., structure level and content level. At the structure level, we decompose +the T2V task into two steps, including spatial reasoning and temporal +reasoning, using a unified denoiser. Specifically, we generate spatially +coherent priors using text during spatial reasoning and then generate +temporally coherent motions from these priors during temporal reasoning. At the +content level, we extract two subtle cues from the content of the input video +that can express motion and appearance changes, respectively. These two cues +then guide the model's training for generating videos, enabling flexible +content variations and enhancing temporal stability. Through the decoupled +paradigm, HiGen can effectively reduce the complexity of this task and generate +realistic videos with semantic accuracy and motion stability. Extensive +experiments demonstrate the superior performance of HiGen over the +state-of-the-art T2V methods. + +
+
+ comment: Project page: https://higen-t2v.github.io/ +
+
+
+
+
+ + ☆ GSGFormer: Generative Social Graph Transformer for Multimodal Pedestrian + Trajectory Prediction + + +
+ Pedestrian trajectory prediction, vital for self-driving cars and +socially-aware robots, is complicated due to intricate interactions between +pedestrians, their environment, and other Vulnerable Road Users. This paper +presents GSGFormer, an innovative generative model adept at predicting +pedestrian trajectories by considering these complex interactions and offering +a plethora of potential modal behaviors. We incorporate a heterogeneous graph +neural network to capture interactions between pedestrians, semantic maps, and +potential destinations. The Transformer module extracts temporal features, +while our novel CVAE-Residual-GMM module promotes diverse behavioral modality +generation. Through evaluations on multiple public datasets, GSGFormer not only +outperforms leading methods with ample data but also remains competitive when +data is limited. + +
+
+
+
+
+ + ☆ Emotional Speech-driven 3D Body Animation via Disentangled Latent + Diffusion + + +
+ Existing methods for synthesizing 3D human gestures from speech have shown +promising results, but they do not explicitly model the impact of emotions on +the generated gestures. Instead, these methods directly output animations from +speech without control over the expressed emotion. To address this limitation, +we present AMUSE, an emotional speech-driven body animation model based on +latent diffusion. Our observation is that content (i.e., gestures related to +speech rhythm and word utterances), emotion, and personal style are separable. +To account for this, AMUSE maps the driving audio to three disentangled latent +vectors: one for content, one for emotion, and one for personal style. A latent +diffusion model, trained to generate gesture motion sequences, is then +conditioned on these latent vectors. Once trained, AMUSE synthesizes 3D human +gestures directly from speech with control over the expressed emotions and +style by combining the content from the driving speech with the emotion and +style of another speech sequence. Randomly sampling the noise of the diffusion +model further generates variations of the gesture with the same emotional +expressivity. Qualitative, quantitative, and perceptual evaluations demonstrate +that AMUSE outputs realistic gesture sequences. Compared to the state of the +art, the generated gestures are better synchronized with the speech content and +better represent the emotion expressed by the input speech. Our project website +is amuse.is.tue.mpg.de. + +
+
+
+
+
+ + ☆ FitDiff: Robust monocular 3D facial shape and reflectance estimation + using Diffusion Models + + +
+ The remarkable progress in 3D face reconstruction has resulted in high-detail +and photorealistic facial representations. Recently, Diffusion Models have +revolutionized the capabilities of generative methods by achieving far better +performance than GANs. In this work, we present FitDiff, a diffusion-based 3D +facial avatar generative model. This model accurately generates relightable +facial avatars, utilizing an identity embedding extracted from an "in-the-wild" +2D facial image. Our multi-modal diffusion model concurrently outputs facial +reflectance maps (diffuse and specular albedo and normals) and shapes, +showcasing great generalization capabilities. It is solely trained on an +annotated subset of a public facial dataset, paired with 3D reconstructions. We +revisit the typical 3D facial fitting approach by guiding a reverse diffusion +process using perceptual and face recognition losses. Being the first LDM +conditioned on face recognition embeddings, FitDiff reconstructs relightable +human avatars, that can be used as-is in common rendering engines, starting +only from an unconstrained facial image, and achieving state-of-the-art +performance. + +
+
+
+
+
+ + ☆ PhotoMaker: Customizing Realistic Human Photos via Stacked ID Embedding + + +
+ Recent advances in text-to-image generation have made remarkable progress in +synthesizing realistic human photos conditioned on given text prompts. However, +existing personalized generation methods cannot simultaneously satisfy the +requirements of high efficiency, promising identity (ID) fidelity, and flexible +text controllability. In this work, we introduce PhotoMaker, an efficient +personalized text-to-image generation method, which mainly encodes an arbitrary +number of input ID images into a stack ID embedding for preserving ID +information. Such an embedding, serving as a unified ID representation, can not +only encapsulate the characteristics of the same input ID comprehensively, but +also accommodate the characteristics of different IDs for subsequent +integration. This paves the way for more intriguing and practically valuable +applications. Besides, to drive the training of our PhotoMaker, we propose an +ID-oriented data construction pipeline to assemble the training data. Under the +nourishment of the dataset constructed through the proposed pipeline, our +PhotoMaker demonstrates better ID preservation ability than test-time +fine-tuning based methods, yet provides significant speed improvements, +high-quality generation results, strong generalization capabilities, and a wide +range of applications. Our project page is available at +https://photo-maker.github.io/ + +
+
+ comment: Tech report; Project page: https://photo-maker.github.io/ +
+
+
+
+
+ + ☆ DreamVideo: Composing Your Dream Videos with Customized Subject and + Motion + + +
+ Customized generation using diffusion models has made impressive progress in +image generation, but remains unsatisfactory in the challenging video +generation task, as it requires the controllability of both subjects and +motions. To that end, we present DreamVideo, a novel approach to generating +personalized videos from a few static images of the desired subject and a few +videos of target motion. DreamVideo decouples this task into two stages, +subject learning and motion learning, by leveraging a pre-trained video +diffusion model. The subject learning aims to accurately capture the fine +appearance of the subject from provided images, which is achieved by combining +textual inversion and fine-tuning of our carefully designed identity adapter. +In motion learning, we architect a motion adapter and fine-tune it on the given +videos to effectively model the target motion pattern. Combining these two +lightweight and efficient adapters allows for flexible customization of any +subject with any motion. Extensive experimental results demonstrate the +superior performance of our DreamVideo over the state-of-the-art methods for +customized video generation. Our project page is at +https://dreamvideo-t2v.github.io. + +
+
+
+
+
+ + ☆ Approximate Caching for Efficiently Serving Diffusion Models + + +
+ Text-to-image generation using diffusion models has seen explosive popularity +owing to their ability to produce high-quality images adhering to text +prompts. However, production-grade diffusion model serving is a resource-intensive +task that not only requires expensive high-end GPUs but also +incurs considerable latency. In this paper, we introduce a technique called +approximate-caching that can reduce such iterative denoising steps for an image +generation based on a prompt by reusing intermediate noise states created +during a prior image generation for similar prompts. Based on this idea, we +present an end-to-end text-to-image system, Nirvana, that uses +approximate-caching with a novel cache management policy, Least Computationally +Beneficial and Frequently Used (LCBFU), to provide % GPU compute savings, 19.8% +end-to-end latency reduction and 19% dollar savings, on average, on two real +production workloads. We further present an extensive characterization of real +production text-to-image prompts from the perspective of caching, popularity, +and reuse of intermediate states in a large production environment. + +
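+ A hypothetical sketch of approximate caching for diffusion serving (the class, similarity threshold, and eviction score below are illustrative assumptions, not the Nirvana/LCBFU implementation): cached intermediate noise states are retrieved by prompt-embedding similarity and evicted by a score combining compute saved with access frequency.
+
+    import numpy as np
+
+    class ApproxCache:
+        def __init__(self, capacity=1000, sim_threshold=0.9):
+            self.capacity, self.sim_threshold = capacity, sim_threshold
+            self.entries = []   # each entry: dict(emb, state, steps_saved, hits)
+
+        def lookup(self, prompt_emb):
+            # return the cached noise state of the most similar prompt, if any
+            best, best_sim = None, self.sim_threshold
+            for e in self.entries:
+                sim = float(prompt_emb @ e["emb"])   # embeddings assumed unit-norm
+                if sim > best_sim:
+                    best, best_sim = e, sim
+            if best is not None:
+                best["hits"] += 1
+                return best["state"], best["steps_saved"]
+            return None, 0
+
+        def insert(self, prompt_emb, state, steps_saved):
+            if len(self.entries) >= self.capacity:
+                # evict the entry with the lowest (compute benefit x frequency) score
+                self.entries.sort(key=lambda e: e["steps_saved"] * (1 + e["hits"]))
+                self.entries.pop(0)
+            self.entries.append(dict(emb=prompt_emb, state=state,
+                                     steps_saved=steps_saved, hits=0))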
+
+ comment: Accepted at NSDI'24 +
+
+
+
+
+ + ☆ Cascade-Zero123: One Image to Highly Consistent 3D with Self-Prompted + Nearby Views + + +
+ Synthesizing multi-view 3D from one single image is a significant and +challenging task. For this goal, Zero-1-to-3 methods aim to extend a 2D latent +diffusion model to the 3D scope. These approaches generate the target-view +image with a single-view source image and the camera pose as condition +information. However, the one-to-one manner adopted in Zero-1-to-3 makes it +challenging to build geometric and visual consistency across views, +especially for complex objects. We propose a cascade generation framework +constructed with two Zero-1-to-3 models, named Cascade-Zero123, to tackle this +issue, which progressively extracts 3D information from the source image. +Specifically, a self-prompting mechanism is designed to generate several nearby +views at first. These views are then fed into the second-stage model along with +the source image as generation conditions. With self-prompted multiple views as +the supplementary information, our Cascade-Zero123 generates more highly +consistent novel-view images than Zero-1-to-3. The improvement is significant for +various complex and challenging scenes, involving insects, humans, transparent +objects, and stacked multiple objects, etc. The project page is at +https://cascadezero123.github.io/. + +
+
+ comment: Project page: https://cascadezero123.github.io/ +
+
+
+
+
+ + ☆ Smooth Diffusion: Crafting Smooth Latent Spaces in Diffusion Models + + +
+ Recently, diffusion models have made remarkable progress in text-to-image +(T2I) generation, synthesizing images with high fidelity and diverse contents. +Despite this advancement, latent space smoothness within diffusion models +remains largely unexplored. Smooth latent spaces ensure that a perturbation on +an input latent corresponds to a steady change in the output image. This +property proves beneficial in downstream tasks, including image interpolation, +inversion, and editing. In this work, we expose the non-smoothness of diffusion +latent spaces by observing noticeable visual fluctuations resulting from minor +latent variations. To tackle this issue, we propose Smooth Diffusion, a new +category of diffusion models that can be simultaneously high-performing and +smooth. Specifically, we introduce Step-wise Variation Regularization to +enforce that the ratio between the variation of an arbitrary input latent and +that of the output image remains constant at any diffusion training step. In +addition, we devise an interpolation standard deviation (ISTD) metric to +effectively assess the latent space smoothness of a diffusion model. Extensive +quantitative and qualitative experiments demonstrate that Smooth Diffusion +stands out as a more desirable solution not only in T2I generation but also +across various downstream tasks. Smooth Diffusion is implemented as a +plug-and-play Smooth-LoRA to work with various community models. Code is +available at https://github.com/SHI-Labs/Smooth-Diffusion. + +
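+ In spirit, such a regularizer can be sketched as below, assuming a placeholder model(latent, t) and a unit target ratio (an illustration, not the paper's exact formulation): perturb the latent, measure the induced output variation, and penalize deviation of the variation ratio from a constant.
+
+    import torch
+
+    def variation_regularizer(model, latent, t, target_ratio=1.0, eps=1e-2):
+        # model(latent, t) is a placeholder for the denoiser output at step t
+        delta = eps * torch.randn_like(latent)
+        out, out_pert = model(latent, t), model(latent + delta, t)
+        ratio = (out_pert - out).flatten(1).norm(dim=1) / delta.flatten(1).norm(dim=1)
+        return ((ratio - target_ratio) ** 2).mean()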
+
+ comment: GitHub: https://github.com/SHI-Labs/Smooth-Diffusion +
+
+
+
+
+ + ☆ OT-Attack: Enhancing Adversarial Transferability of Vision-Language + Models via Optimal Transport Optimization + + +
+ Vision-language pre-training (VLP) models demonstrate impressive abilities in +processing both images and text. However, they are vulnerable to multi-modal +adversarial examples (AEs). Investigating the generation of +high-transferability adversarial examples is crucial for uncovering VLP models' +vulnerabilities in practical scenarios. Recent works have indicated that +leveraging data augmentation and image-text modal interactions can enhance the +transferability of adversarial examples for VLP models significantly. However, +they do not consider the optimal alignment problem between data-augmented +image-text pairs. This oversight leads to adversarial examples that are overly +tailored to the source model, thus limiting improvements in transferability. In +our research, we first explore the interplay between image sets produced +through data augmentation and their corresponding text sets. We find that +augmented image samples can align optimally with certain texts while exhibiting +less relevance to others. Motivated by this, we propose an Optimal +Transport-based Adversarial Attack, dubbed OT-Attack. The proposed method +formulates the features of image and text sets as two distinct distributions +and employs optimal transport theory to determine the most efficient mapping +between them. This optimal mapping informs our generation of adversarial +examples to effectively counteract the overfitting issues. Extensive +experiments across various network architectures and datasets in image-text +matching tasks reveal that our OT-Attack outperforms existing state-of-the-art +methods in terms of adversarial transferability. + +
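+ The optimal-transport step can be illustrated with a few Sinkhorn iterations over a cosine-distance cost between augmented-image features and text features (a generic sketch under uniform marginals, not the authors' code); the resulting plan can then weight the adversarial loss.
+
+    import torch
+    import torch.nn.functional as F
+
+    def sinkhorn(cost, n_iters=50, reg=0.05):
+        # cost: (N, M); returns a transport plan with uniform row/column marginals
+        K = torch.exp(-cost / reg)
+        u = torch.ones(cost.size(0)) / cost.size(0)
+        v = torch.ones(cost.size(1)) / cost.size(1)
+        a, b = u.clone(), v.clone()
+        for _ in range(n_iters):
+            a = u / (K @ b)
+            b = v / (K.t() @ a)
+        return torch.diag(a) @ K @ torch.diag(b)
+
+    img_feats = F.normalize(torch.randn(8, 512), dim=1)   # augmented image features
+    txt_feats = F.normalize(torch.randn(5, 512), dim=1)   # text features
+    plan = sinkhorn(1.0 - img_feats @ txt_feats.t())      # soft image-text matching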
+
+
+
+
+ + ☆ Intelligent Anomaly Detection for Lane Rendering Using Transformer with + Self-Supervised Pre-Training and Customized Fine-Tuning + + +
+ The burgeoning navigation services using digital maps provide great +convenience to drivers. Nevertheless, the presence of anomalies in lane +rendering map images occasionally introduces potential hazards, as such +anomalies can be misleading to human drivers and consequently contribute to +unsafe driving conditions. In response to this concern and to accurately and +effectively detect the anomalies, this paper transforms lane rendering image +anomaly detection into a classification problem and proposes a four-phase +pipeline consisting of data pre-processing, self-supervised pre-training with +the masked image modeling (MiM) method, customized fine-tuning using +cross-entropy based loss with label smoothing, and post-processing to tackle it +leveraging state-of-the-art deep learning techniques, especially those +involving Transformer models. Various experiments verify the effectiveness of +the proposed pipeline. Results indicate that the proposed pipeline exhibits +superior performance in lane rendering image anomaly detection, and notably, +the self-supervised pre-training with MiM can greatly enhance the detection +accuracy while significantly reducing the total training time. For instance, +employing the Swin Transformer with Uniform Masking as self-supervised +pretraining (Swin-Trans-UM) yielded a heightened accuracy at 94.77% and an +improved Area Under The Curve (AUC) score of 0.9743 compared with the pure Swin +Transformer without pre-training (Swin-Trans) with an accuracy of 94.01% and an +AUC of 0.9498. The fine-tuning epochs were dramatically reduced to 41 from the +original 280. In conclusion, the proposed pipeline, with its incorporation of +self-supervised pre-training using MiM and other advanced deep learning +techniques, emerges as a robust solution for enhancing the accuracy and +efficiency of lane rendering image anomaly detection in digital navigation +systems. + +
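+ The fine-tuning loss named above is the standard cross-entropy with label smoothing; a minimal sketch (the smoothing value, batch, and two-class head are placeholders):
+
+    import torch
+    import torch.nn as nn
+
+    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
+    logits = torch.randn(16, 2)              # outputs of a classifier head (e.g. Swin)
+    labels = torch.randint(0, 2, (16,))      # 0 = normal rendering, 1 = anomaly
+    loss = criterion(logits, labels)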
+
+ comment: 20 pages, 6 figures, accepted by the 103rd Transportation Research + Board (TRB) Annual Meeting, under review by Transportation Research Record: + Journal of the Transportation Research Board +
+
+
+
+
+ + ☆ PhysHOI: Physics-Based Imitation of Dynamic Human-Object Interaction + + +
+ Humans interact with objects all the time. Enabling a humanoid to learn +human-object interaction (HOI) is a key step for future smart animation and +intelligent robotics systems. However, recent progress in physics-based HOI +requires carefully designed task-specific rewards, making the system unscalable +and labor-intensive. This work focuses on dynamic HOI imitation: teaching +humanoid dynamic interaction skills through imitating kinematic HOI +demonstrations. It is quite challenging because of the complexity of the +interaction between body parts and objects and the lack of dynamic HOI data. To +handle the above issues, we present PhysHOI, the first physics-based whole-body +HOI imitation approach without task-specific reward designs. In addition to the +kinematic HOI representations of humans and objects, we introduce the contact +graph to model the contact relations between body parts and objects explicitly. +A contact graph reward is also designed, which proved to be critical for +precise HOI imitation. Based on the key designs, PhysHOI can imitate diverse +HOI tasks simply yet effectively without prior knowledge. To make up for the +lack of dynamic HOI scenarios in this area, we introduce the BallPlay dataset +that contains eight whole-body basketball skills. We validate PhysHOI on +diverse HOI tasks, including whole-body grasping and basketball skills. + +
+
+
+
+
+ + ☆ AniRes2D: Anisotropic Residual-enhanced Diffusion for 2D MR + Super-Resolution SP + + +
+ Anisotropic low-resolution (LR) magnetic resonance (MR) images are fast to +obtain but hinder automated processing. We propose to use denoising diffusion +probabilistic models (DDPMs) to super-resolve these 2D-acquired LR MR slices. +This paper introduces AniRes2D, a novel approach combining DDPM with a residual +prediction for 2D super-resolution (SR). Results demonstrate that AniRes2D +outperforms several other DDPM-based models in quantitative metrics, visual +quality, and out-of-domain evaluation. We use a trained AniRes2D to +super-resolve 3D volumes slice by slice, where comparative quantitative results +and reduced skull aliasing are achieved compared to a recent state-of-the-art +self-supervised 3D super-resolution method. Furthermore, we explored the use of +noise conditioning augmentation (NCA) as an alternative augmentation technique +for DDPM-based SR models, but it was found to reduce performance. Our findings +contribute valuable insights to the application of DDPMs for SR of anisotropic +MR images. + +
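+ A small sketch of the residual-prediction target, assuming bicubic upsampling as the base (illustration only, not the AniRes2D code): the network learns the difference between the high-resolution slice and the upsampled low-resolution slice, which is added back at inference.
+
+    import torch
+    import torch.nn.functional as F
+
+    def residual_target(lr_slice, hr_slice, scale=4):
+        # lr_slice: (B, 1, h, w) MR slice; hr_slice: (B, 1, h*scale, w*scale)
+        base = F.interpolate(lr_slice, scale_factor=scale, mode="bicubic",
+                             align_corners=False)
+        return hr_slice - base, base   # residual to regress, base to add back
+
+    # at inference: sr_slice = base + predicted_residual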
+
+ comment: Accepted for presentation at SPIE Medical Imaging 2024, Clinical and + Biomedical Imaging +
+
+
+
+
+ + ☆ SingingHead: A Large-scale 4D Dataset for Singing Head Animation + + +
+ Singing, a common facial movement second only to talking, can be regarded +as a universal language across ethnicities and cultures and plays an important +role in emotional communication, art, and entertainment. However, it is often +overlooked in the field of audio-driven facial animation due to the lack of +singing head datasets and the domain gap between singing and talking in rhythm +and amplitude. To this end, we collect a high-quality large-scale singing head +dataset, SingingHead, which consists of more than 27 hours of synchronized +singing video, 3D facial motion, singing audio, and background music from 76 +individuals and 8 types of music. Along with the SingingHead dataset, we argue +that 3D and 2D facial animation tasks can be solved together, and propose a +unified singing facial animation framework named UniSinger to achieve both +singing audio-driven 3D singing head animation and 2D singing portrait video +synthesis. Extensive comparative experiments with both SOTA 3D facial animation +and 2D portrait animation methods demonstrate the necessity of singing-specific +datasets in singing head animation tasks and the promising performance of our +unified facial animation framework. + +
+
+
+
+
+ + ☆ DemoCaricature: Democratising Caricature Generation with a Rough Sketch + + +
+ In this paper, we democratise caricature generation, empowering individuals +to effortlessly craft personalised caricatures with just a photo and a +conceptual sketch. Our objective is to strike a delicate balance between +abstraction and identity, while preserving the creativity and subjectivity +inherent in a sketch. To achieve this, we present Explicit Rank-1 Model Editing +alongside single-image personalisation, selectively applying nuanced edits to +cross-attention layers for a seamless merge of identity and style. +Additionally, we propose Random Mask Reconstruction to enhance robustness, +directing the model to focus on distinctive identity and style features. +Crucially, our aim is not to replace artists but to eliminate accessibility +barriers, allowing enthusiasts to engage in the artistry. + +
+
+
+
+
+ + ☆ Enhancing Medical Task Performance in GPT-4V: A Comprehensive Study on + Prompt Engineering Strategies + + +
+ OpenAI's latest large vision-language model (LVLM), GPT-4V(ision), has piqued +considerable interest for its potential in medical applications. Despite its +promise, recent studies and internal reviews highlight its underperformance in +specialized medical tasks. This paper explores the boundary of GPT-4V's +capabilities in medicine, particularly in processing complex imaging data from +endoscopies, CT scans, and MRIs etc. Leveraging open-source datasets, we +assessed its foundational competencies, identifying substantial areas for +enhancement. Our research emphasizes prompt engineering, an often-underutilized +strategy for improving AI responsiveness. Through iterative testing, we refined +the model's prompts, significantly improving its interpretative accuracy and +relevance in medical imaging. From our comprehensive evaluations, we distilled +10 effective prompt engineering techniques, each fortifying GPT-4V's medical +acumen. These methodical enhancements facilitate more reliable, precise, and +clinically valuable insights from GPT-4V, advancing its operability in critical +healthcare environments. Our findings are pivotal for those employing AI in +medicine, providing clear, actionable guidance on harnessing GPT-4V's full +diagnostic potential. + +
+
+
+
+
+ + ☆ Multi-View Unsupervised Image Generation with Cross Attention Guidance + + +
+ The growing interest in novel view synthesis, driven by Neural Radiance Field +(NeRF) models, is hindered by scalability issues due to their reliance on +precisely annotated multi-view images. Recent models address this by +fine-tuning large text2image diffusion models on synthetic multi-view data. +Despite robust zero-shot generalization, they may need post-processing and can +face quality issues due to the synthetic-real domain gap. This paper introduces +a novel pipeline for unsupervised training of a pose-conditioned diffusion +model on single-category datasets. With the help of pretrained self-supervised +Vision Transformers (DINOv2), we identify object poses by clustering the +dataset through comparing visibility and locations of specific object parts. +The pose-conditioned diffusion model, trained on pose labels and equipped with +cross-frame attention at inference time, ensures cross-view consistency, which is +further aided by our novel hard-attention guidance. Our model, MIRAGE, +surpasses prior work in novel view synthesis on real images. Furthermore, +MIRAGE is robust to diverse textures and geometries, as demonstrated with our +experiments on synthetic images generated with pretrained Stable Diffusion. + +
+
+
+
+
+ + ☆ Towards a Perceptual Evaluation Framework for Lighting Estimation + + +
+ Progress in lighting estimation is tracked by computing existing image +quality assessment (IQA) metrics on images from standard datasets. While this +may appear to be a reasonable approach, we demonstrate that doing so does not +correlate to human preference when the estimated lighting is used to relight a +virtual scene into a real photograph. To study this, we design a controlled +psychophysical experiment where human observers must choose their preference +amongst rendered scenes lit using a set of lighting estimation algorithms +selected from the recent literature, and use it to analyse how these algorithms +perform according to human perception. Then, we demonstrate that none of the +most popular IQA metrics from the literature, taken individually, correctly +represent human perception. Finally, we show that by learning a combination of +existing IQA metrics, we can more accurately represent human preference. This +provides a new perceptual framework to help evaluate future lighting estimation +algorithms. + +
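The "learning a combination of existing IQA metrics" step can be pictured with a small, self-contained sketch. This is not the authors' code; the metric scores, labels, and the pairwise logistic model below are illustrative assumptions only.

```python
# Hypothetical sketch: learn weights over existing IQA metrics so that a weighted
# sum of metric scores predicts which of two relit renders human observers prefer.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
iqa_a = rng.random((200, 5))   # (n_pairs, n_metrics) scores for render A in each pair
iqa_b = rng.random((200, 5))   # scores for render B
prefer_a = (iqa_a.mean(axis=1) > iqa_b.mean(axis=1)).astype(int)  # placeholder labels

# Pairwise (Bradley-Terry style) model: preference depends on the score difference.
clf = LogisticRegression(fit_intercept=False).fit(iqa_a - iqa_b, prefer_a)
weights = clf.coef_.ravel()        # learned per-metric weights
combined_score = iqa_a @ weights   # perceptual score for new renders
```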
+
+
+
+
+ + ☆ A Multi-scale Information Integration Framework for Infrared and Visible + Image Fusion + + +
+ Infrared and visible image fusion aims at generating a fused image containing the intensity and detail information of the source images, and the key issue is effectively measuring and integrating the complementary information of multi-modality images from the same scene. Existing methods mostly adopt a simple weight in the loss function to decide the information retention of each modality rather than adaptively measuring complementary information for different image pairs. In this study, we propose a multi-scale dual attention (MDA) framework for infrared and visible image fusion, which is designed to measure and integrate complementary information in both the network structure and the loss function at the image and patch levels. In our method, the residual downsample block first decomposes the source images into three scales. Then, the dual attention fusion block integrates complementary information and generates spatial and channel attention maps at each scale for feature fusion. Finally, the output image is reconstructed by the residual reconstruction block. The loss function consists of image-level, feature-level, and patch-level parts, of which the image-level and patch-level terms are weighted by the complementary information measurement. In addition, a style loss is added to constrain the pixel intensity distribution between the output and the infrared image. Our fusion results are robust and informative across different scenarios. Qualitative and quantitative results on two datasets illustrate that our method is able to preserve both thermal radiation and detailed information from the two modalities and achieves results comparable with other state-of-the-art methods. Ablation experiments show the effectiveness of our information integration architecture and of the adaptive measurement of complementary information retention in the loss function.
</p>
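As a rough illustration of a multi-level fusion loss of this kind (not the authors' implementation; the patch size, the given weights, and the style term are all assumptions), consider:

```python
# Illustrative sketch: combine image-, feature- and patch-level terms, where the
# image/patch weights come from a complementary-information measure (assumed given),
# plus a simple style term toward the infrared image.
import torch
import torch.nn.functional as F

def fusion_loss(fused, ir, vis, feat_fused, feat_ref, w_ir, w_vis, lambda_style=0.1):
    image_term = w_ir * F.l1_loss(fused, ir) + w_vis * F.l1_loss(fused, vis)
    feature_term = F.mse_loss(feat_fused, feat_ref)
    # patch level: non-overlapping patches, reusing the same modality weights
    p = 16
    pf, pi, pv = (x.unfold(2, p, p).unfold(3, p, p) for x in (fused, ir, vis))
    patch_term = w_ir * (pf - pi).abs().mean() + w_vis * (pf - pv).abs().mean()
    # crude "style" term: match channel-wise mean/std of the infrared image
    style_term = F.mse_loss(fused.mean((2, 3)), ir.mean((2, 3))) + \
                 F.mse_loss(fused.std((2, 3)), ir.std((2, 3)))
    return image_term + feature_term + patch_term + lambda_style * style_term
```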
+
+
+
+
+ + ☆ iDesigner: A High-Resolution and Complex-Prompt Following Text-to-Image + Diffusion Model for Interior Design + + +
+ With the open-sourcing of text-to-image models (T2I) such as stable diffusion (SD) and stable diffusion XL (SD-XL), there is an influx of models fine-tuned in specific domains based on the open-source SD model, such as in anime, character portraits, etc. However, there are few specialized models in certain domains, such as interior design, which is attributed to the complex textual descriptions and detailed visual elements inherent in design, alongside the necessity for adaptable resolution. Therefore, text-to-image models for interior design are required to have outstanding prompt-following capabilities, as well as iterative collaboration with design professionals to achieve the desired outcome. In this paper, we collect and optimize text-image data in the design field and continue training in both English and Chinese on the basis of the open-source CLIP model. We also propose a fine-tuning strategy with curriculum learning and reinforcement learning from CLIP feedback to enhance the prompt-following capabilities of our approach so as to improve the quality of image generation. The experimental results on the collected dataset demonstrate the effectiveness of the proposed approach, which achieves impressive results and outperforms strong baselines.
</p>
+
+
+
+
+ + ☆ Towards Knowledge-driven Autonomous Driving + + +
+ This paper explores the emerging knowledge-driven autonomous driving +technologies. Our investigation highlights the limitations of current +autonomous driving systems, in particular their sensitivity to data bias, +difficulty in handling long-tail scenarios, and lack of interpretability. +Conversely, knowledge-driven methods with the abilities of cognition, +generalization and life-long learning emerge as a promising way to overcome +these challenges. This paper delves into the essence of knowledge-driven +autonomous driving and examines its core components: dataset \& benchmark, +environment, and driver agent. By leveraging large language models, world +models, neural rendering, and other advanced artificial intelligence +techniques, these components collectively contribute to a more holistic, +adaptive, and intelligent autonomous driving system. The paper systematically +organizes and reviews previous research efforts in this area, and provides +insights and guidance for future research and practical applications of +autonomous driving. We will continually share the latest updates on +cutting-edge developments in knowledge-driven autonomous driving along with the +relevant valuable open-source resources at: +\url{https://github.com/PJLab-ADG/awesome-knowledge-driven-AD}. + +
+
+
+
+
+ + ☆ GPT4SGG: Synthesizing Scene Graphs from Holistic and Region-specific + Narratives + + +
+ Learning scene graphs from natural language descriptions has proven to be a cheap and promising scheme for Scene Graph Generation (SGG). However, such unstructured caption data and its processing hinder the learning of an accurate and complete scene graph. This dilemma can be summarized as three points. First, traditional language parsers often fail to extract meaningful relationship triplets from caption data. Second, grounding unlocalized objects in parsed triplets will meet ambiguity in visual-language alignment. Last, caption data typically are sparse and exhibit bias toward partial observations of image content. These three issues make it hard for the model to generate comprehensive and accurate scene graphs. To fill this gap, we propose a simple yet effective framework, GPT4SGG, to synthesize scene graphs from holistic and region-specific narratives. The framework discards the traditional language parser and localizes objects before obtaining relationship triplets. To obtain relationship triplets, holistic and dense region-specific narratives are generated from the image. With such a textual representation of image data and a task-specific prompt, an LLM, particularly GPT-4, directly synthesizes a scene graph as "pseudo labels". Experimental results show that GPT4SGG significantly improves the performance of SGG models trained on image-caption data. We believe this pioneering work can motivate further research into mining the visual reasoning capabilities of LLMs.
</p>
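The general recipe can be sketched as packing the holistic and region-specific narratives into one prompt and asking an LLM for structured pseudo labels. The prompt wording, JSON schema, and the `call_your_llm` placeholder below are assumptions, not the paper's actual prompt.

```python
# Hypothetical sketch of synthesizing scene-graph pseudo labels with an LLM.
import json

def build_sgg_prompt(holistic_caption, region_captions):
    # region_captions: list of (box, caption) with box = [x1, y1, x2, y2]
    regions = "\n".join(f"- box {box}: {cap}" for box, cap in region_captions)
    return (
        "You are given a holistic description of an image and dense "
        "region-specific descriptions with bounding boxes.\n"
        f"Holistic: {holistic_caption}\nRegions:\n{regions}\n"
        "Return a scene graph as JSON: "
        '{"objects": [{"id", "name", "box"}], '
        '"relations": [{"subject_id", "predicate", "object_id"}]}'
    )

prompt = build_sgg_prompt(
    "A man rides a horse on a beach.",
    [([10, 40, 120, 200], "a man wearing a hat"),
     ([60, 90, 300, 260], "a brown horse walking on sand")],
)
# response = call_your_llm(prompt)    # e.g. GPT-4; the API call is omitted here
# scene_graph = json.loads(response)  # used as pseudo labels for SGG training
```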
+
+
+
+
+ + ☆ Prompt Highlighter: Interactive Control for Multi-Modal LLMs + + +
+ This study targets a critical aspect of multi-modal LLMs' (LLMs&VLMs) +inference: explicit controllable text generation. Multi-modal LLMs empower +multi-modality understanding with the capability of semantic generation yet +bring less explainability and heavier reliance on prompt contents due to their +autoregressive generative nature. While manipulating prompt formats could +improve outputs, designing specific and precise prompts per task can be +challenging and ineffective. To tackle this issue, we introduce a novel +inference method, Prompt Highlighter, which enables users to highlight specific +prompt spans to interactively control the focus during generation. Motivated by +the classifier-free diffusion guidance, we form regular and unconditional +context pairs based on highlighted tokens, demonstrating that the +autoregressive generation in models can be guided in a classifier-free way. +Notably, we find that, during inference, guiding the models with highlighted +tokens through the attention weights leads to more desired outputs. Our +approach is compatible with current LLMs and VLMs, achieving impressive +customized generation results without training. Experiments confirm its +effectiveness in focusing on input contexts and generating reliable content. +Without tuning on LLaVA-v1.5, our method secured 69.5 in the MMBench test and +1552.5 in MME-perception. The code is available at: +https://github.com/dvlab-research/Prompt-Highlighter/ + +
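The highlighted-context guidance can be pictured with a minimal sketch in the spirit of classifier-free guidance. The Hugging-Face-style `model(...).logits` interface, the masking scheme, and the guidance scale are assumptions for illustration, not the released implementation.

```python
# Minimal sketch: run the model on the full prompt and on a version with the
# highlighted span masked, then push the next-token logits toward the highlighted context.
import torch

@torch.no_grad()
def guided_next_logits(model, input_ids, masked_input_ids, guidance_scale=1.5):
    # input_ids: prompt with the user-highlighted span intact
    # masked_input_ids: same prompt with the highlighted tokens replaced/attenuated
    logits_cond = model(input_ids).logits[:, -1, :]
    logits_uncond = model(masked_input_ids).logits[:, -1, :]
    # classifier-free style combination of the two contexts
    return logits_uncond + guidance_scale * (logits_cond - logits_uncond)
```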
+
+ comment: 19 pages. Project Page: + https://julianjuaner.github.io/projects/PromptHighlighter +
+
+
+
+
+ + ☆ Cross-codex Learning for Reliable Scribe Identification in Medieval + Manuscripts + + +
+ Historic scribe identification is a substantial task for obtaining information about the past. Uniform script styles, such as the Carolingian minuscule, make it a difficult task for classification to focus on meaningful features. Therefore, we demonstrate in this paper the importance of cross-codex training data for CNN based text-independent off-line scribe identification, to overcome codex dependent overfitting. We report three main findings: First, we found that preprocessing with masked grayscale images instead of RGB images clearly increased the F1-score of the classification results. Second, we trained different neural networks on our complex data, validating time and accuracy differences in order to define the most reliable network architecture. With AlexNet, the network with the best trade-off between F1-score and time, we achieved for individual classes F1-scores of up to 0.96 at line level and up to 1.0 at page level in classification. Third, we could replicate the finding that the CNN output can be further improved by implementing a reject option, giving more stable results. We present the results on our large scale open source dataset -- the Codex Claustroneoburgensis database (CCl-DB) -- containing a significant number of writings from different scribes in several codices. We demonstrate for the first time on a dataset with such a variety of codices that paleographic decisions can be reproduced automatically and precisely with CNNs. This gives manifold new and fast possibilities for paleographers to gain insights into unlabeled material, but also to develop further hypotheses.
</p>
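A reject option of the kind replicated here typically thresholds the classifier's confidence; a minimal sketch (the threshold value is an arbitrary assumption):

```python
# Keep a scribe prediction only when the softmax confidence clears a threshold,
# otherwise mark the line as "unknown".
import numpy as np

def predict_with_reject(probs, threshold=0.9, reject_label=-1):
    # probs: (n_samples, n_scribes) softmax outputs of the CNN
    preds = probs.argmax(axis=1)
    confident = probs.max(axis=1) >= threshold
    return np.where(confident, preds, reject_label)
```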
+
+
+
+
+ + ☆ GPT-4V with Emotion: A Zero-shot Benchmark for Multimodal Emotion + Understanding + + +
+ Recently, GPT-4 with Vision (GPT-4V) has shown remarkable performance across +various multimodal tasks. However, its efficacy in emotion recognition remains +a question. This paper quantitatively evaluates GPT-4V's capabilities in +multimodal emotion understanding, encompassing tasks such as facial emotion +recognition, visual sentiment analysis, micro-expression recognition, dynamic +facial emotion recognition, and multimodal emotion recognition. Our experiments +show that GPT-4V exhibits impressive multimodal and temporal understanding +capabilities, even surpassing supervised systems in some tasks. Despite these +achievements, GPT-4V is currently tailored for general domains. It performs +poorly in micro-expression recognition that requires specialized expertise. The +main purpose of this paper is to present quantitative results of GPT-4V on +emotion understanding and establish a zero-shot benchmark for future research. +Code and evaluation results are available at: +https://github.com/zeroQiaoba/gpt4v-emotion. + +
+
+
+
+
+ + ☆ Activity Grammars for Temporal Action Segmentation NeurIPS 2023 + + +
+ Sequence prediction on temporal data requires the ability to understand +compositional structures of multi-level semantics beyond individual and +contextual properties. The task of temporal action segmentation, which aims at +translating an untrimmed activity video into a sequence of action segments, +remains challenging for this reason. This paper addresses the problem by +introducing an effective activity grammar to guide neural predictions for +temporal action segmentation. We propose a novel grammar induction algorithm +that extracts a powerful context-free grammar from action sequence data. We +also develop an efficient generalized parser that transforms frame-level +probability distributions into a reliable sequence of actions according to the +induced grammar with recursive rules. Our approach can be combined with any +neural network for temporal action segmentation to enhance the sequence +prediction and discover its compositional structure. Experimental results +demonstrate that our method significantly improves temporal action segmentation +in terms of both performance and interpretability on two standard benchmarks, +Breakfast and 50 Salads. + +
+
+ comment: Accepted to NeurIPS 2023 +
+
+
+
+
+ + ☆ Stronger, Fewer, & Superior: Harnessing Vision Foundation Models for + Domain Generalized Semantic Segmentation + + +
+ In this paper, we first assess and harness various Vision Foundation Models (VFMs) in the context of Domain Generalized Semantic Segmentation (DGSS). Driven by the motivation of Leveraging Stronger pre-trained models and Fewer trainable parameters for Superior generalizability, we introduce a robust fine-tuning approach, namely Rein, to parameter-efficiently harness VFMs for DGSS. Built upon a set of trainable tokens, each linked to distinct instances, Rein precisely refines and forwards the feature maps from each layer to the next layer within the backbone. This process produces diverse refinements for different categories within a single image. With fewer trainable parameters, Rein efficiently fine-tunes VFMs for DGSS tasks, surprisingly surpassing full parameter fine-tuning. Extensive experiments across various settings demonstrate that Rein significantly outperforms state-of-the-art methods. Remarkably, with just an extra 1% of trainable parameters within the frozen backbone, Rein achieves a mIoU of 68.1% on Cityscapes, without accessing any real urban-scene datasets.
</p>
+
+
+
+
+ + ☆ TeMO: Towards Text-Driven 3D Stylization for Multi-Object Meshes + + +
+ Recent progress in the text-driven 3D stylization of a single object has been considerably promoted by CLIP-based methods. However, the stylization of multi-object 3D scenes is still impeded in that the image-text pairs used for pre-training CLIP mostly contain only a single object. Meanwhile, the local details of multiple objects may be susceptible to omission due to the existing supervision manner primarily relying on coarse-grained contrast of image-text pairs. To overcome these challenges, we present a novel framework, dubbed TeMO, to parse multi-object 3D scenes and edit their styles under contrast supervision at multiple levels. We first propose a Decoupled Graph Attention (DGA) module to distinguishably reinforce the features of 3D surface points. Particularly, a cross-modal graph is constructed to accurately align the object points and the noun phrases decoupled from the 3D mesh and textual description. Then, we develop a Cross-Grained Contrast (CGC) supervision system, where a fine-grained loss between the words in the textual description and the randomly rendered images is constructed to complement the coarse-grained loss. Extensive experiments show that our method can synthesize high-quality stylized content and outperform the existing methods over a wide range of multi-object 3D meshes. Our code and results will be made publicly available.
</p>
+
+
+
+
+ + ☆ Detecting and Restoring Non-Standard Hands in Stable Diffusion Generated + Images + + +
+ We introduce a pipeline to address anatomical inaccuracies in Stable +Diffusion generated hand images. The initial step involves constructing a +specialized dataset, focusing on hand anomalies, to train our models +effectively. A finetuned detection model is pivotal for precise identification +of these anomalies, ensuring targeted correction. Body pose estimation aids in +understanding hand orientation and positioning, crucial for accurate anomaly +correction. The integration of ControlNet and InstructPix2Pix facilitates +sophisticated inpainting and pixel-level transformation, respectively. This +dual approach allows for high-fidelity image adjustments. This comprehensive +approach ensures the generation of images with anatomically accurate hands, +closely resembling real-world appearances. Our experimental results demonstrate +the pipeline's efficacy in enhancing hand image realism in Stable Diffusion +outputs. We provide an online demo at https://fixhand.yiqun.io + +
+
+
+
+
+ + ☆ Fine-tune vision foundation model for crack segmentation in civil + infrastructures + + +
+ Large-scale foundation models have become the mainstream method in the field +of deep learning, while in civil engineering, the scale of AI models is +strictly limited. In this work, vision foundation model is introduced for crack +segmentation. Two Parameter-efficient fine-tuning methods, adapter and low-rank +adaptation, are adopted to fine-tune the foundation model in the field of +semantic segmentation: Segment Anything Model (SAM). The fine-tuned model +CrackSAM is much larger than all the existing crack segmentation models, but +shows excellent performance. To test the zero-shot performance of the proposed +method, two unique datasets related to road and exterior wall cracks are +collected, annotated and open-sourced, in total 810 images. Comparative +experiments are conducted with twelve mature semantic segmentation models. On +datasets with artificial noise and previously unseen datasets, the performance +of CrackSAM far exceeds that of all state-of-the-art models. CrackSAM exhibits +remarkable superiority, particularly in challenging conditions such as dim +lighting, shadows, road markings, construction joints, and other interference +factors. Such cross-scenario results demonstrate the outstanding zero-shot +capability of foundation models, and provide new ideas for the development of +vision models in civil engineering. + +
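Of the two parameter-efficient methods mentioned, low-rank adaptation can be sketched generically as a frozen linear layer plus a trainable low-rank update. The rank, scaling, and initialization below are common defaults, not values from the paper.

```python
# Generic LoRA sketch: W_eff = W_frozen + (alpha / r) * B @ A, with only A and B trained.
import torch
import torch.nn as nn

class LoRALinear(nn.Module):
    def __init__(self, base: nn.Linear, r=8, alpha=16):
        super().__init__()
        self.base = base
        for p in self.base.parameters():
            p.requires_grad = False                 # backbone weights stay frozen
        self.A = nn.Parameter(torch.randn(r, base.in_features) * 0.01)
        self.B = nn.Parameter(torch.zeros(base.out_features, r))
        self.scale = alpha / r

    def forward(self, x):
        return self.base(x) + self.scale * (x @ self.A.T @ self.B.T)
```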
+
+
+
+
+ + ☆ Adventures of Trustworthy Vision-Language Models: A Survey AAAI 2024 + + +
+ Recently, transformers have become incredibly popular in computer vision and +vision-language tasks. This notable rise in their usage can be primarily +attributed to the capabilities offered by attention mechanisms and the +outstanding ability of transformers to adapt and apply themselves to a variety +of tasks and domains. Their versatility and state-of-the-art performance have +established them as indispensable tools for a wide array of applications. +However, in the constantly changing landscape of machine learning, the +assurance of the trustworthiness of transformers holds utmost importance. This +paper conducts a thorough examination of vision-language transformers, +employing three fundamental principles of responsible AI: Bias, Robustness, and +Interpretability. The primary objective of this paper is to delve into the +intricacies and complexities associated with the practical use of transformers, +with the overarching goal of advancing our comprehension of how to enhance +their reliability and accountability. + +
+
+ comment: Accepted in AAAI 2024 +
+
+
+
+
+ + ☆ TLCE: Transfer-Learning Based Classifier Ensembles for Few-Shot + Class-Incremental Learning + + +
+ Few-shot class-incremental learning (FSCIL) struggles to incrementally +recognize novel classes from few examples without catastrophic forgetting of +old classes or overfitting to new classes. We propose TLCE, which ensembles +multiple pre-trained models to improve separation of novel and old classes. +TLCE minimizes interference between old and new classes by mapping old class +images to quasi-orthogonal prototypes using episodic training. It then +ensembles diverse pre-trained models to better adapt to novel classes despite +data imbalance. Extensive experiments on various datasets demonstrate that our +transfer learning ensemble approach outperforms state-of-the-art FSCIL methods. + +
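One way to obtain quasi-orthogonal prototypes, offered purely as an illustration (the paper's exact construction may differ), is to use random unit vectors, which are nearly orthogonal in high dimensions:

```python
# Fixed quasi-orthogonal prototypes that old-class features could be regressed toward.
import torch
import torch.nn.functional as F

def make_prototypes(num_classes, dim, seed=0):
    g = torch.Generator().manual_seed(seed)
    protos = torch.randn(num_classes, dim, generator=g)
    return F.normalize(protos, dim=1)

protos = make_prototypes(num_classes=60, dim=512)
# off-diagonal cosine similarities are close to zero when dim >> num_classes
print((protos @ protos.T - torch.eye(60)).abs().max())
```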
+
+
+
+
+ + ☆ Guided Reconstruction with Conditioned Diffusion Models for Unsupervised + Anomaly Detection in Brain MRIs + + +
+ Unsupervised anomaly detection in Brain MRIs aims to identify abnormalities +as outliers from a healthy training distribution. Reconstruction-based +approaches that use generative models to learn to reconstruct healthy brain +anatomy are commonly used for this task. Diffusion models are an emerging class +of deep generative models that show great potential regarding reconstruction +fidelity. However, they face challenges in preserving intensity characteristics +in the reconstructed images, limiting their performance in anomaly detection. +To address this challenge, we propose to condition the denoising mechanism of +diffusion models with additional information about the image to reconstruct +coming from a latent representation of the noise-free input image. This +conditioning enables high-fidelity reconstruction of healthy brain structures +while aligning local intensity characteristics of input-reconstruction pairs. +We evaluate our method's reconstruction quality, domain adaptation features and +finally segmentation performance on publicly available data sets with various +pathologies. Using our proposed conditioning mechanism we can reduce the +false-positive predictions and enable a more precise delineation of anomalies +which significantly enhances the anomaly detection performance compared to +established state-of-the-art approaches to unsupervised anomaly detection in +brain MRI. Furthermore, our approach shows promise in domain adaptation across +different MRI acquisitions and simulated contrasts, a crucial property of +general anomaly detection methods. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Constraint Model for the Satellite Image Mosaic Selection Problem + + +
+ Satellite imagery solutions are widely used to study and monitor different regions of the Earth. However, a single satellite image can cover only a limited area. In cases where a larger area of interest is studied, several images must be stitched together to create a single larger image, called a mosaic, that can cover the area. Today, with the increasing number of satellite images available for commercial use, selecting the images to build the mosaic is challenging, especially when the user wants to optimize one or more parameters, such as the total cost and the cloud coverage percentage in the mosaic. More precisely, for this problem the input is an area of interest, several satellite images intersecting the area, a list of requirements relative to the images and the mosaic, such as cloud coverage percentage and image resolution, and a list of objectives to optimize. We contribute constraint programming and mixed integer linear programming formulations of this new problem, which we call the satellite image mosaic selection problem, a multi-objective extension of the polygon cover problem. We propose a dataset of realistic and challenging instances, where the images were captured by the satellite constellations SPOT, Pléiades and Pléiades Neo. We evaluate and compare the two proposed models and show their efficiency for large instances, up to 200 images.
</p>
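The selection problem has the flavour of a multi-objective set cover. A toy single-objective integer-programming sketch with PuLP (the data, the cell discretization of the area of interest, and the cost-only objective are simplifying assumptions, not the paper's model) looks like:

```python
# Toy set-cover-style formulation: pick images to cover every cell of the AOI at minimum cost.
from pulp import LpProblem, LpMinimize, LpVariable, lpSum, LpBinary

images = {            # image id -> (cost, cells of the area of interest it covers)
    "img1": (10, {1, 2, 3}),
    "img2": (7,  {3, 4}),
    "img3": (6,  {1, 4, 5}),
}
aoi_cells = {1, 2, 3, 4, 5}

prob = LpProblem("mosaic_selection", LpMinimize)
x = {i: LpVariable(i, cat=LpBinary) for i in images}
prob += lpSum(images[i][0] * x[i] for i in images)              # minimise total cost
for c in aoi_cells:                                             # cover every cell
    prob += lpSum(x[i] for i in images if c in images[i][1]) >= 1
prob.solve()
selected = [i for i in images if x[i].value() == 1]
```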
+
+ comment: This paper contains minor corrections from the original document + presented at the 29th International Conference on Principles and Practice of + Constraint Programming (CP 2023). Minor corrections in Figures 5a and 5b that + do not affect the analysis result. Minor typo corrections in Appendix A +
+
+
+
+
+ + ☆ SAMBA: A Trainable Segmentation Web-App with Smart Labelling + + +
+ Segmentation is the assigning of a semantic class to every pixel in an image +and is a prerequisite for various statistical analysis tasks in materials +science, like phase quantification, physics simulations or morphological +characterization. The wide range of length scales, imaging techniques and +materials studied in materials science means any segmentation algorithm must +generalise to unseen data and support abstract, user-defined semantic classes. +Trainable segmentation is a popular interactive segmentation paradigm where a +classifier is trained to map from image features to user drawn labels. SAMBA is +a trainable segmentation tool that uses Meta's Segment Anything Model (SAM) for +fast, high-quality label suggestions and a random forest classifier for robust, +generalizable segmentations. It is accessible in the browser +(https://www.sambasegment.com/) without the need to download any external +dependencies. The segmentation backend is run in the cloud, so does not require +the user to have powerful hardware. + +
+
+
+
+
+ + ☆ Joint-Individual Fusion Structure with Fusion Attention Module for + Multi-Modal Skin Cancer Classification + + +
+ Most convolutional neural network (CNN) based methods for skin cancer classification obtain their results using only dermatological images. Although good classification results have been shown, more accurate results can be achieved by considering the patient's metadata, which is valuable clinical information for dermatologists. Current methods only use a simple joint fusion structure (FS) and fusion modules (FMs) for multi-modal classification, so there is still room to increase accuracy by exploring more advanced FS and FM designs. Therefore, in this paper, we design a new fusion method that combines dermatological images (dermoscopy images or clinical images) and patient metadata for skin cancer classification from the perspectives of FS and FM. First, we propose a joint-individual fusion (JIF) structure that learns the shared features of multi-modality data and preserves specific features simultaneously. Second, we introduce a fusion attention (FA) module that enhances the most relevant image and metadata features based on both the self and mutual attention mechanism to support the decision-making pipeline. We compare the proposed JIF-MMFA method with other state-of-the-art fusion methods on three different public datasets. The results show that our JIF-MMFA method improves the classification results for all tested CNN backbones and performs better than the other fusion methods on the three public datasets, demonstrating our method's effectiveness and robustness.
</p>
+
+ comment: submitted to Pattern Recognition journal before 2022 +
+
+
+
+
+ + ☆ Augmentation-Free Dense Contrastive Knowledge Distillation for Efficient + Semantic Segmentation NeurIPS 2023 + + +
+ In recent years, knowledge distillation methods based on contrastive learning +have achieved promising results on image classification and object detection +tasks. However, in this line of research, we note that less attention is paid +to semantic segmentation. Existing methods heavily rely on data augmentation +and memory buffer, which entail high computational resource demands when +applying them to handle semantic segmentation that requires to preserve +high-resolution feature maps for making dense pixel-wise predictions. In order +to address this problem, we present Augmentation-free Dense Contrastive +Knowledge Distillation (Af-DCD), a new contrastive distillation learning +paradigm to train compact and accurate deep neural networks for semantic +segmentation applications. Af-DCD leverages a masked feature mimicking +strategy, and formulates a novel contrastive learning loss via taking advantage +of tactful feature partitions across both channel and spatial dimensions, +allowing to effectively transfer dense and structured local knowledge learnt by +the teacher model to a target student model while maintaining training +efficiency. Extensive experiments on five mainstream benchmarks with various +teacher-student network pairs demonstrate the effectiveness of our approach. +For instance, the DeepLabV3-Res18|DeepLabV3-MBV2 model trained by Af-DCD +reaches 77.03%|76.38% mIOU on Cityscapes dataset when choosing DeepLabV3-Res101 +as the teacher, setting new performance records. Besides that, Af-DCD achieves +an absolute mIOU improvement of 3.26%|3.04%|2.75%|2.30%|1.42% compared with +individually trained counterpart on Cityscapes|Pascal +VOC|Camvid|ADE20K|COCO-Stuff-164K. Code is available at +https://github.com/OSVAI/Af-DCD + +
+
+ comment: The paper of Af-DCD is accepted to NeurIPS 2023. Code and models are + available at https://github.com/OSVAI/Af-DCD +
+
+
+
+
+ + ☆ Text as Image: Learning Transferable Adapter for Multi-Label + Classification + + +
+ Pre-trained vision-language models have notably accelerated progress of open-world concept recognition. Their impressive zero-shot ability has recently been transferred to multi-label image classification via prompt tuning, enabling the discovery of novel labels in an open-vocabulary manner. However, this paradigm suffers from non-trivial training costs, and becomes computationally prohibitive for a large number of candidate labels. To address this issue, we note that vision-language pre-training aligns images and texts in a unified embedding space, making it possible for an adapter network to identify labels in the visual modality while being trained in the text modality. To enhance such cross-modal transfer ability, a simple yet effective method termed random perturbation is proposed, which enables the adapter to search for potential visual embeddings by perturbing text embeddings with noise during training, resulting in better performance in the visual modality. Furthermore, we introduce an effective approach to employ large language models for multi-label instruction-following text generation. In this way, a fully automated pipeline for visual label recognition is developed without relying on any manual data. Extensive experiments on public benchmarks show the superiority of our method in various multi-label classification tasks.
</p>
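The random perturbation idea can be sketched in a few lines; the noise level and the renormalization step are assumptions for illustration:

```python
# Train the adapter on text embeddings jittered with Gaussian noise so that it also
# covers nearby visual embeddings in the shared CLIP-style embedding space.
import torch
import torch.nn.functional as F

def perturb(text_emb, sigma=0.05):
    noisy = text_emb + sigma * torch.randn_like(text_emb)
    return F.normalize(noisy, dim=-1)   # stay on the unit sphere like CLIP features

# illustrative training step (adapter and multi-label targets assumed given):
# logits = adapter(perturb(text_embeddings))
# loss = F.binary_cross_entropy_with_logits(logits, multi_label_targets)
```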
+
+
+
+
+ + ☆ EulerMormer: Robust Eulerian Motion Magnification via Dynamic Filtering + within Transformer + + +
+ Video Motion Magnification (VMM) aims to break the resolution limit of human visual perception capability and reveal the imperceptible minor motion that contains valuable information in the macroscopic domain. However, challenges arise in this task due to photon noise inevitably introduced by photographic devices and spatial inconsistency in amplification, leading to flickering artifacts in static fields and motion blur and distortion in dynamic fields in the video. Existing methods focus on explicit motion modeling without emphasizing prioritized denoising during the motion magnification process. This paper proposes a novel dynamic filtering strategy to achieve static-dynamic field adaptive denoising. Specifically, based on Eulerian theory, we separate texture and shape to extract motion representation through inter-frame shape differences, expecting to leverage these subdivided features to solve this task at a fine granularity. Then, we introduce a novel dynamic filter that eliminates noise cues and preserves critical features in the motion magnification and amplification generation phases. Overall, our unified framework, EulerMormer, is a pioneering effort to equip learning-based VMM with a Transformer. The core of the dynamic filter lies in a global dynamic sparse cross-covariance attention mechanism that explicitly removes noise while preserving vital information, coupled with a multi-scale dual-path gating mechanism that selectively regulates the dependence on different frequency features to reduce spatial attenuation and complement motion boundaries. We demonstrate through extensive experiments that EulerMormer achieves more robust video motion magnification from the Eulerian perspective, significantly outperforming state-of-the-art methods. The source code is available at https://github.com/VUT-HFUT/EulerMormer.
</p>
+
+
+
+
+ + ☆ Diffusing Colors: Image Colorization with Text Guided Diffusion SIGGRAPH + + +
+ The colorization of grayscale images is a complex and subjective task with +significant challenges. Despite recent progress in employing large-scale +datasets with deep neural networks, difficulties with controllability and +visual quality persist. To tackle these issues, we present a novel image +colorization framework that utilizes image diffusion techniques with granular +text prompts. This integration not only produces colorization outputs that are +semantically appropriate but also greatly improves the level of control users +have over the colorization process. Our method provides a balance between +automation and control, outperforming existing techniques in terms of visual +quality and semantic coherence. We leverage a pretrained generative Diffusion +Model, and show that we can finetune it for the colorization task without +losing its generative power or attention to text prompts. Moreover, we present +a novel CLIP-based ranking model that evaluates color vividness, enabling +automatic selection of the most suitable level of vividness based on the +specific scene semantics. Our approach holds potential particularly for color +enhancement and historical image colorization. + +
+
+ comment: SIGGRAPH Asia 2023 +
+
+
+
+
+ + ☆ Towards 4D Human Video Stylization + + +
+ We present a first step towards 4D (3D and time) human video stylization, +which addresses style transfer, novel view synthesis and human animation within +a unified framework. While numerous video stylization methods have been +developed, they are often restricted to rendering images in specific viewpoints +of the input video, lacking the capability to generalize to novel views and +novel poses in dynamic scenes. To overcome these limitations, we leverage +Neural Radiance Fields (NeRFs) to represent videos, conducting stylization in +the rendered feature space. Our innovative approach involves the simultaneous +representation of both the human subject and the surrounding scene using two +NeRFs. This dual representation facilitates the animation of human subjects +across various poses and novel viewpoints. Specifically, we introduce a novel +geometry-guided tri-plane representation, significantly enhancing feature +representation robustness compared to direct tri-plane optimization. Following +the video reconstruction, stylization is performed within the NeRFs' rendered +feature space. Extensive experiments demonstrate that the proposed method +strikes a superior balance between stylized textures and temporal coherence, +surpassing existing approaches. Furthermore, our framework uniquely extends its +capabilities to accommodate novel poses and viewpoints, making it a versatile +tool for creative human video stylization. + +
+
+ comment: Under Review +
+
+
+
+
+ + ☆ Polarimetric Light Transport Analysis for Specular Inter-reflection + + +
+ Polarization is well known for its ability to decompose diffuse and specular +reflections. However, the existing decomposition methods only focus on direct +reflection and overlook multiple reflections, especially specular +inter-reflection. In this paper, we propose a novel decomposition method for +handling specular inter-reflection of metal objects by using a unique +polarimetric feature: the rotation direction of linear polarization. This +rotation direction serves as a discriminative factor between direct and +inter-reflection on specular surfaces. To decompose the reflectance components, +we actively rotate the linear polarization of incident light and analyze the +rotation direction of the reflected light. We evaluate our method using both +synthetic and real data, demonstrating its effectiveness in decomposing +specular inter-reflections of metal objects. Furthermore, we demonstrate that +our method can be combined with other decomposition methods for a detailed +analysis of light transport. As a practical application, we show its +effectiveness in improving the accuracy of 3D measurement against strong +specular inter-reflection. + +
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Forensic Iris Image Synthesis + + +
+ Post-mortem iris recognition is an emerging application of iris-based human +identification in a forensic setup, able to correctly identify deceased +subjects even three weeks post-mortem. This technique thus is considered as an +important component of future forensic toolkits. The current advancements in +this field are seriously slowed down by exceptionally difficult data +collection, which can happen in mortuary conditions, at crime scenes, or in +``body farm'' facilities. This paper makes a novel contribution to facilitate +progress in post-mortem iris recognition by offering a conditional +StyleGAN-based iris synthesis model, trained on the largest-available dataset +of post-mortem iris samples acquired from more than 350 subjects, generating -- +through appropriate exploration of StyleGAN latent space -- multiple +within-class (same identity) and between-class (different new identities) +post-mortem iris images, compliant with ISO/IEC 29794-6, and with decomposition +deformations controlled by the requested PMI (post mortem interval). Besides an +obvious application to enhance the existing, very sparse, post-mortem iris +datasets to advance -- among others -- iris presentation attack endeavors, we +anticipate it may be useful to generate samples that would expose professional +forensic human examiners to never-seen-before deformations for various PMIs, +increasing their training effectiveness. The source codes and model weights are +made available with the paper. + +
+
+
+
+
+ + ☆ A Multilevel Guidance-Exploration Network and Behavior-Scene Matching + Method for Human Behavior Anomaly Detection + + +
+ Human behavior anomaly detection aims to identify unusual human actions, +playing a crucial role in intelligent surveillance and other areas. The current +mainstream methods still adopt reconstruction or future frame prediction +techniques. However, reconstructing or predicting low-level pixel features +easily enables the network to achieve overly strong generalization ability, +allowing anomalies to be reconstructed or predicted as effectively as normal +data. Different from their methods, inspired by the Student-Teacher Network, we +propose a novel framework called the Multilevel Guidance-Exploration +Network(MGENet), which detects anomalies through the difference in high-level +representation between the Guidance and Exploration network. Specifically, we +first utilize the pre-trained Normalizing Flow that takes skeletal keypoints as +input to guide an RGB encoder, which takes unmasked RGB frames as input, to +explore motion latent features. Then, the RGB encoder guides the mask encoder, +which takes masked RGB frames as input, to explore the latent appearance +feature. Additionally, we design a Behavior-Scene Matching Module(BSMM) to +detect scene-related behavioral anomalies. Extensive experiments demonstrate +that our proposed method achieves state-of-the-art performance on ShanghaiTech +and UBnormal datasets, with AUC of 86.9 % and 73.5 %, respectively. The code +will be available on https://github.com/molu-ggg/GENet. + +
+
+ comment: 8 pages,7 figures +
+
+
+
+
+ + ☆ Caregiver Talk Shapes Toddler Vision: A Computational Study of Dyadic + Play + + +
+ Infants' ability to recognize and categorize objects develops gradually. The second year of life is marked by both the emergence of more semantic visual representations and a better understanding of word meaning. This suggests that language input may play an important role in shaping visual representations. However, even in suitable contexts for word learning like dyadic play sessions, caregivers' utterances are sparse and ambiguous, often referring to objects that are different from the one to which the child attends. Here, we systematically investigate to what extent caregivers' utterances can nevertheless enhance visual representations. For this, we propose a computational model of visual representation learning during dyadic play. We introduce a synthetic dataset of ego-centric images perceived by a toddler-agent that moves and rotates toy objects in different parts of its home environment while hearing caregivers' utterances, modeled as captions. We propose to model toddlers' learning as simultaneously aligning representations for 1) close-in-time images and 2) co-occurring images and utterances. We show that utterances with statistics matching those of real caregivers give rise to representations supporting improved category recognition. Our analysis reveals that a small decrease/increase in object-relevant naming frequencies can drastically impact the learned representations. This affects the attention on object names within an utterance, which is required for efficient visuo-linguistic alignment. Overall, our results support the hypothesis that caregivers' naming utterances can improve toddlers' visual representations.
</p>
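The two alignment objectives can be sketched as a pair of InfoNCE terms; the temperature and the exact pairing are assumptions for illustration, not the paper's implementation:

```python
# CLIP-style image-utterance alignment plus a temporal term that pulls together
# embeddings of frames seen close in time.
import torch
import torch.nn.functional as F

def info_nce(a, b, temperature=0.07):
    a, b = F.normalize(a, dim=-1), F.normalize(b, dim=-1)
    logits = a @ b.T / temperature
    targets = torch.arange(a.size(0), device=a.device)
    return F.cross_entropy(logits, targets)

def dyadic_play_loss(img_t, img_t_next, utt_emb):
    # img_t, img_t_next: embeddings of temporally adjacent egocentric frames
    # utt_emb: embeddings of the co-occurring caregiver utterances (captions)
    return info_nce(img_t, img_t_next) + info_nce(img_t, utt_emb)
```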
+
+ comment: Proceedings of the 2023 IEEE International Conference on Development + and Learning (ICDL) +
+
+
+
+
+ + ☆ Instance Tracking in 3D Scenes from Egocentric Videos + + +
+ Egocentric sensors such as AR/VR devices capture human-object interactions and offer the potential to provide task-assistance by recalling 3D locations of objects of interest in the surrounding environment. This capability requires instance tracking in real-world 3D scenes from egocentric videos (IT3DEgo). We explore this problem by first introducing a new benchmark dataset, consisting of RGB and depth videos, per-frame camera pose, and instance-level annotations in both 2D camera and 3D world coordinates. We present an evaluation protocol which evaluates tracking performance in 3D coordinates with two settings for enrolling instances to track: (1) single-view online enrollment, where an instance is specified on-the-fly based on the human wearer's interactions, and (2) multi-view pre-enrollment, where images of an instance to be tracked are stored in memory ahead of time. To address IT3DEgo, we first re-purpose methods from relevant areas, e.g., single object tracking (SOT) -- running SOT methods to track instances in 2D frames and lifting them to 3D using camera pose and depth. We also present a simple method that leverages pretrained segmentation and detection models to generate proposals from RGB frames and match proposals with enrolled instance images. Perhaps surprisingly, our extensive experiments show that our method (with no finetuning) significantly outperforms SOT-based approaches. We conclude by arguing that the problem of egocentric instance tracking is made easier by leveraging camera pose and using a 3D allocentric (world) coordinate representation.
</p>
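The "lifting to 3D" step used when re-purposing 2D trackers follows directly from pinhole geometry; variable names below are illustrative:

```python
# Unproject a tracked pixel with the depth map and intrinsics, then move it to
# world (allocentric) coordinates with the per-frame camera pose.
import numpy as np

def lift_to_world(u, v, depth, K, R_wc, t_wc):
    # (u, v): pixel location, depth: metres, K: 3x3 intrinsics,
    # R_wc, t_wc: camera-to-world rotation and translation
    ray = np.linalg.inv(K) @ np.array([u, v, 1.0])
    p_cam = depth * ray                 # point in camera coordinates
    return R_wc @ p_cam + t_wc          # point in 3D world coordinates
```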
+
+
+
+
+ + ☆ Multi-strategy Collaborative Optimized YOLOv5s and its Application in + Distance Estimation + + +
+ The increasing accident rate brought about by the explosive growth of automobiles has made research on automotive active safety systems increasingly important, and improving the accuracy of vehicle target detection is a key part of this effort. To achieve vehicle detection and distance estimation and to provide safety warnings, we propose a Distance Estimation Safety Warning System (DESWS) based on an improved neural network model (YOLOv5s-SE), which replaces the IoU loss with DIoU and embeds an SE attention module, together with a distance estimation method based on the principle of similar triangles. In addition, we present a method that gives safety suggestions based on the estimated distance using nonparametric testing. Simulation experiments verify that the mAP is improved by 5.5% and that safety suggestions can be given based on the estimated distance information.
</p>
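The similar-triangles estimate follows directly from the pinhole camera model; the numbers below are made-up examples, not values from the paper:

```python
# With a pinhole camera: distance = focal_length_px * real_height_m / bbox_height_px.
def estimate_distance(focal_px, real_height_m, bbox_height_px):
    return focal_px * real_height_m / bbox_height_px

# e.g. an object ~1.5 m tall appearing 100 px tall under a 1000 px focal length
print(estimate_distance(1000, 1.5, 100))   # -> 15.0 metres
```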
+
+ comment: This paper contains 5 pages, 10 figures, and was accepted at 4th + International Conference on Advances in Electrical Engineering and Computer + Applications (AEECA2023) +
+
+
+
+
+ + ☆ Identity-Obscured Neural Radiance Fields: Privacy-Preserving 3D Facial + Reconstruction + + +
+ Neural radiance fields (NeRF) typically require a complete set of images taken from multiple camera perspectives to accurately reconstruct geometric details. However, this approach raises significant privacy concerns in the context of facial reconstruction. The critical need for privacy protection often makes individuals reluctant to share their facial images, due to fears of potential misuse or security risks. Addressing these concerns, we propose a method that leverages privacy-preserving images for reconstructing 3D head geometry within the NeRF framework. Our method stands apart from traditional facial reconstruction techniques as it does not depend on RGB information from images containing sensitive facial data. Instead, it effectively generates plausible facial geometry using a series of identity-obscured inputs, thereby protecting facial privacy.
</p>
+
+
+
+
+ + ☆ Learn to Unlearn for Deep Neural Networks: Minimizing Unlearning + Interference with Gradient Projection WACV 2024 + + +
+ Recent data-privacy laws have sparked interest in machine unlearning, which involves removing the effect of specific training samples from a learnt model as if they were never present in the original training dataset. The challenge of machine unlearning is to discard information about the ``forget'' data in the learnt model without altering the knowledge about the remaining dataset and to do so more efficiently than the naive retraining approach. To achieve this, we adopt a projected-gradient based learning method, named Projected-Gradient Unlearning (PGU), in which the model takes steps in the orthogonal direction to the gradient subspaces deemed unimportant for the retaining dataset, so that its knowledge is preserved. By utilizing Stochastic Gradient Descent (SGD) to update the model weights, our method can efficiently scale to any model and dataset size. We provide empirical evidence to demonstrate that our unlearning method can produce models that behave similarly to models retrained from scratch across various metrics even when the training dataset is no longer accessible. Our code is available at https://github.com/hnanhtuan/projected_gradient_unlearning.
</p>
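The projection step common to this family of methods can be sketched as removing the gradient component that lies in a given subspace, so that the SGD update stays orthogonal to it; which subspace is chosen follows the paper's criterion, and `U` below is an assumed orthonormal basis:

```python
# Remove from the unlearning gradient its component inside span(U), then take an SGD step.
import torch

def project_out(grad, U):
    # grad: flattened gradient (d,), U: (d, k) orthonormal basis of the chosen subspace
    return grad - U @ (U.T @ grad)

# SGD-style update constrained to directions orthogonal to span(U):
# w = w - lr * project_out(grad_forget, U)
```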
+
+ comment: Accepted to WACV 2024 +
+
+
+
+
+ + ☆ Open-Vocabulary Segmentation with Semantic-Assisted Calibration + + +
+ This paper studies open-vocabulary segmentation (OVS) through calibrating in-vocabulary and domain-biased embedding space with the generalized contextual prior of CLIP. As the core of open-vocabulary understanding, alignment of visual content with the semantics of unbounded text has become the bottleneck of this field. To address this challenge, recent works propose to utilize CLIP as an additional classifier and aggregate model predictions with CLIP classification results. Despite their remarkable progress, the performance of OVS methods in relevant scenarios is still unsatisfactory compared with supervised counterparts. We attribute this to the in-vocabulary embedding and domain-biased CLIP prediction. To this end, we present a Semantic-assisted CAlibration Network (SCAN). In SCAN, we incorporate the generalized semantic prior of CLIP into proposal embedding to avoid collapsing on known categories. Besides, a contextual shift strategy is applied to mitigate the lack of global context and unnatural background noise. With the above designs, SCAN achieves state-of-the-art performance on all popular open-vocabulary segmentation benchmarks. Furthermore, we also address the problem that the existing evaluation system ignores semantic duplication across categories, and propose a new metric called Semantic-Guided IoU (SG-IoU).
</p>
+
+
+
+
+ + ☆ VRPTEST: Evaluating Visual Referring Prompting in Large Multimodal + Models + + +
+ With recent advancements in Large Multimodal Models (LMMs) across various +domains, a novel prompting method called visual referring prompting has +emerged, showing significant potential in enhancing human-computer interaction +within multimodal systems. This method offers a more natural and flexible +approach to human interaction with these systems compared to traditional text +descriptions or coordinates. However, the categorization of visual referring +prompting remains undefined, and its impact on the performance of LMMs has yet +to be formally examined. In this study, we conduct the first comprehensive +analysis of LMMs using a variety of visual referring prompting strategies. We +introduce a benchmark dataset called VRPTEST, comprising 3 different visual +tasks and 2,275 images, spanning diverse combinations of prompt strategies. +Using VRPTEST, we conduct a comprehensive evaluation of eight versions of +prominent open-source and proprietary foundation models, including two early +versions of GPT-4V. We develop an automated assessment framework based on +software metamorphic testing techniques to evaluate the accuracy of LMMs +without the need for human intervention or manual labeling. We find that the +current proprietary models generally outperform the open-source ones, showing +an average accuracy improvement of 22.70%; however, there is still potential +for improvement. Moreover, our quantitative analysis shows that the choice of +prompt strategy significantly affects the accuracy of LMMs, with variations +ranging from -17.5% to +7.3%. Further case studies indicate that an appropriate +visual referring prompting strategy can improve LMMs' understanding of context +and location information, while an unsuitable one might lead to answer +rejection. We also provide insights on minimizing the negative impact of visual +referring prompting on LMMs. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ MTVG : Multi-text Video Generation with Text-to-Video Models + + +
+ Recently, video generation has attracted massive attention and yielded noticeable outcomes. Concerning the characteristics of video, multi-text conditioning incorporating sequential events is necessary for next-step video generation. In this work, we propose a novel multi-text video generation (MTVG) method that directly utilizes a pre-trained diffusion-based text-to-video (T2V) generation model without additional fine-tuning. To generate consecutive video segments, visual consistency across segments generated from distinct prompts is necessary, together with diverse variations such as motion and content-related transitions. Our proposed MTVG includes Dynamic Noise and Last Frame Aware Inversion, which reinitialize the noise latent to preserve visual coherence between videos of different prompts and prevent repetitive motion or contents. Furthermore, we present Structure Guiding Sampling to maintain the global appearance across the frames in a single video clip, where we leverage iterative latent updates across the preceding frame. Additionally, our Prompt Generator allows for an arbitrary format of text conditions consisting of diverse events. As a result, our extensive experiments, including diverse transitions of descriptions, demonstrate that our proposed methods produce outputs that are superior in terms of semantic coherence and temporal seamlessness. Video examples are available on our project page: https://kuai-lab.github.io/mtvg-page.
</p>
+
+
+
+
+ + ☆ Large Language Models are Good Prompt Learners for Low-Shot Image + Classification + + +
+ Low-shot image classification, where training images are limited or +inaccessible, has benefited from recent progress on pre-trained vision-language +(VL) models with strong generalizability, e.g. CLIP. Prompt learning methods +built with VL models generate text features from the class names that only have +confined class-specific information. Large Language Models (LLMs), with their +vast encyclopedic knowledge, emerge as the complement. Thus, in this paper, we +discuss the integration of LLMs to enhance pre-trained VL models, specifically +on low-shot classification. However, the domain gap between language and vision +blocks the direct application of LLMs. Thus, we propose LLaMP, Large Language +Models as Prompt learners, that produces adaptive prompts for the CLIP text +encoder, establishing it as the connecting bridge. Experiments show that, +compared with other state-of-the-art prompt learning methods, LLaMP yields +better performance on both zero-shot generalization and few-shot image +classification, over a spectrum of 11 datasets. + +
+
+ comment: Technical Report +
+
+
+
+
+ + ☆ Combining inherent knowledge of vision-language models with unsupervised + domain adaptation through self-knowledge distillation + + +
+ Unsupervised domain adaptation (UDA) tries to overcome the tedious work of +labeling data by leveraging a labeled source dataset and transferring its +knowledge to a similar but different target dataset. On the other hand, current +vision-language models exhibit astonishing zero-shot prediction capabilities. +In this work, we combine knowledge gained through UDA with the inherent +knowledge of vision-language models. In a first step, we generate the zero-shot +predictions of the source and target dataset using the vision-language model. +Since zero-shot predictions usually exhibit a large entropy, meaning that the +class probabilities are rather evenly distributed, we first adjust the +distribution to accentuate the winning probabilities. This is done using both +source and target data to keep the relative confidence between source and +target data. We then employ a conventional DA method, to gain the knowledge +from the source dataset, in combination with self-knowledge distillation, to +maintain the inherent knowledge of the vision-language model. We further +combine our method with a gradual source domain expansion strategy (GSDE) and +show that this strategy can also benefit by including zero-shot predictions. We +conduct experiments and ablation studies on three benchmarks (OfficeHome, +VisDA, and DomainNet) and outperform state-of-the-art methods. We further show +in ablation studies the contributions of different parts of our algorithm. + +
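The entropy adjustment ("accentuating the winning probabilities") can be illustrated with simple temperature sharpening; the temperature value is an assumption, and the actual adjustment is computed jointly over source and target data as described above:

```python
# Sharpen zero-shot class distributions with a temperature < 1 so the winning class dominates.
import numpy as np

def sharpen(probs, T=0.5):
    p = probs ** (1.0 / T)
    return p / p.sum(axis=1, keepdims=True)

zero_shot = np.array([[0.3, 0.4, 0.3], [0.25, 0.25, 0.5]])
print(sharpen(zero_shot))   # winning classes get accentuated
```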
+
+
+
+
+ + ☆ An unsupervised approach towards promptable defect segmentation in + laser-based additive manufacturing by Segment Anything + + +
+ Foundation models are currently driving a paradigm shift in computer vision tasks for various fields including biology, astronomy, and robotics among others, leveraging user-generated prompts to enhance their performance. In the manufacturing domain, accurate image-based defect segmentation is imperative to ensure product quality and facilitate real-time process control. However, such tasks are often characterized by multiple challenges including the absence of labels and the requirement for low latency inference among others. To address these issues, we construct a framework for image segmentation using a state-of-the-art Vision Transformer (ViT) based Foundation model (Segment Anything Model) with a novel multi-point prompt generation scheme using unsupervised clustering. We apply our framework to perform real-time porosity segmentation in a case study of laser-based powder bed fusion (L-PBF) and obtain high Dice Similarity Coefficients (DSC) without the necessity for any supervised fine-tuning of the model. Using such lightweight foundation model inference in conjunction with unsupervised prompt generation, we envision the construction of a real-time anomaly detection pipeline that has the potential to revolutionize the current laser-based additive manufacturing processes, thereby facilitating the shift towards Industry 4.0 and promoting defect-free production along with operational efficiency.
</p>
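One possible reading of unsupervised multi-point prompt generation, sketched with k-means over intensity and position (the feature choice and the "darkest cluster" heuristic are assumptions, not the paper's scheme):

```python
# Cluster pixels without labels and turn member locations of the candidate defect
# cluster into point prompts for a SAM-style promptable segmenter.
import numpy as np
from sklearn.cluster import KMeans

def point_prompts(gray_image, n_clusters=3, n_points=5):
    h, w = gray_image.shape
    ys, xs = np.mgrid[0:h, 0:w]
    feats = np.stack([gray_image.ravel(), ys.ravel(), xs.ravel()], axis=1).astype(float)
    km = KMeans(n_clusters=n_clusters, n_init=10).fit(feats)
    dark = np.argmin(km.cluster_centers_[:, 0])        # assume pores are the darkest cluster
    pts = feats[km.labels_ == dark][:, [2, 1]]          # (x, y) coordinates
    idx = np.random.choice(len(pts), size=min(n_points, len(pts)), replace=False)
    return pts[idx]

# prompts = point_prompts(img)
# masks = sam_predictor.predict(point_coords=prompts, ...)  # promptable segmenter call
```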
+
+ comment: 18 pages, 9 figures +
+
+
+
+
+ + ☆ Differentiable Registration of Images and LiDAR Point Clouds with + VoxelPoint-to-Pixel Matching NeurIPS2023 + + +
+ Cross-modality registration between 2D images from cameras and 3D point clouds from LiDARs is a crucial task in computer vision and robotics. Previous methods estimate 2D-3D correspondences by matching point and pixel patterns learned by neural networks, and use Perspective-n-Points (PnP) to estimate rigid transformation during post-processing. However, these methods struggle to map points and pixels to a shared latent space robustly since points and pixels have very different characteristics with patterns learned in different manners (MLP and CNN), and they also fail to construct supervision directly on the transformation since the PnP is non-differentiable, which leads to unstable registration results. To address these problems, we propose to learn a structured cross-modality latent space to represent pixel features and 3D features via a differentiable probabilistic PnP solver. Specifically, we design a triplet network to learn VoxelPoint-to-Pixel matching, where we represent 3D elements using both voxels and points to learn the cross-modality latent space with pixels. We design both the voxel and pixel branches based on CNNs to operate convolutions on voxels/pixels represented in grids, and integrate an additional point branch to regain the information lost during voxelization. We train our framework end-to-end by imposing supervision directly on the predicted pose distribution with a probabilistic PnP solver. To explore distinctive patterns of cross-modality features, we design a novel loss with adaptive-weighted optimization for cross-modality feature description. The experimental results on KITTI and nuScenes datasets show significant improvements over the state-of-the-art methods. The code and models are available at https://github.com/junshengzhou/VP2P-Match.
</p>
+
+ comment: To appear at NeurIPS2023 (Spotlight). Code is available at + https://github.com/junshengzhou/VP2P-Match +
+
+
+
+
+ + ☆ Residual Graph Convolutional Network for Bird's-Eye-View Semantic + Segmentation WACV 2024 + + +
+ Retrieving spatial information and understanding the semantic information of
+the surroundings are important for Bird's-Eye-View (BEV) semantic segmentation.
+In the application of autonomous driving, autonomous vehicles need to be aware
+of their surroundings to drive safely. However, current BEV semantic
+segmentation techniques based on deep Convolutional Neural Networks (CNNs) and
+transformers have difficulties in obtaining the global semantic relationships
+of the surroundings at the early layers of the network. In this paper, we
+propose to incorporate a novel Residual Graph Convolutional (RGC) module in
+deep CNNs to acquire both the global information and the region-level semantic
+relationship in the multi-view image domain. Specifically, the RGC module
+employs a non-overlapping graph space projection to efficiently project the
+complete BEV information into graph space. It then builds interconnected
+spatial and channel graphs to extract spatial information between each node and
+channel information within each node (i.e., extract contextual relationships of
+the global features). Furthermore, it uses a downsample residual process to
+enhance the coordinate feature reuse to maintain the global information. The
+segmentation data augmentation and alignment module helps to simultaneously
+augment and align BEV features and ground truth to geometrically preserve their
+alignment to achieve better segmentation results. Our experimental results on
+the nuScenes benchmark dataset demonstrate that the RGC network outperforms
+four state-of-the-art networks and its four variants in terms of IoU and mIoU.
+The proposed RGC network achieves an mIoU 3.1% higher than that of the best
+state-of-the-art network, BEVFusion. Code and models will be released.
+
+
+
+ comment: 8 pages, 5 figures; this paper has been accepted by and will be
+ presented at WACV 2024
+
+
+
+
+
+ + ☆ Doodle Your 3D: From Abstract Freehand Sketches to Precise 3D Shapes + + +
+ In this paper, we democratise 3D content creation, enabling precise +generation of 3D shapes from abstract sketches while overcoming limitations +tied to drawing skills. We introduce a novel part-level modelling and alignment +framework that facilitates abstraction modelling and cross-modal +correspondence. Leveraging the same part-level decoder, our approach seamlessly +extends to sketch modelling by establishing correspondence between CLIPasso +edgemaps and projected 3D part regions, eliminating the need for a dataset +pairing human sketches and 3D shapes. Additionally, our method introduces a +seamless in-position editing process as a byproduct of cross-modal part-aligned +modelling. Operating in a low-dimensional implicit space, our approach +significantly reduces computational demands and processing time. + +
+
+ comment: Project Page: https://hmrishavbandy.github.io/doodle23d/ +
+
+
+
+
+ + ☆ DiffusionPhase: Motion Diffusion in Frequency Domain + + +
+ In this study, we introduce a learning-based method for generating +high-quality human motion sequences from text descriptions (e.g., ``A person +walks forward"). Existing techniques struggle with motion diversity and smooth +transitions in generating arbitrary-length motion sequences, due to limited +text-to-motion datasets and the pose representations used that often lack +expressiveness or compactness. To address these issues, we propose the first +method for text-conditioned human motion generation in the frequency domain of +motions. We develop a network encoder that converts the motion space into a +compact yet expressive parameterized phase space with high-frequency details +encoded, capturing the local periodicity of motions in time and space with high +accuracy. We also introduce a conditional diffusion model for predicting +periodic motion parameters based on text descriptions and a start pose, +efficiently achieving smooth transitions between motion sequences associated +with different text descriptions. Experiments demonstrate that our approach +outperforms current methods in generating a broader variety of high-quality +motions, and synthesizing long sequences with natural transitions. + +
+
+
+
+
+ + ☆ Improved Face Representation via Joint Label Classification and + Supervised Contrastive Clustering + + +
+ Face clustering tasks can learn hierarchical semantic information from
+large-scale data, which has the potential to help facilitate face recognition.
+However, there are few works on this problem. This paper explores it by
+proposing a joint optimization task of label classification and supervised
+contrastive clustering to introduce cluster knowledge into the traditional
+face recognition task in two ways. We first extend ArcFace with a
+cluster-guided angular margin to adjust the within-class feature distribution
+according to the difficulty level of face clustering. Secondly, we propose a
+supervised contrastive clustering approach to pull the features to the cluster
+center and propose the cluster-aligning procedure to align the cluster center
+and the learnable class center in the classifier for joint training. Finally,
+extensive qualitative and quantitative experiments on popular facial benchmarks
+demonstrate the effectiveness of our paradigm and its superiority over the
+existing approaches to face recognition.
+
+
+
+ comment: 9 pages +
+
+
+
+
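+ A hedged sketch of a cluster-guided angular margin in the spirit of the
+abstract above: an ArcFace-style head whose per-sample margin grows with a
+"hardness" score in [0, 1] supplied by an external face-clustering stage. The
+scaling rule and hyperparameters are illustrative assumptions, not the paper's
+exact formulation.
+<pre>
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+class ClusterGuidedArcFace(nn.Module):
+    def __init__(self, feat_dim, n_classes, scale=64.0, base_margin=0.5):
+        super().__init__()
+        self.weight = nn.Parameter(torch.randn(n_classes, feat_dim))
+        self.scale, self.base_margin = scale, base_margin
+
+    def forward(self, feats, labels, hardness):
+        # cosine similarity between normalized features and class centers
+        cos = F.linear(F.normalize(feats), F.normalize(self.weight))
+        theta = torch.acos(cos.clamp(-1 + 1e-7, 1 - 1e-7))
+        # assumed rule: harder clusters receive a larger margin on the target class
+        margin = self.base_margin * (1.0 + hardness).unsqueeze(1)
+        onehot = F.one_hot(labels, cos.size(1)).float()
+        logits = torch.cos(theta + onehot * margin)
+        return F.cross_entropy(self.scale * logits, labels)
+</pre>
+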
+ + ☆ ImFace++: A Sophisticated Nonlinear 3D Morphable Face Model with + Implicit Neural Representations + + +
+ Accurate representations of 3D faces are of paramount importance in various +computer vision and graphics applications. However, the challenges persist due +to the limitations imposed by data discretization and model linearity, which +hinder the precise capture of identity and expression clues in current studies. +This paper presents a novel 3D morphable face model, named ImFace++, to learn a +sophisticated and continuous space with implicit neural representations. +ImFace++ first constructs two explicitly disentangled deformation fields to +model complex shapes associated with identities and expressions, respectively, +which simultaneously facilitate the automatic learning of correspondences +across diverse facial shapes. To capture more sophisticated facial details, a +refinement displacement field within the template space is further +incorporated, enabling a fine-grained learning of individual-specific facial +details. Furthermore, a Neural Blend-Field is designed to reinforce the +representation capabilities through adaptive blending of an array of local +fields. In addition to ImFace++, we have devised an improved learning strategy +to extend expression embeddings, allowing for a broader range of expression +variations. Comprehensive qualitative and quantitative evaluations demonstrate +that ImFace++ significantly advances the state-of-the-art in terms of both face +reconstruction fidelity and correspondence accuracy. + +
+
+ comment: 17 pages, 14 figures. arXiv admin note: text overlap with + arXiv:2203.14510 +
+
+
+
+
+ + ☆ k* Distribution: Evaluating the Latent Space of Deep Neural Networks + using Local Neighborhood Analysis + + +
+ Most examinations of neural networks' learned latent spaces typically employ +dimensionality reduction techniques such as t-SNE or UMAP. While these methods +effectively capture the overall sample distribution in the entire learned +latent space, they tend to distort the structure of sample distributions within +specific classes in the subset of the latent space. This distortion complicates +the task of easily distinguishing classes identifiable by neural networks. In +response to this challenge, we introduce the k* Distribution methodology. This +approach focuses on capturing the characteristics and structure of sample +distributions for individual classes within the subset of the learned latent +space using local neighborhood analysis. The key concept is to facilitate easy +comparison of different k* distributions, enabling analysis of how various +classes are processed by the same neural network. This provides a more profound +understanding of existing contemporary visualizations. Our study reveals three +distinct distributions of samples within the learned latent space subset: a) +Fractured, b) Overlapped, and c) Clustered. We note and demonstrate that the +distribution of samples within the network's learned latent space significantly +varies depending on the class. Furthermore, we illustrate that our analysis can +be applied to explore the latent space of diverse neural network architectures, +various layers within neural networks, transformations applied to input +samples, and the distribution of training and testing data for neural networks. +We anticipate that our approach will facilitate more targeted investigations +into neural networks by collectively examining the distribution of different +samples within the learned latent space. + +
+
+
+
+
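+ Under one plausible reading of the abstract above, a per-sample k* can be
+computed as the number of same-class nearest neighbours found before the first
+neighbour of another class; the histogram of k* values within a class then
+suggests whether it is fractured, overlapped, or clustered. A hedged sketch:
+<pre>
+import numpy as np
+from sklearn.neighbors import NearestNeighbors
+
+def k_star(latents, labels):
+    """latents: (N, D) latent vectors; labels: (N,) class labels. O(N^2), for small N."""
+    nn = NearestNeighbors(n_neighbors=len(latents)).fit(latents)
+    _, idx = nn.kneighbors(latents)               # idx[:, 0] is the sample itself
+    ks = np.empty(len(latents), dtype=int)
+    for i, order in enumerate(idx):
+        diff = labels[order[1:]] != labels[i]      # neighbours carrying another label
+        ks[i] = int(np.argmax(diff)) if diff.any() else len(order) - 1
+    return ks
+</pre>
+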
+ + ☆ PartDistill: 3D Shape Part Segmentation by Vision-Language Model + Distillation + + +
+ This paper proposes a cross-modal distillation framework, PartDistill, which +transfers 2D knowledge from vision-language models (VLMs) to facilitate 3D +shape part segmentation. PartDistill addresses three major challenges in this +task: the lack of 3D segmentation in invisible or undetected regions in the 2D +projections, inaccurate and inconsistent 2D predictions by VLMs, and the lack +of knowledge accumulation across different 3D shapes. PartDistill consists of a +teacher network that uses a VLM to make 2D predictions and a student network +that learns from the 2D predictions while extracting geometrical features from +multiple 3D shapes to carry out 3D part segmentation. A bi-directional +distillation, including forward and backward distillations, is carried out +within the framework, where the former forward distills the 2D predictions to +the student network, and the latter improves the quality of the 2D predictions, +which subsequently enhances the final 3D part segmentation. Moreover, +PartDistill can exploit generative models that facilitate effortless 3D shape +creation for generating knowledge sources to be distilled. Through extensive +experiments, PartDistill boosts the existing methods with substantial margins +on widely used ShapeNetPart and PartE datasets, by more than 15% and 12% higher +mIoU scores, respectively. + +
+
+
+
+
+ + ☆ Natural-language-driven Simulation Benchmark and Copilot for Efficient + Production of Object Interactions in Virtual Road Scenes + + +
+ We advocate the idea of natural-language-driven (NLD) simulation to
+efficiently produce object interactions between multiple objects in virtual
+road scenes, for teaching and testing autonomous driving systems that must take
+quick action to avoid collisions with obstacles moving unpredictably. The NLD
+simulation allows a brief natural-language description to control the object
+interactions, significantly reducing the human effort required to create a
+large amount of interaction data. To facilitate the research of NLD simulation,
+we collect the Language-to-Interaction (L2I) benchmark dataset with 120,000
+natural-language descriptions of object interactions in 6 common types of road
+topologies. Each description is associated with programming code that a
+graphics renderer can use to visually reconstruct the object interactions in
+the virtual scenes. As a methodology contribution, we design SimCopilot to
+translate the interaction descriptions into renderable code. We use the L2I
+dataset to evaluate SimCopilot's abilities to control the object motions,
+generate complex interactions, and generalize interactions across road
+topologies. The L2I dataset and the evaluation results motivate further
+research on NLD simulation.
+
+
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ KOALA: Self-Attention Matters in Knowledge Distillation of Latent + Diffusion Models for Memory-Efficient and Fast Image Synthesis + + +
+ Stable Diffusion is the mainstay of text-to-image (T2I) synthesis in the
+community due to its generation performance and open-source nature. Recently,
+Stable Diffusion XL (SDXL), the successor of Stable Diffusion, has received a
+lot of attention due to its significant performance improvements with a higher
+resolution of 1024x1024 and a larger model. However, its increased computation
+cost and model size require higher-end hardware (e.g., a GPU with more VRAM)
+for end-users, incurring higher costs of operation. To address this problem, in
+this work, we propose an efficient latent diffusion model for text-to-image
+synthesis obtained by distilling the knowledge of SDXL. To this end, we first
+perform an in-depth analysis of the denoising U-Net in SDXL, which is the main
+bottleneck of the model, and then design a more efficient U-Net based on the
+analysis. Secondly, we explore how to effectively distill the generation
+capability of SDXL into an efficient U-Net and eventually identify four
+essential factors, the core of which is that self-attention is the most
+important part. With our efficient U-Net and self-attention-based knowledge
+distillation strategy, we build our efficient T2I models, called KOALA-1B &
+-700M, while reducing the model size by up to 54% and 69% relative to the
+original SDXL model. In particular, KOALA-700M is more than twice as fast as
+SDXL while still retaining a decent generation quality. We hope that due to its
+balanced speed-performance tradeoff, our KOALA models can serve as a
+cost-effective alternative to SDXL in resource-constrained environments.
+
+
+
+ comment: Project page: https://youngwanlee.github.io/KOALA/ +
+
+
+
+
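+ A hedged sketch of self-attention feature distillation between a teacher and a
+student denoising U-Net, combined with the ordinary denoising objective. Which
+blocks to hook, the MSE form, and the weighting are illustrative assumptions,
+not KOALA's exact recipe.
+<pre>
+import torch
+import torch.nn.functional as F
+
+def capture_outputs(modules):
+    """Register forward hooks that store each module's output (e.g. self-attention blocks).
+    Clear the returned list between training steps."""
+    store = []
+    handles = [m.register_forward_hook(lambda _m, _i, out: store.append(out)) for m in modules]
+    return store, handles
+
+def distillation_loss(student_feats, teacher_feats, noise_pred, noise, alpha=1.0):
+    # feature matching on hooked self-attention outputs + standard diffusion loss
+    feat_loss = sum(F.mse_loss(s, t.detach()) for s, t in zip(student_feats, teacher_feats))
+    denoise_loss = F.mse_loss(noise_pred, noise)
+    return denoise_loss + alpha * feat_loss
+</pre>
+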
+ + ☆ LiDAR: Sensing Linear Probing Performance in Joint Embedding SSL + Architectures + + +
+ Joint embedding (JE) architectures have emerged as a promising avenue for +acquiring transferable data representations. A key obstacle to using JE +methods, however, is the inherent challenge of evaluating learned +representations without access to a downstream task, and an annotated dataset. +Without efficient and reliable evaluation, it is difficult to iterate on +architectural and training choices for JE methods. In this paper, we introduce +LiDAR (Linear Discriminant Analysis Rank), a metric designed to measure the +quality of representations within JE architectures. Our metric addresses +several shortcomings of recent approaches based on feature covariance rank by +discriminating between informative and uninformative features. In essence, +LiDAR quantifies the rank of the Linear Discriminant Analysis (LDA) matrix +associated with the surrogate SSL task -- a measure that intuitively captures +the information content as it pertains to solving the SSL task. We empirically +demonstrate that LiDAR significantly surpasses naive rank based approaches in +its predictive power of optimal hyperparameters. Our proposed criterion +presents a more robust and intuitive means of assessing the quality of +representations within JE architectures, which we hope facilitates broader +adoption of these powerful techniques in various domains. + +
+
+ comment: Technical report +
+
+
+
+
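+ A hedged numpy sketch of an LDA-based rank score in the spirit of the abstract
+above: build a regularised LDA matrix from surrogate-task classes (for example,
+the augmented views of each clean sample form one class) and report the
+effective rank as the exponential of the eigenvalue entropy. The regulariser
+and the exact class construction are assumptions.
+<pre>
+import numpy as np
+
+def lda_effective_rank(feats, labels, eps=1e-6):
+    """feats: (N, D) embeddings; labels: (N,) surrogate-task class ids."""
+    mu = feats.mean(0)
+    d = feats.shape[1]
+    Sw = np.zeros((d, d))                                  # within-class scatter
+    Sb = np.zeros((d, d))                                  # between-class scatter
+    for c in np.unique(labels):
+        Xc = feats[labels == c]
+        mc = Xc.mean(0)
+        Sw += (Xc - mc).T @ (Xc - mc)
+        Sb += len(Xc) * np.outer(mc - mu, mc - mu)
+    Sw /= len(feats)
+    Sb /= len(feats)
+    M = np.linalg.solve(Sw + eps * np.eye(d), Sb)          # the LDA matrix
+    eig = np.clip(np.real(np.linalg.eigvals(M)), eps, None)
+    p = eig / eig.sum()
+    return float(np.exp(-(p * np.log(p)).sum()))           # effective rank
+</pre>
+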
+ + ☆ Stable diffusion for Data Augmentation in COCO and Weed Datasets + + +
+ Generative models have increasingly impacted related tasks ranging from
+image revision and object detection in computer vision to interior design and
+idea illustration in more general fields. Stable Diffusion is an outstanding
+model series that paves the way for producing high-resolution images with
+thorough details from text prompts or reference images. How to leverage the
+capability of Stable Diffusion to increase the image variation of certain
+categories (e.g., vehicles, humans, and daily objects) is an interesting topic;
+in particular, it has the potential to bring improvements to small datasets
+with image-sparse categories. This study utilized seven categories in the
+popular COCO dataset and three widespread weed species in Michigan to evaluate
+the efficiency of a recent version of Stable Diffusion. Specifically, Stable
+Diffusion was used to generate synthetic images belonging to these classes;
+YOLOv8 models were then trained on these synthetic images, and their
+performance was compared to that of models trained on the original images. In
+addition, several Stable Diffusion techniques (e.g., image-to-image
+translation, DreamBooth, ControlNet) were leveraged for image generation with
+different focuses. Although the overall results were disappointing, promising
+results were achieved in some classes, illustrating the potential of Stable
+Diffusion models to improve the performance of detection models by conveying
+more helpful information to them through the generated images. This seminal
+study may expedite the adoption of Stable Diffusion models for classification
+and detection tasks in different fields.
+
+
+
+
+
+
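+ A hedged sketch of the recipe above using the diffusers and ultralytics
+packages: synthesise extra images of a target class with a Stable Diffusion
+pipeline, then train a YOLOv8 detector on a dataset that mixes real and
+synthetic images. The model IDs, prompt, output paths, and weeds.yaml file are
+placeholders, and the synthetic images still need bounding-box labels before
+detection training.
+<pre>
+from pathlib import Path
+import torch
+from diffusers import StableDiffusionPipeline
+from ultralytics import YOLO
+
+def synthesize(prompt, out_dir, n_images=50):
+    pipe = StableDiffusionPipeline.from_pretrained(
+        "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
+    ).to("cuda")
+    out = Path(out_dir)
+    out.mkdir(parents=True, exist_ok=True)
+    for i in range(n_images):
+        pipe(prompt).images[0].save(out / f"synthetic_{i:04d}.jpg")
+
+synthesize("a photo of common ragweed in a farm field",      # placeholder prompt
+           "datasets/weeds/images/synthetic")
+model = YOLO("yolov8n.pt")                                    # pretrained detector weights
+model.train(data="weeds.yaml", epochs=100)                    # yaml lists real + synthetic splits
+</pre>
+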
+ + ☆ Style Transfer to Calvin and Hobbes comics using Stable Diffusion + + +
+ This project report summarizes our journey to perform stable diffusion
+fine-tuning on a dataset containing Calvin and Hobbes comics. The purpose is to
+convert any given input image into the comic style of Calvin and Hobbes,
+essentially performing style transfer. We train stable-diffusion-v1.5 using Low
+Rank Adaptation (LoRA) to efficiently speed up the fine-tuning process. The
+denoising diffusion itself is handled by a U-Net operating in the latent space
+of a Variational Autoencoder (VAE). Our results were visually appealing for the
+amount of training time and the quality of input data that went into training.
+
+
+
+ comment: Project report for ECE 371Q Digital Image Processing at UT Austin +
+
+
+
+
+ + ☆ Improving Medical Report Generation with Adapter Tuning and Knowledge + Enhancement in Vision-Language Foundation Models + + +
+ Medical report generation demands automatic creation of coherent and precise +descriptions for medical images. However, the scarcity of labelled medical +image-report pairs poses formidable challenges in developing large-scale neural +networks capable of harnessing the potential of artificial intelligence, +exemplified by large language models. This study builds upon the +state-of-the-art vision-language pre-training and fine-tuning approach, BLIP-2, +to customize general large-scale foundation models. Integrating adapter tuning +and a medical knowledge enhancement loss, our model significantly improves +accuracy and coherence. Validation on the dataset of ImageCLEFmedical 2023 +demonstrates our model's prowess, achieving the best-averaged results against +several state-of-the-art methods. Significant improvements in ROUGE and CIDEr +underscore our method's efficacy, highlighting promising outcomes for the rapid +medical-domain adaptation of the vision-language foundation models in +addressing challenges posed by data scarcity. + +
+
+
+
+
+ + ♻ ☆ SceneDreamer: Unbounded 3D Scene Generation from 2D Image Collections + + +
+ In this work, we present SceneDreamer, an unconditional generative model for +unbounded 3D scenes, which synthesizes large-scale 3D landscapes from random +noise. Our framework is learned from in-the-wild 2D image collections only, +without any 3D annotations. At the core of SceneDreamer is a principled +learning paradigm comprising 1) an efficient yet expressive 3D scene +representation, 2) a generative scene parameterization, and 3) an effective +renderer that can leverage the knowledge from 2D images. Our approach begins +with an efficient bird's-eye-view (BEV) representation generated from simplex +noise, which includes a height field for surface elevation and a semantic field +for detailed scene semantics. This BEV scene representation enables 1) +representing a 3D scene with quadratic complexity, 2) disentangled geometry and +semantics, and 3) efficient training. Moreover, we propose a novel generative +neural hash grid to parameterize the latent space based on 3D positions and +scene semantics, aiming to encode generalizable features across various scenes. +Lastly, a neural volumetric renderer, learned from 2D image collections through +adversarial training, is employed to produce photorealistic images. Extensive +experiments demonstrate the effectiveness of SceneDreamer and superiority over +state-of-the-art methods in generating vivid yet diverse unbounded 3D worlds. + +
+
+ comment: IEEE Transactions on Pattern Analysis & Machine Intelligence (TPAMI) + 2023; Project Page https://scene-dreamer.github.io/ Code + https://github.com/FrozenBurning/SceneDreamer +
+
+
+
+
+ + ♻ ☆ Plotting Behind the Scenes: Towards Learnable Game Engines + + +
+ Neural video game simulators have emerged as powerful tools to generate and
+edit videos. Their idea is to represent games as the evolution of an
+environment's state driven by the actions of its agents. While such a paradigm
+enables users to play a game action-by-action, its rigidity precludes more
+semantic forms of control. To overcome this limitation, we augment game models
+with prompts specified as a set of natural language actions and desired states.
+The result, a Promptable Game Model (PGM), makes it possible for a user to play
+the game by prompting it with high- and low-level action sequences. Most
+captivatingly, our PGM unlocks the director's mode, where the game is played by
+specifying goals for the agents in the form of a prompt. This requires learning
+"game AI", encapsulated by our animation model, to navigate the scene using
+high-level constraints, play against an adversary, and devise a strategy to win
+a point. To render the resulting state, we use a compositional NeRF
+representation encapsulated in our synthesis model. To foster future research,
+we present newly collected, annotated and calibrated Tennis and Minecraft
+datasets. Our method significantly outperforms existing neural video game
+simulators in terms of rendering quality and unlocks applications beyond the
+capabilities of the current state of the art. Our framework, data, and models
+are available at https://snap-research.github.io/promptable-game-models/.
+
+
+
+ comment: ACM Transactions on Graphics. © Copyright is held by the
+ owner/author(s) 2023. This is the author's version of the work. It is posted
+ here for your personal use. Not for redistribution. The definitive Version of
+ Record was published in ACM Transactions on Graphics,
+ http://dx.doi.org/10.1145/3635705
+
+
+
+
+
+ + ♻ ☆ Listen to Look into the Future: Audio-Visual Egocentric Gaze + Anticipation + + +
+ Egocentric gaze anticipation serves as a key building block for the emerging +capability of Augmented Reality. Notably, gaze behavior is driven by both +visual cues and audio signals during daily activities. Motivated by this +observation, we introduce the first model that leverages both the video and +audio modalities for egocentric gaze anticipation. Specifically, we propose a +Contrastive Spatial-Temporal Separable (CSTS) fusion approach that adopts two +modules to separately capture audio-visual correlations in spatial and temporal +dimensions, and applies a contrastive loss on the re-weighted audio-visual +features from fusion modules for representation learning. We conduct extensive +ablation studies and thorough analysis using two egocentric video datasets: +Ego4D and Aria, to validate our model design. We also demonstrate our model +outperforms prior state-of-the-art methods by at least +1.9% and +1.6%. +Moreover, we provide visualizations to show the gaze anticipation results and +provide additional insights into audio-visual representation learning. + +
+
+ comment: 17 pages +
+
+
+
+
+ + ♻ ☆ If your data distribution shifts, use self-learning + + +
+ We demonstrate that self-learning techniques like entropy minimization and +pseudo-labeling are simple and effective at improving performance of a deployed +computer vision model under systematic domain shifts. We conduct a wide range +of large-scale experiments and show consistent improvements irrespective of the +model architecture, the pre-training technique or the type of distribution +shift. At the same time, self-learning is simple to use in practice because it +does not require knowledge or access to the original training data or scheme, +is robust to hyperparameter choices, is straight-forward to implement and +requires only a few adaptation epochs. This makes self-learning techniques +highly attractive for any practitioner who applies machine learning algorithms +in the real world. We present state-of-the-art adaptation results on CIFAR10-C +(8.5% error), ImageNet-C (22.0% mCE), ImageNet-R (17.4% error) and ImageNet-A +(14.8% error), theoretically study the dynamics of self-supervised adaptation +methods and propose a new classification dataset (ImageNet-D) which is +challenging even with adaptation. + +
+
+ comment: Web: https://domainadaptation.org/selflearning +
+
+
+
+
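+ A hedged sketch of test-time entropy minimisation as described above: adapt a
+deployed classifier on unlabelled shifted data by minimising the entropy of its
+own predictions for a few epochs. Adapting all parameters and the chosen
+hyperparameters are illustrative simplifications; practical recipes often
+restrict adaptation to normalisation layers.
+<pre>
+import torch
+import torch.nn.functional as F
+
+def adapt_by_entropy_minimisation(model, unlabelled_loader, lr=1e-4, epochs=2):
+    model.train()
+    opt = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
+    for _ in range(epochs):
+        for x, _ in unlabelled_loader:             # labels, if present, are never used
+            probs = F.softmax(model(x), dim=1)
+            entropy = -(probs * probs.clamp_min(1e-8).log()).sum(dim=1).mean()
+            opt.zero_grad()
+            entropy.backward()                      # self-learning signal: confident predictions
+            opt.step()
+    return model
+</pre>
+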
+ + ♻ ☆ ViCo: Plug-and-play Visual Condition for Personalized Text-to-image + Generation + + +
+ Personalized text-to-image generation using diffusion models has recently +emerged and garnered significant interest. This task learns a novel concept +(e.g., a unique toy), illustrated in a handful of images, into a generative +model that captures fine visual details and generates photorealistic images +based on textual embeddings. In this paper, we present ViCo, a novel +lightweight plug-and-play method that seamlessly integrates visual condition +into personalized text-to-image generation. ViCo stands out for its unique +feature of not requiring any fine-tuning of the original diffusion model +parameters, thereby facilitating more flexible and scalable model deployment. +This key advantage distinguishes ViCo from most existing models that +necessitate partial or full diffusion fine-tuning. ViCo incorporates an image +attention module that conditions the diffusion process on patch-wise visual +semantics, and an attention-based object mask that comes at no extra cost from +the attention module. Despite only requiring light parameter training (~6% +compared to the diffusion U-Net), ViCo delivers performance that is on par +with, or even surpasses, all state-of-the-art models, both qualitatively and +quantitatively. This underscores the efficacy of ViCo, making it a highly +promising solution for personalized text-to-image generation without the need +for diffusion model fine-tuning. Code: https://github.com/haoosz/ViCo + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ MapFormer: Boosting Change Detection by Using Pre-change Information ICCV 2023 + + +
+ Change detection in remote sensing imagery is essential for a variety of
+applications such as urban planning, disaster management, and climate research.
+However, existing methods for identifying semantically changed areas overlook
+the availability of semantic information in the form of existing maps
+describing features of the earth's surface. In this paper, we leverage this
+information for change detection in bi-temporal images. We show that the simple
+integration of the additional information via concatenation of latent
+representations suffices to significantly outperform state-of-the-art change
+detection methods. Motivated by this observation, we propose the new task of
+*Conditional Change Detection*, where pre-change semantic information is used
+as input next to bi-temporal images. To fully exploit the extra information, we
+propose *MapFormer*, a novel architecture based on a multi-modal feature fusion
+module that allows for feature processing conditioned on the available semantic
+information. We further employ a supervised, cross-modal contrastive loss to
+guide the learning of visual representations. Our approach outperforms existing
+change detection methods by an absolute 11.7% and 18.4% in terms of binary
+change IoU on DynamicEarthNet and HRSCD, respectively. Furthermore, we
+demonstrate the robustness of our approach to the quality of the pre-change
+semantic information and the absence of pre-change imagery. The code is
+available at https://github.com/mxbh/mapformer.
+
+
+
+ comment: accepted at ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Point Cloud-based Proactive Link Quality Prediction for Millimeter-wave + Communications + + +
+ This study demonstrates the feasibility of point cloud-based proactive link +quality prediction for millimeter-wave (mmWave) communications. Previous +studies have proposed machine learning-based methods to predict received signal +strength for future time periods using time series of depth images to mitigate +the line-of-sight (LOS) path blockage by pedestrians in mmWave communication. +However, these image-based methods have limited applicability due to privacy +concerns as camera images may contain sensitive information. This study +proposes a point cloud-based method for mmWave link quality prediction and +demonstrates its feasibility through experiments. Point clouds represent +three-dimensional (3D) spaces as a set of points and are sparser and less +likely to contain sensitive information than camera images. Additionally, point +clouds provide 3D position and motion information, which is necessary for +understanding the radio propagation environment involving pedestrians. This +study designs the mmWave link quality prediction method and conducts realistic +indoor experiments, where the link quality fluctuates significantly due to +human blockage, using commercially available IEEE 802.11ad-based 60 GHz +wireless LAN devices and Kinect v2 RGB-D camera and Velodyne VLP-16 light +detection and ranging (LiDAR) for point cloud acquisition. The experimental +results showed that our proposed method can predict future large attenuation of +mmWave received signal strength and throughput induced by the LOS path blockage +by pedestrians with comparable or superior accuracy to image-based prediction +methods. Hence, our point cloud-based method can serve as a viable alternative +to image-based methods. + +
+
+
+
+
+ + ♻ ☆ MAELi: Masked Autoencoder for Large-Scale LiDAR Point Clouds WACV 2024 + + +
+ The sensing process of large-scale LiDAR point clouds inevitably causes large +blind spots, i.e. regions not visible to the sensor. We demonstrate how these +inherent sampling properties can be effectively utilized for self-supervised +representation learning by designing a highly effective pre-training framework +that considerably reduces the need for tedious 3D annotations to train +state-of-the-art object detectors. Our Masked AutoEncoder for LiDAR point +clouds (MAELi) intuitively leverages the sparsity of LiDAR point clouds in both +the encoder and decoder during reconstruction. This results in more expressive +and useful initialization, which can be directly applied to downstream +perception tasks, such as 3D object detection or semantic segmentation for +autonomous driving. In a novel reconstruction approach, MAELi distinguishes +between empty and occluded space and employs a new masking strategy that +targets the LiDAR's inherent spherical projection. Thereby, without any ground +truth whatsoever and trained on single frames only, MAELi obtains an +understanding of the underlying 3D scene geometry and semantics. To demonstrate +the potential of MAELi, we pre-train backbones in an end-to-end manner and show +the effectiveness of our unsupervised pre-trained weights on the tasks of 3D +object detection and semantic segmentation. + +
+
+ comment: Accepted to WACV 2024, 16 pages +
+
+
+
+
+ + ♻ ☆ CustomNet: Zero-shot Object Customization with Variable-Viewpoints in + Text-to-Image Diffusion Models + + +
+ Incorporating a customized object into image generation presents an +attractive feature in text-to-image generation. However, existing +optimization-based and encoder-based methods are hindered by drawbacks such as +time-consuming optimization, insufficient identity preservation, and a +prevalent copy-pasting effect. To overcome these limitations, we introduce +CustomNet, a novel object customization approach that explicitly incorporates +3D novel view synthesis capabilities into the object customization process. +This integration facilitates the adjustment of spatial position relationships +and viewpoints, yielding diverse outputs while effectively preserving object +identity. Moreover, we introduce delicate designs to enable location control +and flexible background control through textual descriptions or specific +user-defined images, overcoming the limitations of existing 3D novel view +synthesis methods. We further leverage a dataset construction pipeline that can +better handle real-world objects and complex backgrounds. Equipped with these +designs, our method facilitates zero-shot object customization without +test-time optimization, offering simultaneous control over the viewpoints, +location, and background. As a result, our CustomNet ensures enhanced identity +preservation and generates diverse, harmonious outputs. + +
+
+ comment: Project webpage available at https://jiangyzy.github.io/CustomNet/ +
+
+
+
+
+ + ♻ ☆ How (not) to ensemble LVLMs for VQA NeurIPS + 2023 + + +
+ This paper studies ensembling in the era of Large Vision-Language Models +(LVLMs). Ensembling is a classical method to combine different models to get +increased performance. In the recent work on Encyclopedic-VQA the authors +examine a wide variety of models to solve their task: from vanilla LVLMs, to +models including the caption as extra context, to models augmented with +Lens-based retrieval of Wikipedia pages. Intuitively these models are highly +complementary, which should make them ideal for ensembling. Indeed, an oracle +experiment shows potential gains from 48.8% accuracy (the best single model) +all the way up to 67% (best possible ensemble). So it is a trivial exercise to +create an ensemble with substantial real gains. Or is it? + +
+
+ comment: 4th I Can't Believe It's Not Better Workshop (co-located with NeurIPS + 2023) +
+
+
+
+
+ + ♻ ☆ D2S: Representing local descriptors and global scene coordinates for + camera relocalization + + +
+ State-of-the-art visual localization methods mostly rely on complex +procedures to match local descriptors and 3D point clouds. However, these +procedures can incur significant cost in terms of inference, storage, and +updates over time. In this study, we propose a direct learning-based approach +that utilizes a simple network named D2S to represent local descriptors and +their scene coordinates. Our method is characterized by its simplicity and +cost-effectiveness. It solely leverages a single RGB image for localization +during the testing phase and only requires a lightweight model to encode a +complex sparse scene. The proposed D2S employs a combination of a simple loss +function and graph attention to selectively focus on robust descriptors while +disregarding areas such as clouds, trees, and several dynamic objects. This +selective attention enables D2S to effectively perform a binary-semantic +classification for sparse descriptors. Additionally, we propose a new outdoor +dataset to evaluate the capabilities of visual localization methods in terms of +scene generalization and self-updating from unlabeled observations. Our +approach outperforms the state-of-the-art CNN-based methods in scene coordinate +regression in indoor and outdoor environments. It demonstrates the ability to +generalize beyond training data, including scenarios involving transitions from +day to night and adapting to domain shifts, even in the absence of the labeled +data sources. The source code, trained models, dataset, and demo videos are +available at the following link: https://thpjp.github.io/d2s + +
+
+
+
+
+ + ♻ ☆ HiFi4G: High-Fidelity Human Performance Rendering via Compact Gaussian + Splatting + + +
+ We have recently seen tremendous progress in photo-real human modeling and +rendering. Yet, efficiently rendering realistic human performance and +integrating it into the rasterization pipeline remains challenging. In this +paper, we present HiFi4G, an explicit and compact Gaussian-based approach for +high-fidelity human performance rendering from dense footage. Our core +intuition is to marry the 3D Gaussian representation with non-rigid tracking, +achieving a compact and compression-friendly representation. We first propose a +dual-graph mechanism to obtain motion priors, with a coarse deformation graph +for effective initialization and a fine-grained Gaussian graph to enforce +subsequent constraints. Then, we utilize a 4D Gaussian optimization scheme with +adaptive spatial-temporal regularizers to effectively balance the non-rigid +prior and Gaussian updating. We also present a companion compression scheme +with residual compensation for immersive experiences on various platforms. It +achieves a substantial compression rate of approximately 25 times, with less +than 2MB of storage per frame. Extensive experiments demonstrate the +effectiveness of our approach, which significantly outperforms existing +approaches in terms of optimization speed, rendering quality, and storage +overhead. + +
+
+
+
+
+ + ♻ ☆ Dense Optical Tracking: Connecting the Dots + + +
+ Recent approaches to point tracking are able to recover the trajectory of any +scene point through a large portion of a video despite the presence of +occlusions. They are, however, too slow in practice to track every point +observed in a single frame in a reasonable amount of time. This paper +introduces DOT, a novel, simple and efficient method for solving this problem. +It first extracts a small set of tracks from key regions at motion boundaries +using an off-the-shelf point tracking algorithm. Given source and target +frames, DOT then computes rough initial estimates of a dense flow field and +visibility mask through nearest-neighbor interpolation, before refining them +using a learnable optical flow estimator that explicitly handles occlusions and +can be trained on synthetic data with ground-truth correspondences. We show +that DOT is significantly more accurate than current optical flow techniques, +outperforms sophisticated "universal" trackers like OmniMotion, and is on par +with, or better than, the best point tracking algorithms like CoTracker while +being at least two orders of magnitude faster. Quantitative and qualitative +experiments with synthetic and real videos validate the promise of the proposed +approach. Code, data, and videos showcasing the capabilities of our approach +are available in the project webpage: https://16lemoing.github.io/dot . + +
+
+
+
+
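+ A hedged sketch of the initialisation step described above: given sparse
+tracked points and their displacements to a target frame, a rough dense flow
+field is produced by nearest-neighbour interpolation; the learnable refinement
+that handles occlusions is not shown.
+<pre>
+import numpy as np
+from scipy.spatial import cKDTree
+
+def dense_flow_init(src_pts, flows, height, width):
+    """src_pts: (N, 2) tracked (x, y) points; flows: (N, 2) displacements to the target frame."""
+    tree = cKDTree(src_pts)
+    ys, xs = np.mgrid[0:height, 0:width]
+    grid = np.stack([xs.ravel(), ys.ravel()], axis=1)   # every pixel as an (x, y) query
+    _, nearest = tree.query(grid)                        # index of the closest tracked point
+    return flows[nearest].reshape(height, width, 2)      # rough dense flow, refined later
+</pre>
+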
+ + ♻ ☆ A Parameterized Generative Adversarial Network Using Cyclic Projection + for Explainable Medical Image Classification + + +
+ Although current data augmentation methods are successful in alleviating data
+insufficiency, conventional augmentation is primarily intra-domain, while the
+images generated by advanced generative adversarial networks (GANs) remain
+uncertain in quality, particularly on small-scale datasets. In this paper, we
+propose a parameterized GAN (ParaGAN) that effectively controls the changes of
+synthetic samples among domains and highlights the attention regions for
+downstream classification. Specifically, ParaGAN incorporates projection
+distance parameters in cyclic projection and projects the source images to the
+decision boundary to obtain the class-difference maps. Our experiments show
+that ParaGAN can consistently outperform the existing augmentation methods with
+explainable classification on two small-scale medical datasets.
+
+
+
+ comment: 5 pages, 4 figures. This work has been submitted to the IEEE for + possible publication. Copyright may be transferred without notice, after + which this version may no longer be accessible +
+
+
+
+
+ + ♻ ☆ Cam4DOcc: Benchmark for Camera-Only 4D Occupancy Forecasting in + Autonomous Driving Applications + + +
+ Understanding how the surrounding environment changes is crucial for +performing downstream tasks safely and reliably in autonomous driving +applications. Recent occupancy estimation techniques using only camera images +as input can provide dense occupancy representations of large-scale scenes +based on the current observation. However, they are mostly limited to +representing the current 3D space and do not consider the future state of +surrounding objects along the time axis. To extend camera-only occupancy +estimation into spatiotemporal prediction, we propose Cam4DOcc, a new benchmark +for camera-only 4D occupancy forecasting, evaluating the surrounding scene +changes in a near future. We build our benchmark based on multiple publicly +available datasets, including nuScenes, nuScenes-Occupancy, and Lyft-Level5, +which provides sequential occupancy states of general movable and static +objects, as well as their 3D backward centripetal flow. To establish this +benchmark for future research with comprehensive comparisons, we introduce four +baseline types from diverse camera-based perception and prediction +implementations, including a static-world occupancy model, voxelization of +point cloud prediction, 2D-3D instance-based prediction, and our proposed novel +end-to-end 4D occupancy forecasting network. Furthermore, the standardized +evaluation protocol for preset multiple tasks is also provided to compare the +performance of all the proposed baselines on present and future occupancy +estimation with respect to objects of interest in autonomous driving scenarios. +The dataset and our implementation of all four baselines in the proposed +Cam4DOcc benchmark will be released here: https://github.com/haomo-ai/Cam4DOcc. + +
+
+
+
+
+ + ♻ ☆ visClust: A visual clustering algorithm based on orthogonal projections + + +
+ We present a novel clustering algorithm, visClust, that is based on lower
+dimensional data representations and visual interpretation. To this end, we
+design a transformation that allows the data to be represented by a binary
+integer array, enabling the use of image processing methods to select a
+partition. Qualitative and quantitative analyses, measured by accuracy and the
+adjusted Rand index, show that the algorithm performs well while requiring low
+runtime and RAM. We compare the results to 6 state-of-the-art algorithms with
+available code, confirming the quality of visClust by superior performance in
+most experiments. Moreover, the algorithm requires just one obligatory input
+parameter while allowing optimization via optional parameters. The code is made
+available on GitHub and is straightforward to use.
+
+
+
+ comment: 28 pages +
+
+
+
+
+ + ♻ ☆ 4D Gaussian Splatting for Real-Time Dynamic Scene Rendering + + +
+ Representing and rendering dynamic scenes has been an important but
+challenging task. Especially, to accurately model complex motions, high
+efficiency is usually hard to guarantee. To achieve real-time dynamic scene
+rendering while also enjoying high training and storage efficiency, we propose
+4D Gaussian Splatting (4D-GS) as a holistic representation for dynamic scenes
+rather than applying 3D-GS for each individual frame. In 4D-GS, a novel
+explicit representation containing both 3D Gaussians and 4D neural voxels is
+proposed. A decomposed neural voxel encoding algorithm inspired by HexPlane is
+proposed to efficiently build Gaussian features from 4D neural voxels and then
+a lightweight MLP is applied to predict Gaussian deformations at novel
+timestamps. Our 4D-GS method achieves real-time rendering under high
+resolutions, 82 FPS at an 800×800 resolution on an RTX 3090 GPU, while
+maintaining comparable or better quality than previous state-of-the-art
+methods. More demos and code are available at
+https://guanjunwu.github.io/4dgs/.
+
+
+
+ comment: Project page: https://guanjunwu.github.io/4dgs/ +
+
+
+
+
+ + ♻ ☆ Similarity of Neural Architectures using Adversarial Attack + Transferability + + +
+ In recent years, many deep neural architectures have been developed for image
+classification. Whether they are similar or dissimilar, and which factors
+contribute to their (dis)similarities, remains an open question. To address
+this question, we aim to design a quantitative and scalable similarity measure
+between neural architectures. We propose Similarity by Attack Transferability
+(SAT) from the observation that adversarial attack transferability contains
+information related to input gradients and decision boundaries widely used to
+understand model behaviors. We conduct a large-scale analysis on 69
+state-of-the-art ImageNet classifiers using our proposed similarity function to
+answer the question. Moreover, using model similarity, we observe
+architecture-related phenomena: model diversity can lead to better performance
+in model ensembles and knowledge distillation under specific conditions. Our
+results provide insights into why developing diverse neural architectures with
+distinct components is necessary.
+
+
+
+ comment: 20pages, 13 figures, 2.3MB +
+
+
+
+
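+ A hedged sketch of an attack-transferability similarity between two
+classifiers: craft FGSM adversarial examples on each model and measure how
+often they also fool the other. The symmetric fooling rate is used here as a
+crude proxy; the paper's SAT score may be defined differently.
+<pre>
+import torch
+import torch.nn.functional as F
+
+def fgsm(model, x, y, eps=4 / 255):
+    x = x.clone().requires_grad_(True)
+    F.cross_entropy(model(x), y).backward()
+    model.zero_grad(set_to_none=True)                 # discard parameter gradients
+    return (x + eps * x.grad.sign()).clamp(0, 1).detach()
+
+@torch.no_grad()
+def fooled(model, x_adv, y):
+    return (model(x_adv).argmax(1) != y).float().mean().item()
+
+def attack_similarity(model_a, model_b, x, y):
+    adv_a, adv_b = fgsm(model_a, x, y), fgsm(model_b, x, y)
+    # average cross-model fooling rate as a similarity proxy
+    return 0.5 * (fooled(model_b, adv_a, y) + fooled(model_a, adv_b, y))
+</pre>
+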
+ + ♻ ☆ Universal Segmentation at Arbitrary Granularity with Language + Instruction + + +
+ This paper aims to achieve universal segmentation at an arbitrary semantic
+level. Despite significant progress in recent years, specialist segmentation
+approaches are limited to specific tasks and data distributions. Retraining a
+new model to adapt to new scenarios or settings incurs expensive computation
+and time costs, which raises the demand for a versatile and universal
+segmentation model that can cater to various granularities. Although some
+attempts have been made to unify different segmentation tasks or generalize to
+various scenarios, limitations in the definition of paradigms and input-output
+spaces make it difficult for them to achieve an accurate understanding of
+content at arbitrary granularity. To this end, we present UniLSeg, a universal
+segmentation model that can perform segmentation at any semantic level with the
+guidance of language instructions. For training UniLSeg, we reorganize a group
+of tasks from original diverse distributions into a unified data format, where
+images with texts describing the segmentation targets serve as input and the
+corresponding masks are the output. Combined with an automatic annotation
+engine for utilizing numerous unlabeled data, UniLSeg achieves excellent
+performance on various tasks and settings, surpassing both specialist and
+unified segmentation models.
+
+
+
+
+
+
+ + ♻ ☆ SILC: Improving Vision Language Pretraining with Self-Distillation + + +
+ Image-Text pretraining on web-scale image caption datasets has become the
+default recipe for open vocabulary classification and retrieval models thanks
+to the success of CLIP and its variants. Several works have also used CLIP
+features for dense prediction tasks and have shown the emergence of open-set
+abilities. However, the contrastive objective used by these models only focuses
+on image-text alignment and does not incentivise image feature learning for
+dense prediction tasks. In this work, we introduce SILC, a novel framework for
+vision language pretraining. SILC improves image-text contrastive learning with
+the simple addition of local-to-global correspondence learning by
+self-distillation. We show that distilling local image features from an
+exponential moving average (EMA) teacher model significantly improves model
+performance on dense prediction tasks like detection and segmentation, while
+also providing improvements on image-level tasks such as classification and
+retrieval. SILC models set a new state of the art for zero-shot classification,
+few-shot classification, image and text retrieval, zero-shot segmentation, and
+open vocabulary segmentation. We further show that SILC features greatly
+benefit open vocabulary detection, captioning and visual question answering.
+
+
+
+
+
+
+ + ♻ ☆ XAI-TRIS: Non-linear image benchmarks to quantify false positive + post-hoc attribution of feature importance + + +
+ The field of 'explainable' artificial intelligence (XAI) has produced highly +cited methods that seek to make the decisions of complex machine learning (ML) +methods 'understandable' to humans, for example by attributing 'importance' +scores to input features. Yet, a lack of formal underpinning leaves it unclear +as to what conclusions can safely be drawn from the results of a given XAI +method and has also so far hindered the theoretical verification and empirical +validation of XAI methods. This means that challenging non-linear problems, +typically solved by deep neural networks, presently lack appropriate remedies. +Here, we craft benchmark datasets for three different non-linear classification +scenarios, in which the important class-conditional features are known by +design, serving as ground truth explanations. Using novel quantitative metrics, +we benchmark the explanation performance of a wide set of XAI methods across +three deep learning model architectures. We show that popular XAI methods are +often unable to significantly outperform random performance baselines and edge +detection methods. Moreover, we demonstrate that explanations derived from +different model architectures can be vastly different; thus, prone to +misinterpretation even under controlled conditions. + +
+
+ comment: Under review +
+
+
+
+
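+ A hedged sketch of one simple check in the spirit of the benchmark above: with
+the ground-truth class-conditional pixels known by design, measure the
+precision of the top-k attributed pixels and compare it against the random
+baseline (the overall fraction of important pixels); the benchmark's own
+metrics may be defined differently.
+<pre>
+import numpy as np
+
+def topk_precision(saliency, gt_mask, k):
+    """saliency: (H, W) attribution map; gt_mask: (H, W) boolean ground-truth importance."""
+    top = np.argsort(saliency.ravel())[-k:]            # indices of the k highest scores
+    return float(gt_mask.ravel()[top].mean())          # fraction that hits truly important pixels
+
+def random_baseline(gt_mask):
+    return float(gt_mask.mean())                        # expected precision of random picks
+</pre>
+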
+ + ♻ ☆ Exploring the Interactive Guidance for Unified and Effective Image + Matting + + +
+ Recent image matting studies are developing towards proposing trimap-free or
+interactive methods to complete complex image matting tasks. Although they
+avoid the extensive labor of trimap annotation, existing methods still suffer
+from two limitations: (1) for a single image with multiple objects, it is
+essential to provide extra interaction information to help determine the
+matting target; (2) for transparent objects, accurately regressing the alpha
+matte from an RGB image is much more difficult than for opaque ones. In this
+work, we propose a Unified Interactive image Matting method, named UIM, which
+solves the limitations and achieves satisfactory matting results for any
+scenario. Specifically, UIM leverages multiple types of user interaction to
+avoid the ambiguity of multiple matting targets, and we compare the pros and
+cons of different annotation types in detail. To unify the matting performance
+for transparent and opaque objects, we decouple image matting into two stages,
+i.e., foreground segmentation and transparency prediction. Moreover, we design
+a multi-scale attentive fusion module to alleviate the vagueness in the
+boundary region. Experimental results demonstrate that UIM achieves
+state-of-the-art performance on the Composition-1K test set and a synthetic
+unified dataset.
+
+
+
+
+
+
+ + ♻ ☆ Temporal Shuffling for Defending Deep Action Recognition Models against + Adversarial Attacks + + +
+ Recently, video-based action recognition methods using convolutional neural
+networks (CNNs) have achieved remarkable recognition performance. However,
+there is still a lack of understanding about the generalization mechanism of
+action recognition models. In this paper, we suggest that action recognition
+models rely on motion information less than expected, and thus they are robust
+to randomization of frame order. Furthermore, we find that motion monotonicity
+remaining after randomization also contributes to such robustness. Based on
+this observation, we develop a novel defense method using temporal shuffling of
+input videos against adversarial attacks for action recognition models. Another
+observation enabling our defense method is that adversarial perturbations on
+videos are sensitive to temporal destruction. To the best of our knowledge,
+this is the first attempt to design a defense method without additional
+training for 3D CNN-based video action recognition models.
+
+
+
+ comment: 12 pages, accepted to Neural Networks +
+
+
+
+
+ + ♻ ☆ Open-vocabulary object 6D pose estimation + + +
+ We introduce the new setting of open-vocabulary object 6D pose estimation, in +which a textual prompt is used to specify the object of interest. In contrast +to existing approaches, in our setting (i) the object of interest is specified +solely through the textual prompt, (ii) no object model (e.g. CAD or video +sequence) is required at inference, (iii) the object is imaged from two +different viewpoints of two different scenes, and (iv) the object was not +observed during the training phase. To operate in this setting, we introduce a +novel approach that leverages a Vision-Language Model to segment the object of +interest from two distinct scenes and to estimate its relative 6D pose. The key +of our approach is a carefully devised strategy to fuse object-level +information provided by the prompt with local image features, resulting in a +feature space that can generalize to novel concepts. We validate our approach +on a new benchmark based on two popular datasets, REAL275 and Toyota-Light, +which collectively encompass 39 object instances appearing in four thousand +image pairs. The results demonstrate that our approach outperforms both a +well-established hand-crafted method and a recent deep learning-based baseline +in estimating the relative 6D pose of objects in different scenes. Project +page: https://jcorsetti.github.io/oryon/. + +
+
+ comment: Technical report. 21 pages, 15 figures, 6 tables. Updated website + link +
+
+
+
+
+ + ♻ ☆ DualGenerator: Information Interaction-based Generative Network for + Point Cloud Completion + + +
+ Point cloud completion estimates complete shapes from incomplete point clouds
+to obtain higher-quality point cloud data. Most existing methods only consider
+global object features, ignoring spatial and semantic information of adjacent
+points. They cannot distinguish structural information well between different
+object parts, and the robustness of models is poor. To tackle these challenges,
+we propose an information interaction-based generative network for point cloud
+completion (DualGenerator). It contains an adversarial generation path and a
+variational generation path, which interact with each other and share weights.
+DualGenerator introduces a local refinement module in generation paths, which
+captures general structures from partial inputs, and then refines shape details
+of the point cloud. It promotes completion in the unknown region and makes a
+distinction between different parts more obvious. Moreover, we design
+DGStyleGAN to improve the generation quality further. It promotes the
+robustness of this network combined with fusion analysis of dual-path
+completion results. Qualitative and quantitative evaluations demonstrate that
+our method is superior on MVP and Completion3D datasets. The performance will
+not degrade significantly after adding noise interference or sparse sampling.
+
+
+
+
+
+
+ + ♻ ☆ Unveiling Objects with SOLA: An Annotation-Free Image Search on the + Object Level for Automotive Data Sets + + +
+ Huge image data sets are the foundation for developing the perception systems
+of automated driving. A large number of images is necessary to train robust
+neural networks that can cope with diverse situations. A sufficiently large
+data set contains challenging situations and objects. For testing the resulting
+functions, it is necessary that these situations and objects can be found and
+extracted from the data set. While it is relatively easy to record a large
+amount of unlabeled data, it is far more difficult to find demanding situations
+and objects. However, during the development of perception systems, it must be
+possible to access challenging data without having to perform lengthy and
+time-consuming annotations. A developer must therefore be able to search
+dynamically for specific situations and objects in a data set. Thus, we
+designed a method based on state-of-the-art neural networks to search for
+objects with certain properties within an image. For ease of use, the query of
+this search is described using natural language. To determine the time savings
+and performance gains, we evaluated our method qualitatively and quantitatively
+on automotive data sets.
+
+
+
+
+
+
+ + ♻ ☆ Assessing hierarchies by their consistent segmentations + + +
+ Current approaches to generic segmentation start by creating a hierarchy of +nested image partitions and then specifying a segmentation from it. Our first +contribution is to describe several ways, most of them new, for specifying +segmentations using the hierarchy elements. Then, we consider the best +hierarchy-induced segmentation specified by a limited number of hierarchy +elements. We focus on a common quality measure for binary segmentations, the +Jaccard index (also known as IoU). Optimizing the Jaccard index is highly +non-trivial, and yet we propose an efficient approach for doing exactly that. +This way we get algorithm-independent upper bounds on the quality of any +segmentation created from the hierarchy. We found that the obtainable +segmentation quality varies significantly depending on the way that the +segments are specified by the hierarchy elements, and that representing a +segmentation with only a few hierarchy elements is often possible. (Code is +available). + +
+
+
+
+
+ + ♻ ☆ A New Fine-grained Alignment Method for Image-text Matching + + +
+ Image-text retrieval, whose core task is to measure the similarity between
+images and text, is a widely studied topic in computer vision due to the
+exponential growth of multimedia data. However, most existing retrieval methods
+rely heavily on cross-attention mechanisms for cross-modal fine-grained
+alignment, which take excessive irrelevant regions into account and treat
+prominent and non-significant words equally, thereby limiting retrieval
+accuracy. This paper investigates an alignment approach that reduces the
+involvement of non-significant fragments in images and text while enhancing the
+alignment of prominent segments. For this purpose, we introduce the Cross-Modal
+Prominent Fragments Enhancement Aligning Network (CPFEAN), which achieves
+improved retrieval accuracy by diminishing the participation of irrelevant
+regions during alignment and relatively increasing the alignment similarity of
+prominent words. Additionally, we incorporate prior textual information into
+image regions to reduce misalignment occurrences. In practice, we first design
+a novel intra-modal fragment relationship reasoning method, and subsequently
+employ our proposed alignment mechanism to compute the similarity between
+images and text. Extensive quantitative comparative experiments on the MS-COCO
+and Flickr30K datasets demonstrate that our approach outperforms
+state-of-the-art methods by about 5% to 10% in the rSum metric.
+
+
+
+
+
+ + ♻ ☆ HODN: Disentangling Human-Object Feature for HOI Detection + + +
+ The task of Human-Object Interaction (HOI) detection is to detect humans and +their interactions with surrounding objects, where transformer-based methods +show dominant advances currently. However, these methods ignore the +relationship among humans, objects, and interactions: 1) human features are +more contributive than object ones to interaction prediction; 2) interactive +information disturbs the detection of objects but helps human detection. In +this paper, we propose a Human and Object Disentangling Network (HODN) to model +the HOI relationships explicitly, where humans and objects are first detected +by two disentangling decoders independently and then processed by an +interaction decoder. Considering that human features are more contributive to +interaction, we propose a Human-Guide Linking method to make sure the +interaction decoder focuses on the human-centric regions with human features as +the positional embeddings. To handle the opposite influences of interactions on +humans and objects, we propose a Stop-Gradient Mechanism to stop interaction +gradients from optimizing the object detection but to allow them to optimize +the human detection. Our proposed method achieves competitive performance on +both the V-COCO and the HICO-Det datasets. It can be combined with existing +methods easily for state-of-the-art results. + +
+
+ comment: Accepted by TMM 2023 +
+
+
+
+
+ + ♻ ☆ ProSpect: Prompt Spectrum for Attribute-Aware Personalization of + Diffusion Models + + +
+ Personalizing generative models offers a way to guide image generation with
+user-provided references. Current personalization methods can invert an object
+or concept into the textual conditioning space and compose new natural
+sentences for text-to-image diffusion models. However, representing and editing
+specific visual attributes such as material, style, and layout remains a
+challenge, leading to a lack of disentanglement and editability. To address
+this problem, we propose a novel approach that leverages the step-by-step
+generation process of diffusion models, which generate images from low to high
+frequency information, providing a new perspective on representing, generating,
+and editing images. We develop the Prompt Spectrum Space P*, an expanded
+textual conditioning space, and a new image representation method called
+ProSpect. ProSpect represents an image as a collection of inverted textual
+token embeddings encoded from per-stage prompts, where each prompt corresponds
+to a specific generation stage (i.e., a group of consecutive steps) of the
+diffusion model. Experimental results demonstrate that P* and ProSpect offer
+better disentanglement and controllability compared to existing methods. We
+apply ProSpect in various personalized attribute-aware image generation
+applications, such as image-guided or text-driven manipulations of materials,
+style, and layout, achieving previously unattainable results from a single
+image input without fine-tuning the diffusion models. Our source code is
+available at https://github.com/zyxElsa/ProSpect.
+
+
+
+
+
+ + ♻ ☆ Adv-4-Adv: Thwarting Changing Adversarial Perturbations via Adversarial + Domain Adaptation + + +
+ Whereas adversarial training can be useful against specific adversarial
+perturbations, it has also proven ineffective at generalizing to attacks that
+deviate from those used for training. However, we observe that this
+ineffectiveness is intrinsically connected to domain adaptability, another
+crucial issue in deep learning for which adversarial domain adaptation appears
+to be a promising solution. Consequently, we propose Adv-4-Adv, a novel
+adversarial training method that aims to retain robustness against unseen
+adversarial perturbations. Essentially, Adv-4-Adv treats attacks incurring
+different perturbations as distinct domains, and by leveraging the power of
+adversarial domain adaptation, it aims to remove the domain/attack-specific
+features. This forces a trained model to learn a robust domain-invariant
+representation, which in turn enhances its generalization ability. Extensive
+evaluations on Fashion-MNIST, SVHN, CIFAR-10, and CIFAR-100 demonstrate that a
+model trained by Adv-4-Adv on samples crafted by simple attacks (e.g., FGSM)
+can generalize to more advanced attacks (e.g., PGD), and its performance
+exceeds that of state-of-the-art proposals on these datasets.
+
+
+ comment: 22 pages +
+
+
+
+
+ + ♻ ☆ Invariant Target Detection in Images through the Normalized 2-D + Correlation Technique + + +
+ The normalized 2-D correlation technique is a robust method for detecting
+targets in images due to its ability to remain invariant under rotation,
+translation, and scaling. This paper examines the impact of translation and
+scaling on target identification in images. The results indicate a high level
+of accuracy in detecting targets, even when they exhibit variations in location
+and size, and show that the similarity between the image and the two targets
+used improves as the resize ratio increases. All statistical estimators
+demonstrate a strong similarity between the original and extracted targets. The
+elapsed time for all scenarios falls within the ranges (44.75-44.85) and
+(37.48-37.73) seconds for the bird and children targets respectively, and the
+correlation coefficient displays stable relationships, with values within the
+ranges (0.90-0.98) and (0.87-0.93) for the bird and children targets
+respectively.
+
+
+ comment: The paper in its uploaded format contained errors in analysis as in + Abstract-4th and 5th line. In section 4 ; results and discussion and in + conclusion section. So, in order not to adopt the research as a source by + other researchers I wish to withdraw it completely +
+
+
+
+
+ + ♻ ☆ Point Cloud Attacks in Graph Spectral Domain: When 3D Geometry Meets + Graph Signal Processing + + +
+ With the increasing attention on various 3D safety-critical applications,
+point cloud learning models have been shown to be vulnerable to adversarial
+attacks. Although existing 3D attack methods achieve high success rates, they
+delve into the data space with point-wise perturbations, which may neglect
+geometric characteristics. Instead, we propose point cloud attacks from a new
+perspective -- the graph spectral domain attack -- aiming to perturb graph
+transform coefficients in the spectral domain that correspond to varying
+certain geometric structures. Specifically, leveraging graph signal processing,
+we first adaptively transform the coordinates of points onto the spectral
+domain via the graph Fourier transform (GFT) for compact representation. Then,
+we analyze the influence of different spectral bands on the geometric
+structure, based on which we propose to perturb the GFT coefficients via a
+learnable graph spectral filter. Considering that the low-frequency components
+mainly contribute to the rough shape of the 3D object, we further introduce a
+low-frequency constraint to limit perturbations to imperceptible high-frequency
+components. Finally, the adversarial point cloud is generated by transforming
+the perturbed spectral representation back to the data domain via the inverse
+GFT. Experimental results demonstrate the effectiveness of the proposed attack
+in terms of both imperceptibility and attack success rates.
+
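+ A minimal sketch of the graph-spectral pipeline described above (illustration
+only, not the authors' code; the kNN graph construction and the random
+high-frequency perturbation are simplifying assumptions, whereas the paper
+learns a spectral filter):
+
+    import numpy as np
+
+    def gft_perturb(points, k=10, noise_scale=0.01, low_freq_keep=32):
+        """Perturb an (N, 3) point cloud in the graph spectral domain."""
+        n = points.shape[0]
+        # Symmetric kNN adjacency over the points
+        d2 = ((points[:, None, :] - points[None, :, :]) ** 2).sum(-1)
+        idx = np.argsort(d2, axis=1)[:, 1:k + 1]
+        W = np.zeros((n, n))
+        W[np.repeat(np.arange(n), k), idx.ravel()] = 1.0
+        W = np.maximum(W, W.T)
+        # Graph Laplacian eigenbasis = GFT basis
+        L = np.diag(W.sum(1)) - W
+        _, U = np.linalg.eigh(L)
+        coeffs = U.T @ points                  # forward GFT of the coordinates
+        delta = noise_scale * np.random.randn(*coeffs.shape)
+        delta[:low_freq_keep] = 0.0            # low-frequency constraint: keep rough shape
+        return U @ (coeffs + delta)            # inverse GFT back to 3D points
+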
+
+ comment: Accepted to IEEE Transactions on Pattern Analysis and Machine + Intelligence (TPAMI). arXiv admin note: substantial text overlap with + arXiv:2202.07261 +
+
+
+
+
+ + ♻ ☆ Watching the News: Towards VideoQA Models that can Read + + +
+ Video Question Answering methods focus on commonsense reasoning and visual +cognition of objects or persons and their interactions over time. Current +VideoQA approaches ignore the textual information present in the video. +Instead, we argue that textual information is complementary to the action and +provides essential contextualisation cues to the reasoning process. To this +end, we propose a novel VideoQA task that requires reading and understanding +the text in the video. To explore this direction, we focus on news videos and +require QA systems to comprehend and answer questions about the topics +presented by combining visual and textual cues in the video. We introduce the +``NewsVideoQA'' dataset that comprises more than $8,600$ QA pairs on $3,000+$ +news videos obtained from diverse news channels from around the world. We +demonstrate the limitations of current Scene Text VQA and VideoQA methods and +propose ways to incorporate scene text information into VideoQA methods. + +
+
+
+
+
+ + ♻ ☆ OriCon3D: Effective 3D Object Detection using Orientation and Confidence + + +
+ We introduce a technique for detecting 3D objects and estimating their
+position from a single image. Our method is built on top of a similar
+state-of-the-art technique [1], but with improved accuracy. The approach
+followed in this research first estimates common 3D properties of an object
+using a Deep Convolutional Neural Network (DCNN), contrary to other frameworks
+that only leverage centre-point predictions. We then combine these estimates
+with geometric constraints provided by a 2D bounding box to produce a complete
+3D bounding box. The first output of our network estimates the 3D object
+orientation using a discrete-continuous loss [1]. The second output predicts
+the 3D object dimensions with minimal variance. We also extend the pipeline
+with lightweight feature extractors and a customized multibin architecture. By
+combining these estimates with the geometric constraints of the 2D bounding
+box, we determine the 3D object pose more accurately than, or comparably to,
+our baseline [1] on the KITTI 3D detection benchmark [2].
+
+
+
+
+
+ + ♻ ☆ PAPR: Proximity Attention Point Rendering + + +
+ Learning accurate and parsimonious point cloud representations of scene +surfaces from scratch remains a challenge in 3D representation learning. +Existing point-based methods often suffer from the vanishing gradient problem +or require a large number of points to accurately model scene geometry and +texture. To address these limitations, we propose Proximity Attention Point +Rendering (PAPR), a novel method that consists of a point-based scene +representation and a differentiable renderer. Our scene representation uses a +point cloud where each point is characterized by its spatial position, +influence score, and view-independent feature vector. The renderer selects the +relevant points for each ray and produces accurate colours using their +associated features. PAPR effectively learns point cloud positions to represent +the correct scene geometry, even when the initialization drastically differs +from the target geometry. Notably, our method captures fine texture details +while using only a parsimonious set of points. We also demonstrate four +practical applications of our method: zero-shot geometry editing, object +manipulation, texture transfer, and exposure control. More results and code are +available on our project website at https://zvict.github.io/papr/. + +
+
+
+
+
+ + ♻ ☆ Video Face Re-Aging: Toward Temporally Consistent Face Re-Aging + + +
+ Video face re-aging deals with altering the apparent age of a person to the +target age in videos. This problem is challenging due to the lack of paired +video datasets maintaining temporal consistency in identity and age. Most +re-aging methods process each image individually without considering the +temporal consistency of videos. While some existing works address the issue of +temporal coherence through video facial attribute manipulation in latent space, +they often fail to deliver satisfactory performance in age transformation. To +tackle the issues, we propose (1) a novel synthetic video dataset that features +subjects across a diverse range of age groups; (2) a baseline architecture +designed to validate the effectiveness of our proposed dataset, and (3) the +development of three novel metrics tailored explicitly for evaluating the +temporal consistency of video re-aging techniques. Our comprehensive +experiments on public datasets, such as VFHQ and CelebV-HQ, show that our +method outperforms the existing approaches in terms of both age transformation +and temporal consistency. + +
+
+ comment: 8 pages, 6 figures, 4 tables, Project page: + https://video-reaging.github.io/ +
+
+
+
+
+ + ♻ ☆ Steve-Eye: Equipping LLM-based Embodied Agents with Visual Perception in + Open Worlds + + +
+ Recent studies have presented compelling evidence that large language models +(LLMs) can equip embodied agents with the self-driven capability to interact +with the world, which marks an initial step toward versatile robotics. However, +these efforts tend to overlook the visual richness of open worlds, rendering +the entire interactive process akin to "a blindfolded text-based game." +Consequently, LLM-based agents frequently encounter challenges in intuitively +comprehending their surroundings and producing responses that are easy to +understand. In this paper, we propose Steve-Eye, an end-to-end trained large +multimodal model designed to address this limitation. Steve-Eye integrates the +LLM with a visual encoder which enables it to process visual-text inputs and +generate multimodal feedback. In addition, we use a semi-automatic strategy to +collect an extensive dataset comprising 850K open-world instruction pairs, +empowering our model to encompass three essential functions for an agent: +multimodal perception, foundational knowledge base, and skill prediction and +planning. Lastly, we develop three open-world evaluation benchmarks, then carry +out extensive experiments from a wide range of perspectives to validate our +model's capability to strategically act and plan. Codes and datasets will be +released. + +
+
+ comment: 19 pages, 19 figures +
+
+
+
+
+ + ♻ ☆ BM2CP: Efficient Collaborative Perception with LiDAR-Camera Modalities + + +
+ Collaborative perception enables agents to share complementary perceptual
+information with nearby agents. This improves perception performance and
+alleviates the issues of single-view perception, such as occlusion and
+sparsity. Most existing approaches focus mainly on a single modality
+(especially LiDAR) and do not fully exploit the superiority of multi-modal
+perception. We propose a collaborative perception paradigm, BM2CP, which
+employs LiDAR and camera to achieve efficient multi-modal perception. It
+utilizes LiDAR-guided modal fusion, cooperative depth generation and
+modality-guided intermediate fusion to acquire deep interactions among the
+modalities of different agents. Moreover, it is capable of coping with the
+special case where one of the sensors, of the same or a different type, of any
+agent is missing. Extensive experiments validate that our approach outperforms
+the state-of-the-art methods with 50X lower communication volumes in both
+simulated and real-world autonomous driving scenarios. Our code is available at
+https://github.com/byzhaoAI/BM2CP.
+
+
+ comment: 14 pages, 8 figures. Accepted by CoRL 2023 +
+
+
+
+
+ + ♻ ☆ Enhancing Few-shot CLIP with Semantic-Aware Fine-Tuning + + +
+ Learning generalized representations from limited training samples is crucial +for applying deep neural networks in low-resource scenarios. Recently, methods +based on Contrastive Language-Image Pre-training (CLIP) have exhibited +promising performance in few-shot adaptation tasks. To avoid catastrophic +forgetting and overfitting caused by few-shot fine-tuning, existing works +usually freeze the parameters of CLIP pre-trained on large-scale datasets, +overlooking the possibility that some parameters might not be suitable for +downstream tasks. To this end, we revisit CLIP's visual encoder with a specific +focus on its distinctive attention pooling layer, which performs a spatial +weighted-sum of the dense feature maps. Given that dense feature maps contain +meaningful semantic information, and different semantics hold varying +importance for diverse downstream tasks (such as prioritizing semantics like +ears and eyes in pet classification tasks rather than side mirrors), using the +same weighted-sum operation for dense features across different few-shot tasks +might not be appropriate. Hence, we propose fine-tuning the parameters of the +attention pooling layer during the training process to encourage the model to +focus on task-specific semantics. In the inference process, we perform residual +blending between the features pooled by the fine-tuned and the original +attention pooling layers to incorporate both the few-shot knowledge and the +pre-trained CLIP's prior knowledge. We term this method as Semantic-Aware +FinE-tuning (SAFE). SAFE is effective in enhancing the conventional few-shot +CLIP and is compatible with the existing adapter approach (termed SAFE-A). + +
+
+
+
+
+ + ♻ ☆ Class Incremental Learning for Adversarial Robustness + + +
+ Adversarial training integrates adversarial examples during model training to +enhance robustness. However, its application in fixed dataset settings differs +from real-world dynamics, where data accumulates incrementally. In this study, +we investigate Adversarially Robust Class Incremental Learning (ARCIL), a +method that combines adversarial robustness with incremental learning. We +observe that combining incremental learning with naive adversarial training +easily leads to a loss of robustness. We discover that this is attributed to +the disappearance of the flatness of the loss function, a characteristic of +adversarial training. To address this issue, we propose the Flatness Preserving +Distillation (FPD) loss that leverages the output difference between +adversarial and clean examples. Additionally, we introduce the Logit Adjustment +Distillation (LAD) loss, which adapts the model's knowledge to perform well on +new tasks. Experimental results demonstrate the superiority of our method over +approaches that apply adversarial training to existing incremental learning +methods, which provides a strong baseline for incremental learning on +adversarial robustness in the future. Our method achieves AutoAttack accuracy +that is 5.99\%p, 5.27\%p, and 3.90\%p higher on average than the baseline on +split CIFAR-10, CIFAR-100, and Tiny ImageNet, respectively. The code will be +made available. + +
+
+
+
+
+ + ♻ ☆ ZeroNLG: Aligning and Autoencoding Domains for Zero-Shot Multimodal and + Multilingual Natural Language Generation + + +
+ Natural Language Generation (NLG) accepts input data in the form of images, +videos, or text and generates corresponding natural language text as output. +Existing NLG methods mainly adopt a supervised approach and rely heavily on +coupled data-to-text pairs. However, for many targeted scenarios and for +non-English languages, sufficient quantities of labeled data are often not +available. To relax the dependency on labeled data of downstream tasks, we +propose an intuitive and effective zero-shot learning framework, ZeroNLG, which +can deal with multiple NLG tasks, including image-to-text (image captioning), +video-to-text (video captioning), and text-to-text (neural machine +translation), across English, Chinese, German, and French within a unified +framework. ZeroNLG does not require any labeled downstream pairs for training. +During training, ZeroNLG (i) projects different domains (across modalities and +languages) to corresponding coordinates in a shared common latent space; (ii) +bridges different domains by aligning their corresponding coordinates in this +space; and (iii) builds an unsupervised multilingual auto-encoder to learn to +generate text by reconstructing the input text given its coordinate in shared +latent space. Consequently, during inference, based on the data-to-text +pipeline, ZeroNLG can generate target sentences across different languages +given the coordinate of input data in the common space. Within this unified +framework, given visual (imaging or video) data as input, ZeroNLG can perform +zero-shot visual captioning; given textual sentences as input, ZeroNLG can +perform zero-shot machine translation. We present the results of extensive +experiments on twelve NLG tasks, showing that, without using any labeled +downstream pairs for training, ZeroNLG generates high-quality and believable +outputs and significantly outperforms existing zero-shot methods. + +
+
+ comment: Our code and data are available at + https://github.com/yangbang18/ZeroNLG +
+
+
+
+
+ + ♻ ☆ UFOGen: You Forward Once Large Scale Text-to-Image Generation via + Diffusion GANs + + +
+ Text-to-image diffusion models have demonstrated remarkable capabilities in +transforming textual prompts into coherent images, yet the computational cost +of their inference remains a persistent challenge. To address this issue, we +present UFOGen, a novel generative model designed for ultra-fast, one-step +text-to-image synthesis. In contrast to conventional approaches that focus on +improving samplers or employing distillation techniques for diffusion models, +UFOGen adopts a hybrid methodology, integrating diffusion models with a GAN +objective. Leveraging a newly introduced diffusion-GAN objective and +initialization with pre-trained diffusion models, UFOGen excels in efficiently +generating high-quality images conditioned on textual descriptions in a single +step. Beyond traditional text-to-image generation, UFOGen showcases versatility +in applications. Notably, UFOGen stands among the pioneering models enabling +one-step text-to-image generation and diverse downstream tasks, presenting a +significant advancement in the landscape of efficient generative models. + +
+
+
+
+
+ + ♻ ☆ VividTalk: One-Shot Audio-Driven Talking Head Generation Based on 3D + Hybrid Prior + + +
+ Audio-driven talking head generation has drawn much attention in recent
+years, and many efforts have been made in lip-sync, expressive facial
+expressions, natural head pose generation, and high video quality. However, no
+model has yet led or tied on all these metrics due to the one-to-many mapping
+between audio and motion. In this paper, we propose VividTalk, a two-stage
+generic framework that supports generating high-visual-quality talking head
+videos with all the above properties. Specifically, in the first stage, we map
+the audio to mesh by learning two motions, including non-rigid expression
+motion and rigid head motion. For expression motion, both blendshape and vertex
+representations are adopted as the intermediate representation to maximize the
+representation ability of the model. For natural head motion, a novel learnable
+head pose codebook with a two-phase training mechanism is proposed. In the
+second stage, we propose a dual-branch motion-VAE and a generator to transform
+the meshes into dense motion and synthesize high-quality video frame-by-frame.
+Extensive experiments show that the proposed VividTalk can generate
+high-visual-quality talking head videos with greatly enhanced lip-sync and
+realism, and outperforms previous state-of-the-art works in objective and
+subjective comparisons.
+
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ A Task is Worth One Word: Learning with Task Prompts for High-Quality + Versatile Image Inpainting + + +
+ Achieving high-quality versatile image inpainting, where user-specified +regions are filled with plausible content according to user intent, presents a +significant challenge. Existing methods face difficulties in simultaneously +addressing context-aware image inpainting and text-guided object inpainting due +to the distinct optimal training strategies required. To overcome this +challenge, we introduce PowerPaint, the first high-quality and versatile +inpainting model that excels in both tasks. First, we introduce learnable task +prompts along with tailored fine-tuning strategies to guide the model's focus +on different inpainting targets explicitly. This enables PowerPaint to +accomplish various inpainting tasks by utilizing different task prompts, +resulting in state-of-the-art performance. Second, we demonstrate the +versatility of the task prompt in PowerPaint by showcasing its effectiveness as +a negative prompt for object removal. Additionally, we leverage prompt +interpolation techniques to enable controllable shape-guided object inpainting. +Finally, we extensively evaluate PowerPaint on various inpainting benchmarks to +demonstrate its superior performance for versatile image inpainting. We release +our codes and models on our project page: https://powerpaint.github.io/. + +
+
+ comment: Project page with code: https://powerpaint.github.io/ +
+
+
+
+
+ + ♻ ☆ FIANCEE: Faster Inference of Adversarial Networks via Conditional Early + Exits + + +
+ Generative DNNs are a powerful tool for image synthesis, but they are limited
+by their computational load. On the other hand, given a trained model and a
+task, e.g. face generation within a range of characteristics, the output image
+quality will be unevenly distributed among images with different
+characteristics. It follows that we might restrain the model's complexity on
+some instances while maintaining high quality. We propose a method for
+diminishing computations by adding so-called early exit branches to the
+original architecture, and dynamically switching the computational path
+depending on how difficult it will be to render the output. We apply our method
+to two different SOTA models performing generative tasks: generation from a
+semantic map, and cross-reenactment of face expressions; and show that it can
+output images at custom lower-quality thresholds. For a threshold of LPIPS
+<= 0.1, we diminish their computations by up to a half. This is especially
+relevant for real-time applications such as face synthesis, where quality loss
+needs to be contained but most inputs need fewer computations than the complex
+instances.
+
+
+ comment: 12 pages, 22 figures +
+
+
+
+
+ + ♻ ☆ Intrinsic Harmonization for Illumination-Aware Compositing SIGGRAPH + + +
+ Despite significant advancements in network-based image harmonization +techniques, there still exists a domain disparity between typical training +pairs and real-world composites encountered during inference. Most existing +methods are trained to reverse global edits made on segmented image regions, +which fail to accurately capture the lighting inconsistencies between the +foreground and background found in composited images. In this work, we +introduce a self-supervised illumination harmonization approach formulated in +the intrinsic image domain. First, we estimate a simple global lighting model +from mid-level vision representations to generate a rough shading for the +foreground region. A network then refines this inferred shading to generate a +harmonious re-shading that aligns with the background scene. In order to match +the color appearance of the foreground and background, we utilize ideas from +prior harmonization approaches to perform parameterized image edits in the +albedo domain. To validate the effectiveness of our approach, we present +results from challenging real-world composites and conduct a user study to +objectively measure the enhanced realism achieved compared to state-of-the-art +harmonization methods. + +
+
+ comment: 10 pages, 8 figures. Accepted to SIGGRAPH Asia 2023 (Conference + Track). Project page: https://yaksoy.github.io/intrinsicCompositing/ +
+
+
+
+
+ + ♻ ☆ Convolutional layers are equivariant to discrete shifts but not + continuous translations + + +
+ The purpose of this short and simple note is to clarify a common +misconception about convolutional neural networks (CNNs). CNNs are made up of +convolutional layers which are shift equivariant due to weight sharing. +However, convolutional layers are not translation equivariant, even when +boundary effects are ignored and when pooling and subsampling are absent. This +is because shift equivariance is a discrete symmetry while translation +equivariance is a continuous symmetry. This fact is well known among +researchers in equivariant machine learning, but is usually overlooked among +non-experts. To minimize confusion, we suggest using the term `shift +equivariance' to refer to discrete shifts in pixels and `translation +equivariance' to refer to continuous translations. + +
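+ The claim about discrete shifts can be verified numerically; a small sketch,
+assuming PyTorch and circular padding so that boundary effects are removed:
+
+    import torch
+    import torch.nn as nn
+
+    torch.manual_seed(0)
+    conv = nn.Conv2d(3, 8, kernel_size=3, padding=1, padding_mode="circular")
+    x = torch.randn(1, 3, 32, 32)
+    shift = lambda t: torch.roll(t, shifts=(5, -3), dims=(-2, -1))
+
+    # Shifting then convolving equals convolving then shifting (shift equivariance).
+    print(torch.allclose(conv(shift(x)), shift(conv(x)), atol=1e-5))  # True
+
+    # No analogous exact identity exists for a continuous (sub-pixel) translation,
+    # e.g. shifting the underlying image by half a pixel.
+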
+
+
+
+
+ + ♻ ☆ SVQ: Sparse Vector Quantization for Spatiotemporal Forecasting + + +
+ Spatiotemporal forecasting tasks, such as weather forecasting and traffic
+prediction, offer significant societal benefits. These tasks can be effectively
+approached as image forecasting problems using computer vision models. Vector
+quantization (VQ) is a well-known method for discrete representation that
+improves the latent space, leading to enhanced generalization and transfer
+learning capabilities. One of the main challenges in using VQ for
+spatiotemporal forecasting is how to balance keeping enough detail against
+removing noise from the original patterns for better generalization. We address
+this challenge by developing sparse vector quantization, or SVQ for short,
+which leverages sparse regression to make a better trade-off between the two
+objectives. The main innovation of this work is to approximate sparse
+regression by a two-layer MLP and a randomly fixed or learnable matrix,
+dramatically improving its computational efficiency. Through experiments
+conducted on diverse datasets in multiple fields including weather forecasting,
+traffic flow prediction, and video forecasting, we unequivocally demonstrate
+that our proposed method consistently enhances the performance of base models
+and achieves state-of-the-art results across all benchmarks.
+
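+ A hedged sketch of the stated core idea (sparse regression approximated by a
+two-layer MLP followed by a fixed random matrix); the layer sizes, the
+ReLU-induced sparsity, and the residual connection are illustrative assumptions
+rather than the released implementation:
+
+    import torch
+    import torch.nn as nn
+
+    class SparseVQ(nn.Module):
+        """Toy sparse vector quantization over a frozen random codebook."""
+        def __init__(self, dim, hidden, num_codes, learnable_codebook=False):
+            super().__init__()
+            self.to_codes = nn.Sequential(
+                nn.Linear(dim, hidden), nn.GELU(), nn.Linear(hidden, num_codes))
+            codebook = torch.randn(num_codes, dim) / dim ** 0.5
+            self.codebook = nn.Parameter(codebook, requires_grad=learnable_codebook)
+
+        def forward(self, x):                      # x: (..., dim)
+            codes = torch.relu(self.to_codes(x))   # non-negative, mostly sparse codes
+            return x + codes @ self.codebook       # quantized features, same shape
+
+    svq = SparseVQ(dim=256, hidden=512, num_codes=1024)
+    out = svq(torch.randn(8, 64, 256))             # torch.Size([8, 64, 256])
+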
+
+
+
+
+ + ♻ ☆ High-resolution power equipment recognition based on improved + self-attention + + +
+ The current trend of automating inspections at substations has sparked a +surge in interest in the field of transformer image recognition. However, due +to restrictions in the number of parameters in existing models, high-resolution +images can't be directly applied, leaving significant room for enhancing +recognition accuracy. Addressing this challenge, the paper introduces a novel +improvement on deep self-attention networks tailored for this issue. The +proposed model comprises four key components: a foundational network, a region +proposal network, a module for extracting and segmenting target areas, and a +final prediction network. The innovative approach of this paper differentiates +itself by decoupling the processes of part localization and recognition, +initially using low-resolution images for localization followed by +high-resolution images for recognition. Moreover, the deep self-attention +network's prediction mechanism uniquely incorporates the semantic context of +images, resulting in substantially improved recognition performance. +Comparative experiments validate that this method outperforms the two other +prevalent target recognition models, offering a groundbreaking perspective for +automating electrical equipment inspections. + +
+
+
+
+
+ + ♻ ☆ Robust Learning Based Condition Diagnosis Method for Distribution + Network Switchgear + + +
+ This paper introduces a robust, learning-based method for diagnosing the +state of distribution network switchgear, which is crucial for maintaining the +power quality for end users. Traditional diagnostic models often rely heavily +on expert knowledge and lack robustness. To address this, our method +incorporates an expanded feature vector that includes environmental data, +temperature readings, switch position, motor operation, insulation conditions, +and local discharge information. We tackle the issue of high dimensionality +through feature mapping. The method introduces a decision radius to categorize +unlabeled samples and updates the model parameters using a combination of +supervised and unsupervised loss, along with a consistency regularization +function. This approach ensures robust learning even with a limited number of +labeled samples. Comparative analysis demonstrates that this method +significantly outperforms existing models in both accuracy and robustness. + +
+
+
+
+
+
+
+
+ + Information Retrieval 4 + +
+
+
+ + ☆ Multi-agricultural Machinery Collaborative Task Assignment Based on + Improved Genetic Hybrid Optimization Algorithm + + +
+ To address the challenges of delayed scheduling information, heavy reliance
+on manual labour, and low operational efficiency in traditional large-scale
+agricultural machinery operations, this study proposes a method for
+multi-agricultural machinery collaborative task assignment based on an improved
+genetic hybrid optimisation algorithm. The proposed method establishes a
+multi-agricultural machinery task allocation model by combining the path
+pre-planning of a simulated annealing algorithm and the static task allocation
+of a genetic algorithm. By sequentially fusing these two algorithms, their
+respective shortcomings can be overcome, and their advantages in global and
+local search can be utilised. Consequently, the search capability of the
+population is enhanced, leading to the discovery of better solutions. Then, an
+adaptive crossover operator is constructed according to the task assignment
+model, considering the capacity, path cost, and time of agricultural machinery;
+two-segment coding and multi-population adaptive mutation are used to assign
+tasks to improve the diversity of the population and enhance its exploration
+ability; and to improve the global optimisation ability of the hybrid
+algorithm, a 2-Opt local optimisation operator and a Circle modification
+algorithm are introduced. Finally, simulation experiments were conducted in
+MATLAB to evaluate the performance of the multi-agricultural machinery
+collaborative task assignment based on the improved genetic hybrid algorithm.
+The algorithm's capabilities were assessed through comparative analysis in the
+simulation trials. The results demonstrate that the developed hybrid algorithm
+can effectively reduce path costs, and the efficiency of the assignment
+outcomes surpasses that of the classical genetic algorithm. This approach
+proves particularly suitable for addressing large-scale task allocation
+problems.
+
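+ For reference, the 2-Opt local optimisation step mentioned above can be
+written in a few lines of Python (a generic route-improvement routine, not the
+authors' implementation):
+
+    def two_opt(route, dist):
+        """Reverse route segments while any reversal shortens the route.
+        route: list of node indices; dist(a, b): travel cost between nodes."""
+        improved = True
+        while improved:
+            improved = False
+            for i in range(1, len(route) - 2):
+                for j in range(i + 1, len(route) - 1):
+                    # Gain from swapping edges (i-1,i),(j,j+1) for (i-1,j),(i,j+1)
+                    delta = (dist(route[i - 1], route[j]) + dist(route[i], route[j + 1])
+                             - dist(route[i - 1], route[i]) - dist(route[j], route[j + 1]))
+                    if delta < -1e-9:
+                        route[i:j + 1] = reversed(route[i:j + 1])
+                        improved = True
+        return route
+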
+
+
+
+
+ + ☆ Synergistic Signals: Exploiting Co-Engagement and Semantic Links via + Graph Neural Networks + + +
+ Given a set of candidate entities (e.g. movie titles), the ability to
+identify similar entities is a core capability of many recommender systems.
+Most often this is achieved by collaborative filtering approaches, i.e. if
+users co-engage with a pair of entities frequently enough, the embeddings
+should be similar. However, relying on co-engagement data alone can result in
+lower-quality embeddings for new and unpopular entities. We study this problem
+in the context of recommender systems at Netflix. We observe that there is
+abundant semantic information such as genre, content maturity level, themes,
+etc. that complements co-engagement signals and provides interpretability in
+similarity models. To learn entity similarities from both data sources
+holistically, we propose a novel graph-based approach called SemanticGNN.
+SemanticGNN models entities, semantic concepts, collaborative edges, and
+semantic edges within a large-scale knowledge graph and conducts representation
+learning over it. Our key technical contributions are twofold: (1) we develop a
+novel relation-aware attention graph neural network (GNN) to handle the
+imbalanced distribution of relation types in our graph; (2) to handle web-scale
+graph data with millions of nodes and billions of edges, we develop a novel
+distributed graph training paradigm. The proposed model is successfully
+deployed within Netflix, and empirical experiments indicate it yields up to 35%
+improvement in performance on similarity judgment tasks.
+
+
+
+
+
+ + ♻ ☆ GEMRec: Towards Generative Model Recommendation WSDM 2024 + + +
+ Recommender Systems are built to retrieve relevant items to satisfy users' +information needs. The candidate corpus usually consists of a finite set of +items that are ready to be served, such as videos, products, or articles. With +recent advances in Generative AI such as GPT and Diffusion models, a new form +of recommendation task is yet to be explored where items are to be created by +generative models with personalized prompts. Taking image generation as an +example, with a single prompt from the user and access to a generative model, +it is possible to generate hundreds of new images in a few minutes. How shall +we attain personalization in the presence of "infinite" items? In this +preliminary study, we propose a two-stage framework, namely Prompt-Model +Retrieval and Generated Item Ranking, to approach this new task formulation. We +release GEMRec-18K, a prompt-model interaction dataset with 18K images +generated by 200 publicly-available generative models paired with a diverse set +of 90 textual prompts. Our findings demonstrate the promise of generative model +recommendation as a novel personalization problem and the limitations of +existing evaluation metrics. We highlight future directions for the RecSys +community to advance towards generative recommender systems. Our code and +dataset are available at https://github.com/MAPS-research/GEMRec. + +
+
+ comment: Accepted by WSDM 2024 (Demo Track) +
+
+
+
+
+ + ♻ ☆ Deep Learning for Hate Speech Detection: A Comparative Study + + +
+ Automated hate speech detection is an important tool in combating the spread +of hate speech, particularly in social media. Numerous methods have been +developed for the task, including a recent proliferation of deep-learning based +approaches. A variety of datasets have also been developed, exemplifying +various manifestations of the hate-speech detection problem. We present here a +large-scale empirical comparison of deep and shallow hate-speech detection +methods, mediated through the three most commonly used datasets. Our goal is to +illuminate progress in the area, and identify strengths and weaknesses in the +current state-of-the-art. We particularly focus our analysis on measures of +practical performance, including detection accuracy, computational efficiency, +capability in using pre-trained models, and domain generalization. In doing so +we aim to provide guidance as to the use of hate-speech detection in practice, +quantify the state-of-the-art, and identify future research directions. Code +and dataset are available at +https://github.com/jmjmalik22/Hate-Speech-Detection. + +
+
+ comment: 18 pages, 4 figures, and 6 tables +
+
+
+
+
+
+
+
+ + Machine Learning 144 + +
+
+
+ + ☆ Large Language Models for Mathematicians + + +
+ Large language models (LLMs) such as ChatGPT have received immense interest
+for their general-purpose language understanding and, in particular, their
+ability to generate high-quality text or computer code. For many professions,
+LLMs represent an invaluable tool that can speed up and improve the quality of
+work. In this note, we discuss to what extent they can aid professional
+mathematicians. We first provide a mathematical description of the transformer
+model used in all modern language models. Based on recent studies, we then
+outline best practices and potential issues and report on the mathematical
+abilities of language models. Finally, we shed light on the potential of LLMs
+to change how mathematicians work.
+
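+ As a pointer to the kind of mathematical description meant here, the scaled
+dot-product attention at the core of every transformer block is (a standard
+textbook expression, not a quotation from the paper)
+$\mathrm{Attention}(Q,K,V)=\mathrm{softmax}\left(QK^{\top}/\sqrt{d_k}\right)V$,
+where $Q$, $K$, $V$ are the query, key, and value matrices and $d_k$ is the
+key dimension.
+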
+
+
+
+
+ + ☆ Improved Visual Grounding through Self-Consistent Explanations + + +
+ Vision-and-language models trained to match images with text can be combined +with visual explanation methods to point to the locations of specific objects +in an image. Our work shows that the localization --"grounding"-- abilities of +these models can be further improved by finetuning for self-consistent visual +explanations. We propose a strategy for augmenting existing text-image datasets +with paraphrases using a large language model, and SelfEQ, a weakly-supervised +strategy on visual explanation maps for paraphrases that encourages +self-consistency. Specifically, for an input textual phrase, we attempt to +generate a paraphrase and finetune the model so that the phrase and paraphrase +map to the same region in the image. We posit that this both expands the +vocabulary that the model is able to handle, and improves the quality of the +object locations highlighted by gradient-based visual explanation methods (e.g. +GradCAM). We demonstrate that SelfEQ improves performance on Flickr30k, +ReferIt, and RefCOCO+ over a strong baseline method and several prior works. +Particularly, comparing to other methods that do not use any type of box +annotations, we obtain 84.07% on Flickr30k (an absolute improvement of 4.69%), +67.40% on ReferIt (an absolute improvement of 7.68%), and 75.10%, 55.49% on +RefCOCO+ test sets A and B respectively (an absolute improvement of 3.74% on +average). + +
+
+ comment: Project Page: https://catherine-r-he.github.io/SelfEQ/ +
+
+
+
+
+ + ☆ Generating Illustrated Instructions + + +
+ We introduce the new task of generating Illustrated Instructions, i.e., +visual instructions customized to a user's needs. We identify desiderata unique +to this task, and formalize it through a suite of automatic and human +evaluation metrics, designed to measure the validity, consistency, and efficacy +of the generations. We combine the power of large language models (LLMs) +together with strong text-to-image generation diffusion models to propose a +simple approach called StackedDiffusion, which generates such illustrated +instructions given text as input. The resulting model strongly outperforms +baseline approaches and state-of-the-art multimodal LLMs; and in 30% of cases, +users even prefer it to human-generated articles. Most notably, it enables +various new and exciting applications far beyond what static articles on the +web can provide, such as personalized instructions complete with intermediate +steps and pictures in response to a user's individual situation. + +
+
+ comment: Project website: + http://facebookresearch.github.io/IllustratedInstructions +
+
+
+
+
+ + ☆ Multiview Aerial Visual Recognition (MAVREC): Can Multi-view Improve + Aerial Visual Perception? + + +
+ Despite the commercial abundance of UAVs, aerial data acquisition remains +challenging, and the existing Asia and North America-centric open-source UAV +datasets are small-scale or low-resolution and lack diversity in scene +contextuality. Additionally, the color content of the scenes, solar-zenith +angle, and population density of different geographies influence the data +diversity. These two factors conjointly render suboptimal aerial-visual +perception of the deep neural network (DNN) models trained primarily on the +ground-view data, including the open-world foundational models. + To pave the way for a transformative era of aerial detection, we present +Multiview Aerial Visual RECognition or MAVREC, a video dataset where we record +synchronized scenes from different perspectives -- ground camera and +drone-mounted camera. MAVREC consists of around 2.5 hours of industry-standard +2.7K resolution video sequences, more than 0.5 million frames, and 1.1 million +annotated bounding boxes. This makes MAVREC the largest ground and aerial-view +dataset, and the fourth largest among all drone-based datasets across all +modalities and tasks. Through our extensive benchmarking on MAVREC, we +recognize that augmenting object detectors with ground-view images from the +corresponding geographical location is a superior pre-training strategy for +aerial detection. Building on this strategy, we benchmark MAVREC with a +curriculum-based semi-supervised object detection approach that leverages +labeled (ground and aerial) and unlabeled (only aerial) images to enhance the +aerial detection. We publicly release the MAVREC dataset: +https://mavrec.github.io. + +
+
+
+
+
+ + ☆ PlayFusion: Skill Acquisition via Diffusion from Language-Annotated Play + + +
+ Learning from unstructured and uncurated data has become the dominant +paradigm for generative approaches in language and vision. Such unstructured +and unguided behavior data, commonly known as play, is also easier to collect +in robotics but much more difficult to learn from due to its inherently +multimodal, noisy, and suboptimal nature. In this paper, we study this problem +of learning goal-directed skill policies from unstructured play data which is +labeled with language in hindsight. Specifically, we leverage advances in +diffusion models to learn a multi-task diffusion model to extract robotic +skills from play data. Using a conditional denoising diffusion process in the +space of states and actions, we can gracefully handle the complexity and +multimodality of play data and generate diverse and interesting robot +behaviors. To make diffusion models more useful for skill learning, we +encourage robotic agents to acquire a vocabulary of skills by introducing +discrete bottlenecks into the conditional behavior generation process. In our +experiments, we demonstrate the effectiveness of our approach across a wide +variety of environments in both simulation and the real world. Results +visualizations and videos at https://play-fusion.github.io + +
+
+ comment: In CoRL 2023. Website at https://play-fusion.github.io +
+
+
+
+
+ + ☆ Adversarial Learning for Feature Shift Detection and Correction + + +
+ Data shift is a phenomenon present in many real-world applications, and while +there are multiple methods attempting to detect shifts, the task of localizing +and correcting the features originating such shifts has not been studied in +depth. Feature shifts can occur in many datasets, including in multi-sensor +data, where some sensors are malfunctioning, or in tabular and structured data, +including biomedical, financial, and survey data, where faulty standardization +and data processing pipelines can lead to erroneous features. In this work, we +explore using the principles of adversarial learning, where the information +from several discriminators trained to distinguish between two distributions is +used to both detect the corrupted features and fix them in order to remove the +distribution shift between datasets. We show that mainstream supervised +classifiers, such as random forest or gradient boosting trees, combined with +simple iterative heuristics, can localize and correct feature shifts, +outperforming current statistical and neural network-based techniques. The code +is available at https://github.com/AI-sandbox/DataFix. + +
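+ A minimal sketch of the discriminator-based localization idea (illustrative
+only; the function below is an assumption for exposition, not the DataFix API):
+train a classifier to tell the reference and query datasets apart and read the
+most discriminative features as candidate shifted features.
+
+    import numpy as np
+    from sklearn.ensemble import RandomForestClassifier
+
+    def locate_shifted_features(X_ref, X_query, top_k=5):
+        """Rank features by how much they help a discriminator separate datasets."""
+        X = np.vstack([X_ref, X_query])
+        y = np.concatenate([np.zeros(len(X_ref)), np.ones(len(X_query))])
+        clf = RandomForestClassifier(n_estimators=200, random_state=0).fit(X, y)
+        ranked = np.argsort(clf.feature_importances_)[::-1]
+        return ranked[:top_k], clf.feature_importances_
+
+    # A simple correction heuristic: resample a suspect feature of the query set
+    # from the reference marginal and re-test whether the discriminator can still
+    # separate the two datasets.
+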
+
+
+
+
+ + ☆ SoK: Unintended Interactions among Machine Learning Defenses and Risks + + +
+ Machine learning (ML) models cannot neglect risks to security, privacy, and +fairness. Several defenses have been proposed to mitigate such risks. When a +defense is effective in mitigating one risk, it may correspond to increased or +decreased susceptibility to other risks. Existing research lacks an effective +framework to recognize and explain these unintended interactions. We present +such a framework, based on the conjecture that overfitting and memorization +underlie unintended interactions. We survey existing literature on unintended +interactions, accommodating them within our framework. We use our framework to +conjecture on two previously unexplored interactions, and empirically validate +our conjectures. + +
+
+
+
+
+ + ☆ Sim-to-Real Causal Transfer: A Metric Learning Approach to + Causally-Aware Interaction Representations + + +
+ Modeling spatial-temporal interactions among neighboring agents is at the +heart of multi-agent problems such as motion forecasting and crowd navigation. +Despite notable progress, it remains unclear to which extent modern +representations can capture the causal relationships behind agent interactions. +In this work, we take an in-depth look at the causal awareness of these +representations, from computational formalism to real-world practice. First, we +cast doubt on the notion of non-causal robustness studied in the recent +CausalAgents benchmark. We show that recent representations are already +partially resilient to perturbations of non-causal agents, and yet modeling +indirect causal effects involving mediator agents remains challenging. To +address this challenge, we introduce a metric learning approach that +regularizes latent representations with causal annotations. Our controlled +experiments show that this approach not only leads to higher degrees of causal +awareness but also yields stronger out-of-distribution robustness. To further +operationalize it in practice, we propose a sim-to-real causal transfer method +via cross-domain multi-task learning. Experiments on pedestrian datasets show +that our method can substantially boost generalization, even in the absence of +real-world causal annotations. We hope our work provides a new perspective on +the challenges and potential pathways towards causally-aware representations of +multi-agent interactions. Our code is available at +https://github.com/socialcausality. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Trajeglish: Learning the Language of Driving Scenarios + + +
+ A longstanding challenge for self-driving development is simulating dynamic +driving scenarios seeded from recorded driving logs. In pursuit of this +functionality, we apply tools from discrete sequence modeling to model how +vehicles, pedestrians and cyclists interact in driving scenarios. Using a +simple data-driven tokenization scheme, we discretize trajectories to +centimeter-level resolution using a small vocabulary. We then model the +multi-agent sequence of motion tokens with a GPT-like encoder-decoder that is +autoregressive in time and takes into account intra-timestep interaction +between agents. Scenarios sampled from our model exhibit state-of-the-art +realism; our model tops the Waymo Sim Agents Benchmark, surpassing prior work +along the realism meta metric by 3.3% and along the interaction metric by 9.9%. +We ablate our modeling choices in full autonomy and partial autonomy settings, +and show that the representations learned by our model can quickly be adapted +to improve performance on nuScenes. We additionally evaluate the scalability of +our model with respect to parameter count and dataset size, and use density +estimates from our model to quantify the saliency of context length and +intra-timestep interaction for the traffic modeling task. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Dream2Real: Zero-Shot 3D Object Rearrangement with Vision-Language + Models + + +
+ We introduce Dream2Real, a robotics framework which integrates +vision-language models (VLMs) trained on 2D data into a 3D object rearrangement +pipeline. This is achieved by the robot autonomously constructing a 3D +representation of the scene, where objects can be rearranged virtually and an +image of the resulting arrangement rendered. These renders are evaluated by a +VLM, so that the arrangement which best satisfies the user instruction is +selected and recreated in the real world with pick-and-place. This enables +language-conditioned rearrangement to be performed zero-shot, without needing +to collect a training dataset of example arrangements. Results on a series of +real-world tasks show that this framework is robust to distractors, +controllable by language, capable of understanding complex multi-object +relations, and readily applicable to both tabletop and 6-DoF rearrangement +tasks. + +
+
+ comment: Project webpage with videos: https://www.robot-learning.uk/dream2real +
+
+
+
+
+ + ☆ Using Large Language Models for Hyperparameter Optimization + + +
+ This paper studies using foundational large language models (LLMs) to make +decisions during hyperparameter optimization (HPO). Empirical evaluations +demonstrate that in settings with constrained search budgets, LLMs can perform +comparably or better than traditional HPO methods like random search and +Bayesian optimization on standard benchmarks. Furthermore, we propose to treat +the code specifying our model as a hyperparameter, which the LLM outputs, going +beyond the capabilities of existing HPO approaches. Our findings suggest that +LLMs are a promising tool for improving efficiency in the traditional +decision-making problem of hyperparameter optimization. + +
+
+ comment: 29 pages +
+
+
+
+
+ + ☆ A Block Metropolis-Hastings Sampler for Controllable Energy-based Text + Generation + + +
+ Recent work has shown that energy-based language modeling is an effective +framework for controllable text generation because it enables flexible +integration of arbitrary discriminators. However, because energy-based LMs are +globally normalized, approximate techniques like Metropolis-Hastings (MH) are +required for inference. Past work has largely explored simple proposal +distributions that modify a single token at a time, like in Gibbs sampling. In +this paper, we develop a novel MH sampler that, in contrast, proposes re-writes +of the entire sequence in each step via iterative prompting of a large language +model. Our new sampler (a) allows for more efficient and accurate sampling from +a target distribution and (b) allows generation length to be determined through +the sampling procedure rather than fixed in advance, as past work has required. +We perform experiments on two controlled generation tasks, showing both +downstream performance gains and more accurate target distribution sampling in +comparison with single-token proposal techniques. + +
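+ A hedged sketch of one whole-sequence Metropolis-Hastings step of this kind;
+`energy`, `propose_rewrite`, and `proposal_logprob` are hypothetical stand-ins
+for the energy-based LM, the prompted rewrite model, and its (possibly
+approximated) proposal density:
+
+    import math, random
+
+    def mh_block_step(x, energy, propose_rewrite, proposal_logprob):
+        """One MH step that proposes a rewrite of the entire sequence x."""
+        x_new = propose_rewrite(x)                      # LLM-prompted full rewrite
+        # Target distribution is p(x) proportional to exp(-energy(x)).
+        log_alpha = (energy(x) - energy(x_new)
+                     + proposal_logprob(x, given=x_new)   # q(x | x')
+                     - proposal_logprob(x_new, given=x))  # q(x' | x)
+        if math.log(random.random()) < min(0.0, log_alpha):
+            return x_new                                # accept the rewrite
+        return x                                        # keep the current sequence
+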
+
+
+
+
+ + ☆ Coordination-free Decentralised Federated Learning on Complex Networks: + Overcoming Heterogeneity + + +
+ Federated Learning (FL) is a well-known framework for successfully performing +a learning task in an edge computing scenario where the devices involved have +limited resources and incomplete data representation. The basic assumption of +FL is that the devices communicate directly or indirectly with a parameter +server that centrally coordinates the whole process, overcoming several +challenges associated with it. However, in highly pervasive edge scenarios, the +presence of a central controller that oversees the process cannot always be +guaranteed, and the interactions (i.e., the connectivity graph) between devices +might not be predetermined, resulting in a complex network structure. Moreover, +the heterogeneity of data and devices further complicates the learning process. +This poses new challenges from a learning standpoint that we address by +proposing a communication-efficient Decentralised Federated Learning (DFL) +algorithm able to cope with them. Our solution allows devices communicating +only with their direct neighbours to train an accurate model, overcoming the +heterogeneity induced by data and different training histories. Our results +show that the resulting local models generalise better than those trained with +competing approaches, and do so in a more communication-efficient way. + +
+
+ comment: Supported by the H2020 HumaneAI Net (#952026), H2020 + INFRAIA-01-2018-2019 SoBigData++ (#871042), and by the CHIST-ERA-19-XAI010 + SAI projects, FWF (grant No. I 5205). Also funded by PNRR MUR Partenariato + Esteso PE00000013 FAIR, PNRR MUR Partenariato Esteso PE00000001 - "RESTART" +
+
+
+
+
+ + ☆ Graph Metanetworks for Processing Diverse Neural Architectures + + +
+ Neural networks efficiently encode learned information within their +parameters. Consequently, many tasks can be unified by treating neural networks +themselves as input data. When doing so, recent studies demonstrated the +importance of accounting for the symmetries and geometry of parameter spaces. +However, those works developed architectures tailored to specific networks such +as MLPs and CNNs without normalization layers, and generalizing such +architectures to other types of networks can be challenging. In this work, we +overcome these challenges by building new metanetworks - neural networks that +take weights from other neural networks as input. Put simply, we carefully +build graphs representing the input neural networks and process the graphs +using graph neural networks. Our approach, Graph Metanetworks (GMNs), +generalizes to neural architectures where competing methods struggle, such as +multi-head attention layers, normalization layers, convolutional layers, ResNet +blocks, and group-equivariant linear layers. We prove that GMNs are expressive +and equivariant to parameter permutation symmetries that leave the input neural +network functions unchanged. We validate the effectiveness of our method on +several metanetwork tasks over diverse neural network architectures. + +
+
+ comment: 29 pages +
+
+
+
+
+ + ☆ Chain of Code: Reasoning with a Language Model-Augmented Code Emulator + + +
+ Code provides a general syntactic structure to build complex programs and
+perform precise computations when paired with a code interpreter -- we
+hypothesize that language models (LMs) can leverage code-writing to improve
+Chain of Thought reasoning not only for logic and arithmetic tasks, but also
+for linguistic ones (and in particular, those that are a mix of both). For
+example, consider prompting an LM to write code that counts the number of times
+it detects sarcasm in an essay: the LM may struggle to write an implementation
+for "detect_sarcasm(string)" that can be executed by the interpreter (handling
+the edge cases would be insurmountable). However, LMs may still produce a valid
+solution if they are used not only to write the code, but also to selectively
+"emulate" the interpreter by generating the expected output of
+"detect_sarcasm(string)" and other lines of code (e.g., lines that the
+interpreter could not execute). In this work, we propose Chain of Code (CoC), a
+simple yet surprisingly effective extension that improves LM code-driven
+reasoning. The key idea is to encourage LMs to format linguistic sub-tasks in a
+program as flexible pseudocode, so that the interpreter can explicitly catch
+undefined behaviors and hand them off to be simulated with an LM (as an
+"LMulator"). Experiments demonstrate that Chain of Code outperforms Chain of
+Thought and other baselines across a variety of benchmarks; on BIG-Bench Hard,
+Chain of Code achieves 84%, a gain of 12% over Chain of Thought. CoC scales
+well with large and small models alike, and broadens the scope of reasoning
+questions that LMs can correctly answer by "thinking in code". Project webpage:
+https://chain-of-code.github.io/.
+
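A toy sketch of the "LMulator" idea described above: execute program lines with the real interpreter where possible and fall back to a language model for lines it cannot execute. `ask_lm` is a hypothetical stub, and the assumption that every fallback line is a simple assignment is ours, not the authors'.

```python
def ask_lm(instruction: str) -> str:
    raise NotImplementedError("plug in an LLM client that returns a Python literal")

def run_chain_of_code(lines, state=None):
    state = dict(state or {})
    for line in lines:
        try:
            exec(line, {}, state)  # let the real interpreter try the line first
        except Exception:
            # undefined behaviour: ask the LM to simulate the line's effect
            target = line.split("=", 1)[0].strip()
            answer = ask_lm(f"Evaluate the right-hand side of {line!r} given {state}")
            state[target] = eval(answer, {}, {})
    return state
```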
+
+
+
+
+ + ☆ On the Learnability of Watermarks for Language Models + + +
+ Watermarking of language model outputs enables statistical detection of +model-generated text, which has many applications in the responsible deployment +of language models. Existing watermarking strategies operate by altering the +decoder of an existing language model, and the ability for a language model to +directly learn to generate the watermark would have significant implications +for the real-world deployment of watermarks. First, learned watermarks could be +used to build open models that naturally generate watermarked text, allowing +for open models to benefit from watermarking. Second, if watermarking is used +to determine the provenance of generated text, an adversary can hurt the +reputation of a victim model by spoofing its watermark and generating damaging +watermarked text. To investigate the learnability of watermarks, we propose +watermark distillation, which trains a student model to behave like a teacher +model that uses decoding-based watermarking. We test our approach on three +distinct decoding-based watermarking strategies and various hyperparameter +settings, finding that models can learn to generate watermarked text with high +detectability. We also find limitations to learnability, including the loss of +watermarking capabilities under fine-tuning on normal text and high sample +complexity when learning low-distortion watermarks. + +
+
+
+
+
+ + ☆ Horizon-Free and Instance-Dependent Regret Bounds for Reinforcement + Learning with General Function Approximation + + +
+ To tackle long planning horizon problems in reinforcement learning with
+general function approximation, we propose the first algorithm, termed
+UCRL-WVTR, that achieves a regret bound that is both \emph{horizon-free} and
+\emph{instance-dependent}, eliminating the polynomial dependency on the
+planning horizon. The derived regret bound is deemed \emph{sharp}, as it
+matches the minimax lower bound when specialized to linear mixture MDPs up to
+logarithmic factors. Furthermore, UCRL-WVTR is \emph{computationally efficient}
+with access to a regression oracle. The achievement of such a horizon-free,
+instance-dependent, and sharp regret bound hinges upon (i) novel algorithm
+designs: weighted value-targeted regression and a high-order moment estimator
+in the context of general function approximation; and (ii) fine-grained
+analyses: a novel concentration bound of weighted non-linear least squares and
+a refined analysis which leads to the tight instance-dependent bound. We also
+conduct comprehensive experiments to corroborate our theoretical findings.
+
+
+
+
+
+ + ☆ PhotoMaker: Customizing Realistic Human Photos via Stacked ID Embedding + + +
+ Recent advances in text-to-image generation have made remarkable progress in
+synthesizing realistic human photos conditioned on given text prompts. However,
+existing personalized generation methods cannot simultaneously satisfy the
+requirements of high efficiency, promising identity (ID) fidelity, and flexible
+text controllability. In this work, we introduce PhotoMaker, an efficient
+personalized text-to-image generation method, which mainly encodes an arbitrary
+number of input ID images into a stacked ID embedding for preserving ID
+information. Such an embedding, serving as a unified ID representation, can not
+only encapsulate the characteristics of the same input ID comprehensively, but
+also accommodate the characteristics of different IDs for subsequent
+integration. This paves the way for more intriguing and practically valuable
+applications. Besides, to drive the training of our PhotoMaker, we propose an
+ID-oriented data construction pipeline to assemble the training data. Under the
+nourishment of the dataset constructed through the proposed pipeline, our
+PhotoMaker demonstrates better ID preservation ability than test-time
+fine-tuning based methods, yet provides significant speed improvements,
+high-quality generation results, strong generalization capabilities, and a wide
+range of applications. Our project page is available at
+https://photo-maker.github.io/
+
+
+ comment: Tech report; Project page: https://photo-maker.github.io/ +
+
+
+
+
+ + ☆ Fortify the Shortest Stave in Attention: Enhancing Context Awareness of + Large Language Models for Effective Tool Use + + +
+ Recent advancements in large language models (LLMs) have significantly +expanded their functionality and skills as tool agents. In this paper, we argue +that a waveform pattern in the model's attention allocation has an impact on +the tool use performance, which degrades when the position of essential +information hits the trough zone. To address this issue, we propose a novel +inference method named Attention Buckets. This approach enables LLMs to handle +context by conducting parallel processes, each featuring a unique RoPE angle +base that shapes the attention waveform. Attention Buckets ensures that an +attention trough of a particular process can be compensated with an attention +peak of another run, reducing the risk of the LLM missing essential information +residing within the attention trough. Our extensive experiments on the widely +recognized tool use benchmark demonstrate the efficacy of our approach, where a +7B-parameter open-source model enhanced by Attention Buckets achieves SOTA +performance on par with GPT-4. + +
+
+
+
+
+ + ☆ Privacy-preserving quantum federated learning via gradient hiding + + +
+ Distributed quantum computing, particularly distributed quantum machine +learning, has gained substantial prominence for its capacity to harness the +collective power of distributed quantum resources, transcending the limitations +of individual quantum nodes. Meanwhile, the critical concern of privacy within +distributed computing protocols remains a significant challenge, particularly +in standard classical federated learning (FL) scenarios where data of +participating clients is susceptible to leakage via gradient inversion attacks +by the server. This paper presents innovative quantum protocols with quantum +communication designed to address the FL problem, strengthen privacy measures, +and optimize communication efficiency. In contrast to previous works that +leverage expressive variational quantum circuits or differential privacy +techniques, we consider gradient information concealment using quantum states +and propose two distinct FL protocols, one based on private inner-product +estimation and the other on incremental learning. These protocols offer +substantial advancements in privacy preservation with low communication +resources, forging a path toward efficient quantum communication-assisted FL +protocols and contributing to the development of secure distributed quantum +machine learning, thus addressing critical privacy concerns in the quantum +computing era. + +
+
+ comment: 12 pages, 2 figures, 1 table +
+
+
+
+
+ + ☆ FreqFed: A Frequency Analysis-Based Approach for Mitigating Poisoning + Attacks in Federated Learning NDSS + + +
+ Federated learning (FL) is a collaborative learning paradigm allowing
+multiple clients to jointly train a model without sharing their training data.
+However, FL is susceptible to poisoning attacks, in which the adversary injects
+manipulated model updates into the federated model aggregation process to
+corrupt or destroy predictions (untargeted poisoning) or implant hidden
+functionalities (targeted poisoning or backdoors). Existing defenses against
+poisoning attacks in FL have several limitations, such as relying on specific
+assumptions about attack types and strategies or data distributions, or not
+being sufficiently robust against advanced injection techniques and strategies
+while simultaneously maintaining the utility of the aggregated model. To
+address the deficiencies of existing defenses, we take a generic and completely
+different approach to detect poisoning (targeted and untargeted) attacks. We
+present FreqFed, a novel aggregation mechanism that transforms the model
+updates (i.e., weights) into the frequency domain, where we can identify the
+core frequency components that inherit sufficient information about weights.
+This allows us to effectively filter out malicious updates during local
+training on the clients, regardless of attack types, strategies, and clients'
+data distributions. We extensively evaluate the efficiency and effectiveness of
+FreqFed in different application domains, including image classification, word
+prediction, IoT intrusion detection, and speech recognition. We demonstrate
+that FreqFed can mitigate poisoning attacks effectively with a negligible
+impact on the utility of the aggregated model.
+
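A rough illustration of frequency-domain filtering of client updates, assuming each update is a flattened weight vector. The DCT, the size of the retained low-frequency band, and the median-distance acceptance rule are illustrative stand-ins, not FreqFed's exact design.

```python
import numpy as np
from scipy.fft import dct

def frequency_filtered_aggregate(updates, keep=64):
    # keep the low-frequency DCT components that carry most of the weight signal
    spectra = np.stack([dct(u, norm="ortho")[:keep] for u in updates])
    centre = np.median(spectra, axis=0)
    distances = np.linalg.norm(spectra - centre, axis=1)
    # accept the half of the clients closest to the median spectrum (toy rule)
    accepted = np.argsort(distances)[: max(1, len(updates) // 2)]
    return np.mean([updates[i] for i in accepted], axis=0)

aggregated = frequency_filtered_aggregate([np.random.randn(1000) for _ in range(10)])
```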
+
+ comment: To appear in the Network and Distributed System Security (NDSS) + Symposium 2024. 16 pages, 8 figures, 12 tables, 1 algorithm, 3 equations +
+
+
+
+
+ + ☆ Monitoring Sustainable Global Development Along Shared Socioeconomic + Pathways NeurIPS 2023 + + +
+ Sustainable global development is one of the most prevalent challenges facing +the world today, hinging on the equilibrium between socioeconomic growth and +environmental sustainability. We propose approaches to monitor and quantify +sustainable development along the Shared Socioeconomic Pathways (SSPs), +including mathematically derived scoring algorithms, and machine learning +methods. These integrate socioeconomic and environmental datasets, to produce +an interpretable metric for SSP alignment. An initial study demonstrates +promising results, laying the groundwork for the application of different +methods to the monitoring of sustainable global development. + +
+
+ comment: 5 pages, 1 figure. Presented at NeurIPS 2023 Workshop: Tackling + Climate Change with Machine Learning +
+
+
+
+
+ + ☆ On the Impact of Multi-dimensional Local Differential Privacy on + Fairness + + +
+ Automated decision systems are increasingly used to make consequential +decisions in people's lives. Due to the sensitivity of the manipulated data as +well as the resulting decisions, several ethical concerns need to be addressed +for the appropriate use of such technologies, in particular, fairness and +privacy. Unlike previous work, which focused on centralized differential +privacy (DP) or local DP (LDP) for a single sensitive attribute, in this paper, +we examine the impact of LDP in the presence of several sensitive attributes +(i.e., multi-dimensional data) on fairness. Detailed empirical analysis on +synthetic and benchmark datasets revealed very relevant observations. In +particular, (1) multi-dimensional LDP is an efficient approach to reduce +disparity, (2) the multi-dimensional approach of LDP (independent vs. combined) +matters only at low privacy guarantees, and (3) the outcome Y distribution has +an important effect on which group is more sensitive to the obfuscation. Last, +we summarize our findings in the form of recommendations to guide practitioners +in adopting effective privacy-preserving practices while maintaining fairness +and utility in ML applications. + +
+
+
+
+
+ + ☆ Semi-Supervised Active Learning for Semantic Segmentation in Unknown + Environments Using Informative Path Planning + + +
+ Semantic segmentation enables robots to perceive and reason about their +environments beyond geometry. Most of such systems build upon deep learning +approaches. As autonomous robots are commonly deployed in initially unknown +environments, pre-training on static datasets cannot always capture the variety +of domains and limits the robot's perception performance during missions. +Recently, self-supervised and fully supervised active learning methods emerged +to improve a robot's vision. These approaches rely on large in-domain +pre-training datasets or require substantial human labelling effort. We propose +a planning method for semi-supervised active learning of semantic segmentation +that substantially reduces human labelling requirements compared to fully +supervised approaches. We leverage an adaptive map-based planner guided towards +the frontiers of unexplored space with high model uncertainty collecting +training data for human labelling. A key aspect of our approach is to combine +the sparse high-quality human labels with pseudo labels automatically extracted +from highly certain environment map areas. Experimental results show that our +method reaches segmentation performance close to fully supervised approaches +with drastically reduced human labelling effort while outperforming +self-supervised approaches. + +
+
+ comment: 8 pages, 9 figures +
+
+
+
+
+ + ☆ Intelligent Anomaly Detection for Lane Rendering Using Transformer with + Self-Supervised Pre-Training and Customized Fine-Tuning + + +
+ The burgeoning navigation services using digital maps provide great
+convenience to drivers. Nevertheless, the presence of anomalies in lane
+rendering map images occasionally introduces potential hazards, as such
+anomalies can be misleading to human drivers and consequently contribute to
+unsafe driving conditions. In response to this concern and to accurately and
+effectively detect the anomalies, this paper transforms lane rendering image
+anomaly detection into a classification problem and tackles it with a
+four-phase pipeline, built on state-of-the-art deep learning techniques
+(especially Transformer models), consisting of data pre-processing,
+self-supervised pre-training with the masked image modeling (MiM) method,
+customized fine-tuning using a cross-entropy based loss with label smoothing,
+and post-processing. Various experiments verify the effectiveness of the
+proposed pipeline. Results indicate that the proposed pipeline exhibits
+superior performance in lane rendering image anomaly detection, and notably,
+the self-supervised pre-training with MiM can greatly enhance the detection
+accuracy while significantly reducing the total training time. For instance,
+employing the Swin Transformer with Uniform Masking as self-supervised
+pre-training (Swin-Trans-UM) yielded an accuracy of 94.77% and an improved
+Area Under The Curve (AUC) score of 0.9743, compared with the pure Swin
+Transformer without pre-training (Swin-Trans) with an accuracy of 94.01% and an
+AUC of 0.9498. The fine-tuning epochs were dramatically reduced to 41 from the
+original 280. In conclusion, the proposed pipeline, with its incorporation of
+self-supervised pre-training using MiM and other advanced deep learning
+techniques, emerges as a robust solution for enhancing the accuracy and
+efficiency of lane rendering image anomaly detection in digital navigation
+systems.
+
+
+ comment: 20 pages, 6 figures, accepted by the 103rd Transportation Research + Board (TRB) Annual Meeting, under review by Transportation Research Record: + Journal of the Transportation Research Board +
+
+
+
+
+ + ☆ Model-Based Epistemic Variance of Values for Risk-Aware Policy + Optimization + + +
+ We consider the problem of quantifying uncertainty over expected cumulative +rewards in model-based reinforcement learning. In particular, we focus on +characterizing the variance over values induced by a distribution over MDPs. +Previous work upper bounds the posterior variance over values by solving a +so-called uncertainty Bellman equation (UBE), but the over-approximation may +result in inefficient exploration. We propose a new UBE whose solution +converges to the true posterior variance over values and leads to lower regret +in tabular exploration problems. We identify challenges to apply the UBE theory +beyond tabular problems and propose a suitable approximation. Based on this +approximation, we introduce a general-purpose policy optimization algorithm, +Q-Uncertainty Soft Actor-Critic (QU-SAC), that can be applied for either +risk-seeking or risk-averse policy optimization with minimal changes. +Experiments in both online and offline RL demonstrate improved performance +compared to other uncertainty estimation methods. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2302.12526 +
+
+
+
+
+ + ☆ Deep Dynamics: Vehicle Dynamics Modeling with a Physics-Informed Neural + Network for Autonomous Racing + + +
+ Autonomous racing is a critical research area for autonomous driving, +presenting significant challenges in vehicle dynamics modeling, such as +balancing model precision and computational efficiency at high speeds +(>280kmph), where minor errors in modeling have severe consequences. Existing +physics-based models for vehicle dynamics require elaborate testing setups and +tuning, which are hard to implement, time-intensive, and cost-prohibitive. +Conversely, purely data-driven approaches do not generalize well and cannot +adequately ensure physical constraints on predictions. This paper introduces +Deep Dynamics, a physics-informed neural network (PINN) for vehicle dynamics +modeling of an autonomous racecar. It combines physics coefficient estimation +and dynamical equations to accurately predict vehicle states at high speeds and +includes a unique Physics Guard layer to ensure internal coefficient estimates +remain within their nominal physical ranges. Open-loop and closed-loop +performance assessments, using a physics-based simulator and full-scale +autonomous Indy racecar data, highlight Deep Dynamics as a promising approach +for modeling racecar vehicle dynamics. + +
+
+ comment: This work has been submitted to the IEEE RA-L for possible + publication +
+
+
+
+
+ + ☆ A Scalable Network-Aware Multi-Agent Reinforcement Learning Framework + for Decentralized Inverter-based Voltage Control + + +
+ This paper addresses the challenges associated with decentralized voltage +control in power grids due to an increase in distributed generations (DGs). +Traditional model-based voltage control methods struggle with the rapid energy +fluctuations and uncertainties of these DGs. While multi-agent reinforcement +learning (MARL) has shown potential for decentralized secondary control, +scalability issues arise when dealing with a large number of DGs. This problem +lies in the dominant centralized training and decentralized execution (CTDE) +framework, where the critics take global observations and actions. To overcome +these challenges, we propose a scalable network-aware (SNA) framework that +leverages network structure to truncate the input to the critic's Q-function, +thereby improving scalability and reducing communication costs during training. +Further, the SNA framework is theoretically grounded with provable +approximation guarantee, and it can seamlessly integrate with multiple +multi-agent actor-critic algorithms. The proposed SNA framework is successfully +demonstrated in a system with 114 DGs, providing a promising solution for +decentralized voltage control in increasingly complex power grid systems. + +
+
+
+
+
+ + ☆ Investigating the Design Space of Diffusion Models for Speech + Enhancement + + +
+ Diffusion models are a new class of generative models that have shown
+outstanding performance in the image generation literature. As a consequence,
+studies have attempted to apply diffusion models to other tasks, such as speech
+enhancement. A popular approach in adapting diffusion models to speech
+enhancement consists of modelling a progressive transformation between the
+clean and noisy speech signals. However, one popular diffusion model framework
+previously laid out in the image generation literature did not account for such
+a transformation towards the system input, which prevents relating the existing
+diffusion-based speech enhancement systems to the aforementioned diffusion
+model framework. To address this, we extend this framework to account for the
+progressive transformation between the clean and noisy speech signals. This
+allows us to apply recent developments from the image generation literature,
+and to systematically investigate design aspects of diffusion models that
+remain largely unexplored for speech enhancement, such as the neural network
+preconditioning, the training loss weighting, the stochastic differential
+equation (SDE), or the amount of stochasticity injected in the reverse process.
+We show that the performance of previous diffusion-based speech enhancement
+systems cannot be attributed to the progressive transformation between the
+clean and noisy speech signals. Moreover, we show that a proper choice of
+preconditioning, training loss weighting, SDE and sampler makes it possible to
+outperform a popular diffusion-based speech enhancement system in terms of
+perceptual metrics while using fewer sampling steps, thus reducing the
+computational cost by a factor of four.
+
+
+
+
+
+ + ☆ NeuJeans: Private Neural Network Inference with Joint Optimization of + Convolution and Bootstrapping + + +
+ Fully homomorphic encryption (FHE) is a promising cryptographic primitive for +realizing private neural network inference (PI) services by allowing a client +to fully offload the inference task to a cloud server while keeping the client +data oblivious to the server. This work proposes NeuJeans, an FHE-based +solution for the PI of deep convolutional neural networks (CNNs). NeuJeans +tackles the critical problem of the enormous computational cost for the FHE +evaluation of convolutional layers (conv2d), mainly due to the high cost of +data reordering and bootstrapping. We first propose an encoding method +introducing nested structures inside encoded vectors for FHE, which enables us +to develop efficient conv2d algorithms with reduced data reordering costs. +However, the new encoding method also introduces additional computations for +conversion between encoding methods, which could negate its advantages. We +discover that fusing conv2d with bootstrapping eliminates such computations +while reducing the cost of bootstrapping. Then, we devise optimized execution +flows for various types of conv2d and apply them to end-to-end implementation +of CNNs. NeuJeans accelerates the performance of conv2d by up to 5.68 times +compared to state-of-the-art FHE-based PI work and performs the PI of a CNN at +the scale of ImageNet (ResNet18) within a mere few seconds + +
+
+ comment: 16 pages, 9 figures +
+
+
+
+
+ + ☆ CLadder: A Benchmark to Assess Causal Reasoning Capabilities of Language + Models NeurIPS 2023 + + +
+ The ability to perform causal reasoning is widely considered a core feature +of intelligence. In this work, we investigate whether large language models +(LLMs) can coherently reason about causality. Much of the existing work in +natural language processing (NLP) focuses on evaluating commonsense causal +reasoning in LLMs, thus failing to assess whether a model can perform causal +inference in accordance with a set of well-defined formal rules. To address +this, we propose a new NLP task, causal inference in natural language, inspired +by the "causal inference engine" postulated by Judea Pearl et al. We compose a +large dataset, CLadder, with 10K samples: based on a collection of causal +graphs and queries (associational, interventional, and counterfactual), we +obtain symbolic questions and ground-truth answers, through an oracle causal +inference engine. These are then translated into natural language. We evaluate +multiple LLMs on our dataset, and we introduce and evaluate a bespoke +chain-of-thought prompting strategy, CausalCoT. We show that our task is highly +challenging for LLMs, and we conduct an in-depth analysis to gain deeper +insight into the causal reasoning abilities of LLMs. Our data is open-sourced +at https://huggingface.co/datasets/causalNLP/cladder, and our code can be found +at https://github.com/causalNLP/cladder. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ☆ Improved Efficient Two-Stage Denoising Diffusion Power System + Measurement Recovery Against False Data Injection Attacks and Data Losses + + +
+ Measurement uncertainties, represented by cyber-attacks and data losses,
+seriously degrade the quality of power system measurements. Fortunately, the
+powerful generation ability of denoising diffusion models can enable more
+precise measurement generation for power system data recovery. However, the
+controllable data generation and efficient computing methods of denoising
+diffusion models for deterministic trajectory still need further investigation.
+To this end, this paper proposes an improved two-stage denoising diffusion
+model (TSDM) to identify and reconstruct the measurements with various
+measurement uncertainties. The first stage of the model comprises a
+classifier-guided conditional anomaly detection component, while the second
+stage involves a diffusion-based measurement imputation component. Moreover,
+the proposed TSDM adopts precise means and optimal variances to accelerate the
+diffusion generation process with subsequence sampling. Extensive numerical
+case studies demonstrate that the proposed TSDM can accurately recover power
+system measurements despite strong randomness under renewable energy
+integration and highly nonlinear dynamics under complex cyber-physical
+contingencies. Additionally, the proposed TSDM has stronger robustness compared
+to existing reconstruction networks and exhibits lower computational complexity
+than general denoising diffusion models.
+
+
+
+
+
+ + ☆ Enhancing Medical Task Performance in GPT-4V: A Comprehensive Study on + Prompt Engineering Strategies + + +
+ OpenAI's latest large vision-language model (LVLM), GPT-4V(ision), has piqued +considerable interest for its potential in medical applications. Despite its +promise, recent studies and internal reviews highlight its underperformance in +specialized medical tasks. This paper explores the boundary of GPT-4V's +capabilities in medicine, particularly in processing complex imaging data from +endoscopies, CT scans, and MRIs etc. Leveraging open-source datasets, we +assessed its foundational competencies, identifying substantial areas for +enhancement. Our research emphasizes prompt engineering, an often-underutilized +strategy for improving AI responsiveness. Through iterative testing, we refined +the model's prompts, significantly improving its interpretative accuracy and +relevance in medical imaging. From our comprehensive evaluations, we distilled +10 effective prompt engineering techniques, each fortifying GPT-4V's medical +acumen. These methodical enhancements facilitate more reliable, precise, and +clinically valuable insights from GPT-4V, advancing its operability in critical +healthcare environments. Our findings are pivotal for those employing AI in +medicine, providing clear, actionable guidance on harnessing GPT-4V's full +diagnostic potential. + +
+
+
+
+
+ + ☆ Causality and Explainability for Trustworthy Integrated Pest Management NeurIPS 2023 + + +
+ Pesticides serve as a common tool in agricultural pest control but +significantly contribute to the climate crisis. To combat this, Integrated Pest +Management (IPM) stands as a climate-smart alternative. Despite its potential, +IPM faces low adoption rates due to farmers' skepticism about its +effectiveness. To address this challenge, we introduce an advanced data +analysis framework tailored to enhance IPM adoption. Our framework provides i) +robust pest population predictions across diverse environments with invariant +and causal learning, ii) interpretable pest presence predictions using +transparent models, iii) actionable advice through counterfactual explanations +for in-season IPM interventions, iv) field-specific treatment effect +estimations, and v) assessments of the effectiveness of our advice using causal +inference. By incorporating these features, our framework aims to alleviate +skepticism and encourage wider adoption of IPM practices among farmers. + +
+
+ comment: Accepted at NeurIPS 2023 Workshop on Tackling Climate Change with + Machine Learning: Blending New and Existing Knowledge Systems +
+
+
+
+
+ + ☆ Merging by Matching Models in Task Subspaces + + +
+ Model merging aims to cheaply combine individual task-specific models into a +single multitask model. In this work, we view past merging methods as +leveraging different notions of a ''task subspace'' in which models are matched +before being merged. We connect the task subspace of a given model to its loss +landscape and formalize how this approach to model merging can be seen as +solving a linear system of equations. While past work has generally been +limited to linear systems that have a closed-form solution, we consider using +the conjugate gradient method to find a solution. We show that using the +conjugate gradient method can outperform closed-form solutions, enables merging +via linear systems that are otherwise intractable to solve, and flexibly allows +choosing from a wide variety of initializations and estimates for the ''task +subspace''. We ultimately demonstrate that our merging framework called +''Matching Models in their Task Subspace'' (MaTS) achieves state-of-the-art +results in multitask and intermediate-task model merging. We release all of the +code and checkpoints used in our work at https://github.com/r-three/mats. + +
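A minimal sketch of merging models by solving a linear system with conjugate gradient, assuming hypothetical per-parameter diagonal weights W_i (e.g. Fisher-style importances). MaTS defines the system through its notion of a task subspace; the diagonal weighting here is only an illustration of the matrix-free CG solve.

```python
import numpy as np
from scipy.sparse.linalg import LinearOperator, cg

def merge_with_cg(params, weights):
    # Solve (sum_i W_i) x = sum_i W_i p_i without forming any dense matrix.
    dim = params[0].size
    rhs = sum(w * p for w, p in zip(weights, params))
    operator = LinearOperator((dim, dim), matvec=lambda x: sum(w * x for w in weights))
    merged, info = cg(operator, rhs)
    assert info == 0, "conjugate gradient did not converge"
    return merged

p1, p2 = np.random.randn(100), np.random.randn(100)
w1, w2 = np.random.rand(100) + 0.1, np.random.rand(100) + 0.1
merged_params = merge_with_cg([p1, p2], [w1, w2])
```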
+
+
+
+
+ + ☆ Surrogate Modelling for Sea Ice Concentration using Lightweight Neural + Ensemble + + +
+ The modeling and forecasting of sea ice conditions in the Arctic region are
+important tasks for ship routing, offshore oil production, and environmental
+monitoring. We propose the adaptive surrogate modeling approach named LANE-SI
+(Lightweight Automated Neural Ensembling for Sea Ice) that uses an ensemble of
+relatively simple deep learning models with different loss functions to
+forecast the spatial distribution of sea ice concentration in the specified
+water area. Experimental studies confirm that the quality of a long-term
+forecast based on a deep learning model fitted to the specific water area is
+comparable to resource-intensive physical modeling, and for some periods of the
+year, it is superior. We achieved a 20% improvement against the
+state-of-the-art physics-based forecast system SEAS5 for the Kara Sea.
+
+
+ comment: 7 pages, 6 figures +
+
+
+
+
+ + ☆ Learning to sample in Cartesian MRI + + +
+ Despite its exceptional soft tissue contrast, Magnetic Resonance Imaging
+(MRI) faces the challenge of long scanning times compared to other modalities
+like X-ray radiography. Shortening scanning times is crucial in clinical
+settings, as it increases patient comfort, decreases examination costs and
+improves throughput. Recent advances in compressed sensing (CS) and deep
+learning allow accelerated MRI acquisition by reconstructing high-quality
+images from undersampled data. While reconstruction algorithms have received
+most of the focus, designing acquisition trajectories to optimize
+reconstruction quality remains an open question. This thesis explores two
+approaches to address this gap in the context of Cartesian MRI. First, we
+propose two algorithms, lazy LBCS and stochastic LBCS, that significantly
+improve upon G\"ozc\"u et al.'s greedy learning-based CS (LBCS) approach. These
+algorithms scale to large, clinically relevant scenarios like multi-coil 3D MR
+and dynamic MRI, previously inaccessible to LBCS. Additionally, we demonstrate
+that generative adversarial networks (GANs) can serve as a natural criterion
+for adaptive sampling by leveraging variance in the measurement domain to guide
+acquisition. Second, we delve into the underlying structures or assumptions
+that enable mask design algorithms to perform well in practice. Our experiments
+reveal that state-of-the-art deep reinforcement learning (RL) approaches, while
+capable of adaptation and long-horizon planning, offer only marginal
+improvements over stochastic LBCS, which is neither adaptive nor performs
+long-horizon planning. Altogether, our findings suggest that stochastic LBCS
+and similar methods represent promising alternatives to deep RL. They stand out
+in particular for their scalability and computational efficiency and could be
+key in the deployment of optimized acquisition trajectories in Cartesian MRI.
+
+
+ comment: PhD Thesis; 198 pages +
+
+
+
+
+ + ☆ Equivariant Scalar Fields for Molecular Docking with Fast Fourier + Transforms + + +
+ Molecular docking is critical to structure-based virtual screening, yet the +throughput of such workflows is limited by the expensive optimization of +scoring functions involved in most docking algorithms. We explore how machine +learning can accelerate this process by learning a scoring function with a +functional form that allows for more rapid optimization. Specifically, we +define the scoring function to be the cross-correlation of multi-channel ligand +and protein scalar fields parameterized by equivariant graph neural networks, +enabling rapid optimization over rigid-body degrees of freedom with fast +Fourier transforms. The runtime of our approach can be amortized at several +levels of abstraction, and is particularly favorable for virtual screening +settings with a common binding pocket. We benchmark our scoring functions on +two simplified docking-related tasks: decoy pose scoring and rigid conformer +docking. Our method attains similar but faster performance on crystal +structures compared to the widely-used Vina and Gnina scoring functions, and is +more robust on computationally predicted structures. Code is available at +https://github.com/bjing2016/scalar-fields. + +
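A toy illustration of the FFT trick: scoring every rigid translation of a ligand field against a protein field via cross-correlation in O(N log N). The fields here are random single-channel 3D grids; the real method uses multi-channel fields produced by equivariant graph neural networks.

```python
import numpy as np

def translation_scores(protein_field, ligand_field):
    # cross-correlation over all cyclic shifts, computed with FFTs
    return np.real(
        np.fft.ifftn(np.fft.fftn(protein_field) * np.conj(np.fft.fftn(ligand_field)))
    )

grid = (32, 32, 32)
scores = translation_scores(np.random.randn(*grid), np.random.randn(*grid))
best_shift = np.unravel_index(np.argmax(scores), grid)  # highest-scoring translation
```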
+
+
+
+
+ + ☆ MIMo: A Multi-Modal Infant Model for Studying Cognitive Development + + +
+ Human intelligence and human consciousness emerge gradually during the +process of cognitive development. Understanding this development is an +essential aspect of understanding the human mind and may facilitate the +construction of artificial minds with similar properties. Importantly, human +cognitive development relies on embodied interactions with the physical and +social environment, which is perceived via complementary sensory modalities. +These interactions allow the developing mind to probe the causal structure of +the world. This is in stark contrast to common machine learning approaches, +e.g., for large language models, which are merely passively ``digesting'' large +amounts of training data, but are not in control of their sensory inputs. +However, computational modeling of the kind of self-determined embodied +interactions that lead to human intelligence and consciousness is a formidable +challenge. Here we present MIMo, an open-source multi-modal infant model for +studying early cognitive development through computer simulations. MIMo's body +is modeled after an 18-month-old child with detailed five-fingered hands. MIMo +perceives its surroundings via binocular vision, a vestibular system, +proprioception, and touch perception through a full-body virtual skin, while +two different actuation models allow control of his body. We describe the +design and interfaces of MIMo and provide examples illustrating its use. All +code is available at https://github.com/trieschlab/MIMo . + +
+
+ comment: 11 pages, 8 figures. Submitted to IEEE Transactions on Congnitive and + Developmental Systems (TCDS) +
+
+
+
+
+ + ☆ Stochastic-Constrained Stochastic Optimization with Markovian Data + + +
+ This paper considers stochastic-constrained stochastic optimization where the +stochastic constraint is to satisfy that the expectation of a random function +is below a certain threshold. In particular, we study the setting where data +samples are drawn from a Markov chain and thus are not independent and +identically distributed. We generalize the drift-plus-penalty framework, a +primal-dual stochastic gradient method developed for the i.i.d. case, to the +Markov chain sampling setting. We propose two variants of drift-plus-penalty; +one is for the case when the mixing time of the underlying Markov chain is +known while the other is for the case of unknown mixing time. In fact, our +algorithms apply to a more general setting of constrained online convex +optimization where the sequence of constraint functions follows a Markov chain. +Both algorithms are adaptive in that the first works without knowledge of the +time horizon while the second uses AdaGrad-style algorithm parameters, which is +of independent interest. We demonstrate the effectiveness of our proposed +methods through numerical experiments on classification with fairness +constraints. + +
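A generic drift-plus-penalty sketch for minimizing f(x) subject to E[g(x)] <= 0, assuming access to (stochastic) gradient oracles. The step size and penalty weight V are illustrative choices; the Markov-chain and AdaGrad-style variants proposed in the paper are not reproduced here.

```python
import numpy as np

def drift_plus_penalty(grad_f, grad_g, g_val, x0, steps=1000, eta=0.01, V=10.0):
    x, Q = np.asarray(x0, dtype=float), 0.0  # Q is the virtual constraint queue
    for _ in range(steps):
        # primal step: descend on V * f + Q * g
        x = x - eta * (V * grad_f(x) + Q * grad_g(x))
        # dual step: the queue grows whenever the sampled constraint is violated
        Q = max(0.0, Q + g_val(x))
    return x
```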
+
+
+
+
+ + ☆ Finding Interpretable Class-Specific Patterns through Efficient Neural + Search + + +
+ Discovering patterns in data that best describe the differences between
+classes allows us to hypothesize and reason about class-specific mechanisms. In
+molecular biology, for example, this bears promise of advancing the
+understanding of cellular processes differing between tissues or diseases,
+which could lead to novel treatments. To be useful in practice, methods that
+tackle the problem of finding such differential patterns have to be readily
+interpretable by domain experts, and scalable to the extremely high-dimensional
+data.
+ In this work, we propose a novel, inherently interpretable binary neural
+network architecture DiffNaps that extracts differential patterns from data.
+DiffNaps is scalable to hundreds of thousands of features and robust to noise,
+thus overcoming the limitations of current state-of-the-art methods in
+large-scale applications such as in biology. We show on synthetic and real
+world data, including three biological applications, that, unlike its
+competitors, DiffNaps consistently yields accurate, succinct, and interpretable
+class descriptions.
+
+
+
+
+
+ + ☆ A Structural-Clustering Based Active Learning for Graph Neural Networks + + +
+ In active learning for graph-structured data, Graph Neural Networks (GNNs) +have shown effectiveness. However, a common challenge in these applications is +the underutilization of crucial structural information. To address this +problem, we propose the Structural-Clustering PageRank method for improved +Active learning (SPA) specifically designed for graph-structured data. SPA +integrates community detection using the SCAN algorithm with the PageRank +scoring method for efficient and informative sample selection. SPA prioritizes +nodes that are not only informative but also central in structure. Through +extensive experiments, SPA demonstrates higher accuracy and macro-F1 score over +existing methods across different annotation budgets and achieves significant +reductions in query time. In addition, the proposed method only adds two +hyperparameters, $\epsilon$ and $\mu$ in the algorithm to finely tune the +balance between structural learning and node selection. This simplicity is a +key advantage in active learning scenarios, where extensive hyperparameter +tuning is often impractical. + +
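An illustrative sketch of structure-aware query selection: rank nodes by PageRank inside detected communities and pick the top nodes per community. Greedy modularity communities stand in here for the SCAN algorithm used by SPA, and the per-community quota is an assumption.

```python
import networkx as nx
from networkx.algorithms.community import greedy_modularity_communities

def select_queries(graph, budget):
    communities = list(greedy_modularity_communities(graph))
    pagerank = nx.pagerank(graph)
    picks = []
    for community in communities:
        # within each community, prefer the most central (highest PageRank) nodes
        ranked = sorted(community, key=lambda n: pagerank[n], reverse=True)
        picks.extend(ranked[: max(1, budget // len(communities))])
    return picks[:budget]

print(select_queries(nx.karate_club_graph(), budget=6))
```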
+
+
+
+
+ + ☆ Simulating the Air Quality Impact of Prescribed Fires Using a Graph + Neural Network-Based PM$_{2.5}$ Emissions Forecasting System NeurIPS 2023 + + +
+ The increasing size and severity of wildfires across western North America +have generated dangerous levels of PM$_{2.5}$ pollution in recent years. In a +warming climate, expanding the use of prescribed fires is widely considered to +be the most robust fire mitigation strategy. However, reliably forecasting the +potential air quality impact from these prescribed fires, a critical ingredient +in determining the fires' location and time, at hourly to daily time scales +remains a challenging problem. This paper proposes a novel integration of +prescribed fire simulation with a spatio-temporal graph neural network-based +PM$_{2.5}$ forecasting model. The experiments in this work focus on determining +the optimal time for implementing prescribed fires in California as well as +quantifying the potential air quality trade-offs involved in conducting more +prescribed fires outside the fire season. + +
+
+ comment: 4 + 3 pages; accepted to the Tackling Climate Change with Machine + Learning Workshop at NeurIPS 2023 +
+
+
+
+
+ + ☆ Factor-Assisted Federated Learning for Personalized Optimization with + Heterogeneous Data + + +
+ Federated learning is an emerging distributed machine learning framework
+aiming at protecting data privacy. Data heterogeneity is one of the core
+challenges in federated learning, which could severely degrade the convergence
+rate and prediction performance of deep neural networks. To address this issue,
+we develop a novel personalized federated learning framework for heterogeneous
+data, which we refer to as FedSplit. This modeling framework is motivated by
+the finding that data in different clients contain both common knowledge and
+personalized knowledge. Accordingly, the hidden elements in each neural layer
+can be split into shared and personalized groups. With this decomposition, a
+novel objective function is established and optimized. We demonstrate that
+FedSplit enjoys a faster convergence speed than the standard federated learning
+method both theoretically and empirically. The generalization bound of the
+FedSplit method is also studied. To practically implement the proposed method
+on real datasets, factor analysis is introduced to facilitate the decoupling of
+hidden elements. This leads to a practical implementation of FedSplit, which we
+further refer to as FedFac. We demonstrate through simulation studies that
+factor analysis can well recover the underlying shared/personalized
+decomposition. The superior prediction performance of FedFac is further
+verified empirically by comparison with various state-of-the-art federated
+learning methods on several real datasets.
+
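A conceptual sketch of the shared/personalized split: during aggregation only a designated subset of hidden units is averaged across clients, while the remaining units stay local. The choice of `shared_idx` is assumed to be given here; the paper obtains this decoupling via factor analysis.

```python
import numpy as np

def aggregate_with_split(client_weights, shared_idx):
    stacked = np.stack(client_weights)              # (clients, units, features)
    shared_mean = stacked[:, shared_idx].mean(axis=0)
    merged = []
    for weights in client_weights:
        personalized = weights.copy()
        personalized[shared_idx] = shared_mean      # overwrite only the shared units
        merged.append(personalized)                 # personalized units stay local
    return merged

clients = [np.random.randn(8, 4) for _ in range(3)]
updated = aggregate_with_split(clients, shared_idx=np.array([0, 1, 2, 3]))
```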
+
+ comment: 29 pages, 10 figures +
+
+
+
+
+ + ☆ Estimating Countries with Similar Maternal Mortality Rate using Cluster + Analysis and Pairing Countries with Identical MMR + + +
+ In an evolving world, enabling the younger generation to flourish is
+essential for regions to develop. Most of the population around the world is
+unaware of how the routines followed during pregnancy and the available
+hospital facilities affect maternal health. Maternal mortality is the death of
+a pregnant woman due to complications related to pregnancy, underlying
+conditions exacerbated by the pregnancy, or the management of these conditions.
+It is crucial to study the Maternal Mortality Rate (MMR) in diverse locations
+and determine which routines and hospital facilities help to diminish it. This
+research aims to identify the countries facing the highest MMR as well as
+countries that are similar in the MMR they encounter. Data from earlier years
+of observation is collected for various countries, and unsupervised machine
+learning is applied to perform cluster analysis. In this way, pairs of
+countries with similar MMR, as well as pairs at opposite extremes of MMR, are
+found.
+
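A minimal sketch of the clustering step: group countries by their MMR series with k-means and read off which countries fall in the same cluster. The country names, values, and number of clusters below are made-up illustrations, not the study's data.

```python
import numpy as np
from sklearn.cluster import KMeans

countries = ["Country A", "Country B", "Country C", "Country D"]
mmr_by_year = np.array([[400.0, 380.0, 350.0],
                        [ 20.0,  18.0,  15.0],
                        [395.0, 370.0, 360.0],
                        [ 22.0,  19.0,  16.0]])
labels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(mmr_by_year)
for cluster in sorted(set(labels)):
    print(cluster, [c for c, l in zip(countries, labels) if l == cluster])
```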
+
+ comment: 14 pages, 6 figures +
+
+
+
+
+ + ☆ Invariant Random Forest: Tree-Based Model Solution for OOD + Generalization + + +
+ Out-Of-Distribution (OOD) generalization is an essential topic in machine
+learning. However, recent research has focused only on the corresponding
+methods for neural networks. This paper introduces a novel and effective
+solution for OOD generalization of decision tree models, named Invariant
+Decision Tree (IDT). IDT enforces a penalty term with regard to the
+unstable/varying behavior of a split across different environments during the
+growth of the tree. Its ensemble version, the Invariant Random Forest (IRF), is
+constructed. Our proposed method is motivated by a theoretical result under
+mild conditions, and validated by numerical tests with both synthetic and real
+datasets. The superior performance compared to non-OOD tree models implies that
+considering OOD generalization for tree models is absolutely necessary and
+should be given more attention.
+
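A conceptual sketch of an environment-penalized split criterion: reward the average variance reduction of a split across environments and penalize how much that reduction varies between environments. The regression-style gain and the penalty weight `lam` are illustrative assumptions, not the paper's exact objective.

```python
import numpy as np

def invariant_split_gain(X, y, env, feature, threshold, lam=1.0):
    gains = []
    for e in np.unique(env):
        mask = env == e
        left = y[mask][X[mask, feature] <= threshold]
        right = y[mask][X[mask, feature] > threshold]
        if len(left) == 0 or len(right) == 0:
            continue  # the split is degenerate in this environment
        child_var = (len(left) * left.var() + len(right) * right.var()) / mask.sum()
        gains.append(y[mask].var() - child_var)
    if not gains:
        return -np.inf
    # average gain, penalized by its instability across environments
    return np.mean(gains) - lam * np.var(gains)
```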
+
+
+
+
+ + ☆ Graph Convolutions Enrich the Self-Attention in Transformers! + + +
+ Transformers, renowned for their self-attention mechanism, have achieved +state-of-the-art performance across various tasks in natural language +processing, computer vision, time-series modeling, etc. However, one of the +challenges with deep Transformer models is the oversmoothing problem, where +representations across layers converge to indistinguishable values, leading to +significant performance degradation. We interpret the original self-attention +as a simple graph filter and redesign it from a graph signal processing (GSP) +perspective. We propose graph-filter-based self-attention (GFSA) to learn a +general yet effective one, whose complexity, however, is slightly larger than +that of the original self-attention mechanism. We demonstrate that GFSA +improves the performance of Transformers in various fields, including computer +vision, natural language processing, graph pattern classification, speech +recognition, and code classification. + +
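A toy sketch of viewing the attention matrix A as a graph filter and applying a low-order matrix polynomial h(A) = w0*I + w1*A + w2*A^2 to the value vectors. The fixed polynomial coefficients are illustrative; GFSA learns its filter.

```python
import numpy as np

def graph_filter_attention(Q, K, V, w=(0.5, 1.0, 0.1)):
    scores = Q @ K.T / np.sqrt(Q.shape[-1])
    A = np.exp(scores - scores.max(axis=-1, keepdims=True))
    A = A / A.sum(axis=-1, keepdims=True)        # row-stochastic attention matrix
    H = w[0] * np.eye(A.shape[0]) + w[1] * A + w[2] * (A @ A)
    return H @ V

tokens, dim = 8, 16
Q, K, V = (np.random.randn(tokens, dim) for _ in range(3))
out = graph_filter_attention(Q, K, V)
```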
+
+
+
+
+ + ☆ CODEX: A Cluster-Based Method for Explainable Reinforcement Learning IJCAI + + +
+ Despite the impressive feats demonstrated by Reinforcement Learning (RL), +these algorithms have seen little adoption in high-risk, real-world +applications due to current difficulties in explaining RL agent actions and +building user trust. We present Counterfactual Demonstrations for Explanation +(CODEX), a method that incorporates semantic clustering, which can effectively +summarize RL agent behavior in the state-action space. Experimentation on the +MiniGrid and StarCraft II gaming environments reveals the semantic clusters +retain temporal as well as entity information, which is reflected in the +constructed summary of agent behavior. Furthermore, clustering the +discrete+continuous game-state latent representations identifies the most +crucial episodic events, demonstrating a relationship between the latent and +semantic spaces. This work contributes to the growing body of work that strives +to unlock the power of RL for widespread use by leveraging and extending +techniques from Natural Language Processing. + +
+
+ comment: Presented at the International Joint Conference on Artificial + Intelligence (IJCAI) 2023 Workshop on Explainable Artificial Intelligence + (XAI) +
+
+
+
+
+ + ☆ Guided Reconstruction with Conditioned Diffusion Models for Unsupervised + Anomaly Detection in Brain MRIs + + +
+ Unsupervised anomaly detection in Brain MRIs aims to identify abnormalities +as outliers from a healthy training distribution. Reconstruction-based +approaches that use generative models to learn to reconstruct healthy brain +anatomy are commonly used for this task. Diffusion models are an emerging class +of deep generative models that show great potential regarding reconstruction +fidelity. However, they face challenges in preserving intensity characteristics +in the reconstructed images, limiting their performance in anomaly detection. +To address this challenge, we propose to condition the denoising mechanism of +diffusion models with additional information about the image to reconstruct +coming from a latent representation of the noise-free input image. This +conditioning enables high-fidelity reconstruction of healthy brain structures +while aligning local intensity characteristics of input-reconstruction pairs. +We evaluate our method's reconstruction quality, domain adaptation features and +finally segmentation performance on publicly available data sets with various +pathologies. Using our proposed conditioning mechanism we can reduce the +false-positive predictions and enable a more precise delineation of anomalies +which significantly enhances the anomaly detection performance compared to +established state-of-the-art approaches to unsupervised anomaly detection in +brain MRI. Furthermore, our approach shows promise in domain adaptation across +different MRI acquisitions and simulated contrasts, a crucial property of +general anomaly detection methods. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Constrained Hierarchical Clustering via Graph Coarsening and Optimal + Cuts + + +
+ Motivated by extracting and summarizing relevant information in short +sentence settings, such as satisfaction questionnaires, hotel reviews, and +X/Twitter, we study the problem of clustering words in a hierarchical fashion. +In particular, we focus on the problem of clustering with horizontal and +vertical structural constraints. Horizontal constraints are typically +cannot-link and must-link among words, while vertical constraints are +precedence constraints among cluster levels. We overcome state-of-the-art +bottlenecks by formulating the problem in two steps: first, as a +soft-constrained regularized least-squares which guides the result of a +sequential graph coarsening algorithm towards the horizontal feasible set. +Then, flat clusters are extracted from the resulting hierarchical tree by +computing optimal cut heights based on the available constraints. We show that +the resulting approach compares very well with respect to existing algorithms +and is computationally light. + +
+
+ comment: 5 pages, appeared at the Asilomar Conference on Signals, Systems, and + Computer, 11/2023 +
+
+
+
+
+ + ☆ Wavelength-multiplexed Delayed Inputs for Memory Enhancement of + Microring-based Reservoir Computing + + +
+ We numerically demonstrate a silicon add-drop microring-based reservoir +computing scheme that combines parallel delayed inputs and wavelength division +multiplexing. The scheme solves memory-demanding tasks like time-series +prediction with good performance without requiring external optical feedback. + +
+
+ comment: 2 pages, 2 figures. Submitted to Conference on Lasers and + Electro-Optics (CLEO) 2024 +
+
+
+
+
+ + ☆ Language Model Knowledge Distillation for Efficient Question Answering + in Spanish + + +
+ Recent advances in the development of pre-trained Spanish language models
+have led to significant progress in many Natural Language Processing (NLP)
+tasks, such as question answering. However, the lack of efficient models
+imposes a barrier for the adoption of such models in resource-constrained
+environments. Therefore, smaller distilled models for the Spanish language
+could prove highly scalable and facilitate their further adoption on a variety
+of tasks and scenarios. In this work, we take one step in this direction by
+developing SpanishTinyRoBERTa, a compressed language model based on RoBERTa for
+efficient question answering in Spanish. To achieve this, we employ knowledge
+distillation from a large model onto a lighter model that allows for a wider
+implementation, even in areas with limited computational resources, whilst
+attaining negligible performance sacrifice. Our experiments show that the dense
+distilled model can still preserve the performance of its larger counterpart,
+while significantly increasing inference speed. This work serves as a starting
+point for further research and investigation of model compression efforts for
+Spanish language models across various NLP tasks.
+
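A hedged sketch of response-based knowledge distillation for extractive QA: the student matches temperature-softened teacher logits (KL term) while also fitting the gold answer positions. The temperature and loss weighting are illustrative assumptions and need not match the paper's recipe.

```python
import torch
import torch.nn.functional as F

def qa_distillation_loss(student_logits, teacher_logits, gold_positions, T=2.0, alpha=0.5):
    # logits: (batch, seq_len) scores for the answer start (or end) position
    kd = F.kl_div(
        F.log_softmax(student_logits / T, dim=-1),
        F.softmax(teacher_logits / T, dim=-1),
        reduction="batchmean",
    ) * (T * T)
    ce = F.cross_entropy(student_logits, gold_positions)
    return alpha * kd + (1 - alpha) * ce

student, teacher = torch.randn(4, 128), torch.randn(4, 128)
loss = qa_distillation_loss(student, teacher, torch.randint(0, 128, (4,)))
```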
+
+ comment: 6 pages, 2 tables +
+
+
+
+
+ + ☆ Coherent energy and force uncertainty in deep learning force fields NeurIPS 2023 + + +
+ In machine learning energy potentials for atomic systems, forces are commonly
+obtained as the negative derivative of the energy function with respect to
+atomic positions. To quantify aleatoric uncertainty in the predicted energies,
+a widely used modeling approach involves predicting both a mean and variance
+for each energy value. However, this model is not differentiable under the
+usual white noise assumption, so energy uncertainty does not naturally
+translate to force uncertainty. In this work we propose a machine learning
+potential energy model in which energy and force aleatoric uncertainty are
+linked through a spatially correlated noise process. We demonstrate our
+approach on an equivariant message passing neural network potential trained on
+energies and forces on two out-of-equilibrium molecular datasets. Furthermore,
+we also show how to obtain epistemic uncertainties in this setting based on a
+Bayesian interpretation of deep ensemble models.
+
+
+ comment: Presented at Advancing Molecular Machine Learning - Overcoming + Limitations [ML4Molecules], ELLIS workshop, VIRTUAL, December 8, 2023, + unofficial NeurIPS 2023 side-event +
+
+
+
+
+ + ☆ A novel feature selection framework for incomplete data + + +
+ Feature selection on incomplete datasets is an exceptionally challenging +task. Existing methods address this challenge by first employing imputation +methods to complete the incomplete data and then conducting feature selection +based on the imputed data. Since imputation and feature selection are entirely +independent steps, the importance of features cannot be considered during +imputation. However, in real-world scenarios or datasets, different features +have varying degrees of importance. To address this, we propose a novel +incomplete data feature selection framework that considers feature importance. +The framework mainly consists of two alternating iterative stages: the M-stage +and the W-stage. In the M-stage, missing values are imputed based on a given +feature importance vector and multiple initial imputation results. In the +W-stage, an improved reliefF algorithm is employed to learn the feature +importance vector based on the imputed data. Specifically, the feature +importance vector obtained in the current iteration of the W-stage serves as +input for the next iteration of the M-stage. Experimental results on both +artificially generated and real incomplete datasets demonstrate that the +proposed method outperforms other approaches significantly. + +
+
+
+
+
+ + ☆ Augmentation-Free Dense Contrastive Knowledge Distillation for Efficient + Semantic Segmentation NeurIPS 2023 + + +
+ In recent years, knowledge distillation methods based on contrastive learning
+have achieved promising results on image classification and object detection
+tasks. However, in this line of research, we note that less attention is paid
+to semantic segmentation. Existing methods heavily rely on data augmentation
+and a memory buffer, which entail high computational resource demands when
+applied to semantic segmentation, a task that requires preserving
+high-resolution feature maps for making dense pixel-wise predictions. In order
+to address this problem, we present Augmentation-free Dense Contrastive
+Knowledge Distillation (Af-DCD), a new contrastive distillation learning
+paradigm to train compact and accurate deep neural networks for semantic
+segmentation applications. Af-DCD leverages a masked feature mimicking
+strategy, and formulates a novel contrastive learning loss by taking advantage
+of tactful feature partitions across both channel and spatial dimensions,
+enabling effective transfer of dense and structured local knowledge learnt by
+the teacher model to a target student model while maintaining training
+efficiency. Extensive experiments on five mainstream benchmarks with various
+teacher-student network pairs demonstrate the effectiveness of our approach.
+For instance, the DeepLabV3-Res18|DeepLabV3-MBV2 model trained by Af-DCD
+reaches 77.03%|76.38% mIOU on Cityscapes dataset when choosing DeepLabV3-Res101
+as the teacher, setting new performance records. Besides that, Af-DCD achieves
+an absolute mIOU improvement of 3.26%|3.04%|2.75%|2.30%|1.42% compared with
+individually trained counterparts on Cityscapes|Pascal
+VOC|Camvid|ADE20K|COCO-Stuff-164K. Code is available at
+https://github.com/OSVAI/Af-DCD
+
+
+ comment: The paper of Af-DCD is accepted to NeurIPS 2023. Code and models are + available at https://github.com/OSVAI/Af-DCD +
+
+
+
+
+ + ☆ Mixture of Dynamical Variational Autoencoders for Multi-Source + Trajectory Modeling and Separation + + +
+ In this paper, we propose a latent-variable generative model called mixture
+of dynamical variational autoencoders (MixDVAE) to model the dynamics of a
+system composed of multiple moving sources. A DVAE model is pre-trained on a
+single-source dataset to capture the source dynamics. Then, multiple instances
+of the pre-trained DVAE model are integrated into a multi-source mixture model
+with a discrete observation-to-source assignment latent variable. The posterior
+distributions of both the discrete observation-to-source assignment variable
+and the continuous DVAE variables representing the sources' content/position are
+estimated using a variational expectation-maximization algorithm, leading to
+multi-source trajectory estimation. We illustrate the versatility of the
+proposed MixDVAE model on two tasks: a computer vision task, namely
+multi-object tracking, and an audio processing task, namely single-channel
+audio source separation. Experimental results show that the proposed method
+works well on these two tasks, and outperforms several baseline methods.
+
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2202.09315 +
+
+
+
+
+ + ☆ Improving Communication Efficiency of Federated Distillation via + Accumulating Local Updates + + +
+ As an emerging federated learning paradigm, federated distillation enables +communication-efficient model training by transmitting only small-scale +knowledge during the learning process. To further improve the communication +efficiency of federated distillation, we propose a novel technique, ALU, which +accumulates multiple rounds of local updates before transferring the knowledge +to the central server. ALU drastically decreases the frequency of communication +in federated distillation, thereby significantly reducing the communication +overhead during the training process. Empirical experiments demonstrate the +substantial effect of ALU in improving the communication efficiency of +federated distillation. + +
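+ A hypothetical sketch of the accumulation idea follows; the function names and
+the soft-label form of the transferred knowledge are assumptions made for
+illustration, not details from the paper.
+```python
+import numpy as np
+
+def train_with_alu(local_soft_labels_fn, public_x, total_rounds=20,
+                   rounds_per_upload=4):
+    # Knowledge (soft labels on a shared public set) is accumulated locally for
+    # several rounds and uploaded only once per window, cutting communication.
+    uploads, acc = [], None
+    for r in range(1, total_rounds + 1):
+        knowledge = local_soft_labels_fn(public_x)       # one local round
+        acc = knowledge if acc is None else acc + knowledge
+        if r % rounds_per_upload == 0:
+            uploads.append(acc / rounds_per_upload)      # single upload per window
+            acc = None
+    return uploads
+
+# usage with a dummy local model
+uploads = train_with_alu(lambda x: np.tanh(x @ np.ones((3, 4))),
+                         np.random.randn(8, 3))
+```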
+
+ comment: 2 pages, 3 figures +
+
+
+
+
+ + ☆ Multi-scale Residual Transformer for VLF Lightning Transients + Classification + + +
+ The utilization of Very Low Frequency (VLF) electromagnetic signals in
+navigation systems is widespread. However, the non-stationary behavior of
+lightning signals can affect VLF electromagnetic signal transmission.
+Accurately classifying lightning signals is important for reducing interference
+and noise in VLF, thereby improving the reliability and overall performance of
+navigation systems. In recent years, the evolution of deep learning,
+specifically Convolutional Neural Networks (CNNs), has sparked a transformation
+in lightning classification, surpassing traditional statistical methodologies.
+Existing CNN models have limitations as they overlook the diverse attributes of
+lightning signals across different scales and neglect the significance of
+temporal sequencing in sequential signals. This study introduces an innovative
+multi-scale residual transformer (MRTransformer) that not only discerns
+intricate fine-grained patterns but also weighs the significance of different
+aspects within the input lightning signal sequence. The model captures the
+attributes of the lightning signal across different scales and reaches 90%
+accuracy in the classification. In future work, this model could be applied to
+a comprehensive understanding of the localization and waveform characteristics
+of lightning signals.
+
+
+
+
+
+ + ☆ Zero-Touch Networks: Towards Next-Generation Network Automation + + +
+ The Zero-touch network and Service Management (ZSM) framework represents an
+emerging paradigm in the management of the fifth-generation (5G) and Beyond
+(5G+) networks, offering automated self-management and self-healing
+capabilities to address the escalating complexity and the growing data volume
+of modern networks. ZSM frameworks leverage advanced technologies such as
+Machine Learning (ML) to enable intelligent decision-making and reduce human
+intervention. This paper presents a comprehensive survey of Zero-Touch Networks
+(ZTNs) within the ZSM framework, covering network optimization, traffic
+monitoring, energy efficiency, and security aspects of next-generation
+networks. The paper explores the challenges associated with ZSM, particularly
+those related to ML, which necessitate exploring diverse network automation
+solutions. In this context, the study investigates the application
+of Automated ML (AutoML) in ZTNs, to reduce network management costs and
+enhance performance. AutoML automates the selection and tuning process of an ML
+model for a given task. Specifically, the focus is on AutoML's ability to
+predict application throughput and autonomously adapt to data drift.
+Experimental results demonstrate the superiority of the proposed AutoML
+pipeline over traditional ML in terms of prediction accuracy. Integrating
+AutoML and ZSM concepts significantly reduces network configuration and
+management efforts, allowing operators to allocate more time and resources to
+other important tasks. The paper also provides a high-level 5G system
+architecture incorporating AutoML and ZSM concepts. This research highlights
+the potential of ZTNs and AutoML to revolutionize the management of 5G+
+networks, enabling automated decision-making and empowering network operators
+to achieve higher efficiency, improved performance, and enhanced user
+experience.
+
+
+ comment: 115 pages, 12 figures, 15 tables, submitted to Computer Networks +
+
+
+
+
+ + ☆ Resource Allocation for Semantic Communication under Physical-layer + Security + + +
+ Semantic communication is deemed a revolution of Shannon's paradigm in
+sixth-generation (6G) wireless networks. It aims at transmitting the extracted
+information rather than the original data, which receivers will try to recover.
+Intuitively, the more information is extracted, the longer the latency of
+semantic communication will be. Moreover, more extracted information results in
+more accurate reconstructed information, thereby yielding a higher utility of
+the semantic communication system. Shorter latency and higher utility are
+desirable objectives for the system, so there will be a trade-off between
+utility and latency. This paper proposes a joint optimization algorithm for
+total latency and utility. Moreover, security is essential for the semantic
+communication system. We incorporate the secrecy rate, a physical-layer
+security method, into the optimization problem. The secrecy rate is the
+communication rate at which no information is disclosed to an eavesdropper.
+Experimental results demonstrate that the proposed algorithm obtains the best
+joint optimization performance compared to the baselines.
+
+
+ comment: This paper appears in IEEE Global Communications Conference + (GLOBECOM) 2023 +
+
+
+
+
+ + ☆ Diffusing Colors: Image Colorization with Text Guided Diffusion SIGGRAPH + + +
+ The colorization of grayscale images is a complex and subjective task with +significant challenges. Despite recent progress in employing large-scale +datasets with deep neural networks, difficulties with controllability and +visual quality persist. To tackle these issues, we present a novel image +colorization framework that utilizes image diffusion techniques with granular +text prompts. This integration not only produces colorization outputs that are +semantically appropriate but also greatly improves the level of control users +have over the colorization process. Our method provides a balance between +automation and control, outperforming existing techniques in terms of visual +quality and semantic coherence. We leverage a pretrained generative Diffusion +Model, and show that we can finetune it for the colorization task without +losing its generative power or attention to text prompts. Moreover, we present +a novel CLIP-based ranking model that evaluates color vividness, enabling +automatic selection of the most suitable level of vividness based on the +specific scene semantics. Our approach holds potential particularly for color +enhancement and historical image colorization. + +
+
+ comment: SIGGRAPH Asia 2023 +
+
+
+
+
+ + ☆ TimeDRL: Disentangled Representation Learning for Multivariate + Time-Series + + +
+ Multivariate time-series data in numerous real-world applications (e.g.,
+healthcare and industry) are informative but challenging due to the lack of
+labels and high dimensionality. Recent studies in self-supervised learning have
+shown their potential in learning rich representations without relying on
+labels, yet they fall short in learning disentangled embeddings and addressing
+issues of inductive bias (e.g., transformation-invariance). To tackle these
+challenges, we propose TimeDRL, a generic multivariate time-series
+representation learning framework with disentangled dual-level embeddings.
+TimeDRL is characterized by three novel features: (i) disentangled derivation
+of timestamp-level and instance-level embeddings from patched time-series data
+using a [CLS] token strategy; (ii) utilization of timestamp-predictive and
+instance-contrastive tasks for disentangled representation learning, with the
+former optimizing timestamp-level embeddings with predictive loss, and the
+latter optimizing instance-level embeddings with contrastive loss; and (iii)
+avoidance of augmentation methods to eliminate inductive biases, such as
+transformation-invariance from cropping and masking. Comprehensive experiments
+on 6 time-series forecasting datasets and 5 time-series classification datasets
+have shown that TimeDRL consistently surpasses existing representation learning
+approaches, achieving an average improvement of 57.98% in MSE for forecasting
+and 1.25% in accuracy for classification. Furthermore, extensive ablation
+studies confirmed the relative contribution of each component in TimeDRL's
+architecture, and semi-supervised learning evaluations demonstrated its
+effectiveness in real-world scenarios, even with limited labeled data.
+
+
+ comment: This paper is currently under review. The code will be made available + upon acceptance +
+
+
+
+
+ + ☆ A Novel Federated Learning-based Intrusion Detection System for Flying + Ad Hoc Networks + + +
+ Unmanned aerial vehicles (UAVs) in flying ad-hoc networks (FANETs) face +security challenges due to the dynamic and distributed nature of these +networks. This paper presents the Federated Learning-based Intrusion Detection +System (FL-IDS), an innovative approach designed to improve FANET security. +FL-IDS leverages federated learning to address privacy concerns of centralized +intrusion detection systems. FL-IDS operates in a decentralized manner, +enabling UAVs to collaboratively train a global intrusion detection model +without sharing raw data. Local models are assigned to each UAV, using +client-specific data, and only updated model weights are shared with a central +server. This preserves privacy while utilizing collective intelligence for +effective intrusion detection. Experimental results show FL-IDS's competitive +performance with Central IDS (C-IDS) while mitigating privacy concerns. The +Bias Towards Specific Clients (BTSC) method further enhances FL-IDS +performance, surpassing C-IDS even at lower attacker ratios. A comparative +analysis with traditional intrusion detection methods, including Local IDS +(L-IDS), provides insights into FL-IDS's strengths. This study significantly +contributes to FANET security by introducing a privacy-aware, decentralized +intrusion detection approach tailored to the unique challenges of UAV networks. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ Caregiver Talk Shapes Toddler Vision: A Computational Study of Dyadic + Play + + +
+ Infants' ability to recognize and categorize objects develops gradually. The
+second year of life is marked by both the emergence of more semantic visual
+representations and a better understanding of word meaning. This suggests that
+language input may play an important role in shaping visual representations.
+However, even in suitable contexts for word learning like dyadic play sessions,
+caregivers' utterances are sparse and ambiguous, often referring to objects that
+are different from the one to which the child attends. Here, we systematically
+investigate to what extent caregivers' utterances can nevertheless enhance
+visual representations. For this, we propose a computational model of visual
+representation learning during dyadic play. We introduce a synthetic dataset of
+ego-centric images perceived by a toddler-agent that moves and rotates toy
+objects in different parts of its home environment while hearing caregivers'
+utterances, modeled as captions. We propose to model toddlers' learning as
+simultaneously aligning representations for 1) close-in-time images and 2)
+co-occurring images and utterances. We show that utterances with statistics
+matching those of real caregivers give rise to representations supporting
+improved category recognition. Our analysis reveals that a small
+decrease/increase in object-relevant naming frequencies can drastically impact
+the learned representations. This affects the attention on object names within
+an utterance, which is required for efficient visuo-linguistic alignment.
+Overall, our results support the hypothesis that caregivers' naming utterances
+can improve toddlers' visual representations.
+
+
+ comment: Proceedings of the 2023 IEEE International Conference on Development + and Learning (ICDL) +
+
+
+
+
+ + ☆ Breaking the Entanglement of Homophily and Heterophily in + Semi-supervised Node Classification + + +
+ Recently, graph neural networks (GNNs) have shown prominent performance in +semi-supervised node classification by leveraging knowledge from the graph +database. However, most existing GNNs follow the homophily assumption, where +connected nodes are more likely to exhibit similar feature distributions and +the same labels, and such an assumption has proven to be vulnerable in a +growing number of practical applications. As a supplement, heterophily reflects +dissimilarity in connected nodes, which has gained significant attention in +graph learning. To this end, data engineers aim to develop a powerful GNN model +that can ensure performance under both homophily and heterophily. Despite +numerous attempts, most existing GNNs struggle to achieve optimal node +representations due to the constraints of undirected graphs. The neglect of +directed edges results in sub-optimal graph representations, thereby hindering +the capacity of GNNs. To address this issue, we introduce AMUD, which +quantifies the relationship between node profiles and topology from a +statistical perspective, offering valuable insights for \underline{A}daptively +\underline{M}odeling the natural directed graphs as the \underline{U}ndirected +or \underline{D}irected graph to maximize the benefits from subsequent graph +learning. Furthermore, we propose \underline{A}daptive \underline{D}irected +\underline{P}attern \underline{A}ggregation (ADPA) as a new directed graph +learning paradigm for AMUD. Empirical studies have demonstrated that AMUD +guides efficient graph learning. Meanwhile, extensive experiments on 14 +benchmark datasets substantiate the impressive performance of ADPA, +outperforming baselines by significant margins of 3.96\%. + +
+
+ comment: 14 pages, 7 figures +
+
+
+
+
+ + ☆ Small Area Estimation of Case Growths for Timely COVID-19 Outbreak + Detection + + +
+ The COVID-19 pandemic has exerted a profound impact on the global economy and +continues to exact a significant toll on human lives. The COVID-19 case growth +rate stands as a key epidemiological parameter to estimate and monitor for +effective detection and containment of the resurgence of outbreaks. A +fundamental challenge in growth rate estimation and hence outbreak detection is +balancing the accuracy-speed tradeoff, where accuracy typically degrades with +shorter fitting windows. In this paper, we develop a machine learning (ML) +algorithm, which we call Transfer Learning Generalized Random Forest (TLGRF), +that balances this accuracy-speed tradeoff. Specifically, we estimate the +instantaneous COVID-19 exponential growth rate for each U.S. county by using +TLGRF that chooses an adaptive fitting window size based on relevant day-level +and county-level features affecting the disease spread. Through transfer +learning, TLGRF can accurately estimate case growth rates for counties with +small sample sizes. Out-of-sample prediction analysis shows that TLGRF +outperforms established growth rate estimation methods. Furthermore, we +conducted a case study based on outbreak case data from the state of Colorado +and showed that the timely detection of outbreaks could have been improved by +up to 224% using TLGRF when compared to the decisions made by Colorado's +Department of Health and Environment (CDPHE). To facilitate implementation, we +have developed a publicly available outbreak detection tool for timely +detection of COVID-19 outbreaks in each U.S. county, which received substantial +attention from policymakers. + +
+
+ comment: Equal contributions by co-first authors Zhaowei She, Zilong Wang (in + alphabetical order) +
+
+
+
+
+ + ☆ Enhancing the Rationale-Input Alignment for Self-explaining + Rationalization ICDE 2024 + + +
+ Rationalization empowers deep learning models with self-explaining +capabilities through a cooperative game, where a generator selects a +semantically consistent subset of the input as a rationale, and a subsequent +predictor makes predictions based on the selected rationale. In this paper, we +discover that rationalization is prone to a problem named \emph{rationale +shift}, which arises from the algorithmic bias of the cooperative game. +Rationale shift refers to a situation where the semantics of the selected +rationale may deviate from the original input, but the predictor still produces +accurate predictions based on the deviation, resulting in a compromised +generator with misleading feedback. + To address this issue, we first demonstrate the importance of the alignment +between the rationale and the full input through both empirical observations +and theoretical analysis. Subsequently, we introduce a novel approach called +DAR (\textbf{D}iscriminatively \textbf{A}ligned \textbf{R}ationalization), +which utilizes an auxiliary module pretrained on the full input to +discriminatively align the selected rationale and the original input. We +theoretically illustrate how DAR accomplishes the desired alignment, thereby +overcoming the rationale shift problem. The experiments on two widely used +real-world benchmarks show that the proposed method significantly improves the +explanation quality (measured by the overlap between the model-selected +explanation and the human-annotated rationale) as compared to state-of-the-art +techniques. Additionally, results on two synthetic settings further validate +the effectiveness of DAR in addressing the rationale shift problem. + +
+
+ comment: Accepted at ICDE 2024
+
+
+
+
+
+ + ☆ Learn to Unlearn for Deep Neural Networks: Minimizing Unlearning + Interference with Gradient Projection WACV 2024 + + +
+ Recent data-privacy laws have sparked interest in machine unlearning, which
+involves removing the effect of specific training samples from a learnt model
+as if they were never present in the original training dataset. The challenge
+of machine unlearning is to discard information about the ``forget'' data in
+the learnt model without altering the knowledge about the remaining dataset and
+to do so more efficiently than the naive retraining approach. To achieve this,
+we adopt a projected-gradient based learning method, named
+Projected-Gradient Unlearning (PGU), in which the model takes steps in the
+direction orthogonal to the gradient subspaces deemed unimportant for the
+retaining dataset, so that its knowledge is preserved. By utilizing Stochastic
+Gradient Descent (SGD) to update the model weights, our method can efficiently
+scale to any model and dataset size. We provide empirical evidence to
+demonstrate that our unlearning method can produce models that behave similarly
+to models retrained from scratch across various metrics even when the training
+dataset is no longer accessible. Our code is available at
+https://github.com/hnanhtuan/projected_gradient_unlearning.
+
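+ One way such a projection step can look in practice is sketched below; this is
+an assumption-laden illustration rather than the released code: a protected
+subspace is estimated from retained-data gradients and the unlearning update is
+stripped of any component inside it, so that knowledge tied to the retained
+data is left undisturbed.
+```python
+import torch
+
+def project_out(grad, basis):
+    # Remove the component of `grad` lying in span(basis); columns of `basis`
+    # are assumed orthonormal, so the update cannot disturb that subspace.
+    return grad - basis @ (basis.T @ grad)
+
+# Toy usage: estimate a protected subspace from retained-data gradients.
+retained_grads = torch.randn(10, 32)            # 10 params x 32 sample gradients
+U, _, _ = torch.linalg.svd(retained_grads, full_matrices=False)
+protected = U[:, :4]                            # top singular directions
+unlearn_grad = torch.randn(10)
+step = project_out(unlearn_grad, protected)     # used in place of the raw gradient
+```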
+
+ comment: Accepted to WACV 2024 +
+
+
+
+
+ + ☆ On the adaptation of in-context learners for system identification + + +
+ In-context system identification aims at constructing meta-models to describe +classes of systems, differently from traditional approaches that model single +systems. This paradigm facilitates the leveraging of knowledge acquired from +observing the behaviour of different, yet related dynamics. This paper +discusses the role of meta-model adaptation. Through numerical examples, we +demonstrate how meta-model adaptation can enhance predictive performance in +three realistic scenarios: tailoring the meta-model to describe a specific +system rather than a class; extending the meta-model to capture the behaviour +of systems beyond the initial training class; and recalibrating the model for +new prediction tasks. Results highlight the effectiveness of meta-model +adaptation to achieve a more robust and versatile meta-learning framework for +system identification. + +
+
+
+
+
+ + ☆ A Transformer Model for Symbolic Regression towards Scientific Discovery NeurIPS2023 + + +
+ Symbolic Regression (SR) searches for mathematical expressions which best
+describe numerical datasets. This makes it possible to circumvent interpretation
+issues inherent to artificial neural networks, but SR algorithms are often
+computationally expensive. This work proposes a new Transformer model aimed at
+Symbolic Regression, particularly focused on its application to Scientific
+Discovery. We propose three encoder architectures with increasing flexibility
+but at the cost of column-permutation equivariance violation. Training results
+indicate that the most flexible architecture is required to prevent
+overfitting. Once trained, we apply our best model to the SRSD datasets
+(Symbolic Regression for Scientific Discovery datasets), which yields
+state-of-the-art results using the normalized tree-based edit distance, at no
+extra computational cost.
+
+
+ comment: Accepted for oral presentation at NeurIPS2023 AI4Science Workshop. + OpenReview: https://openreview.net/forum?id=AIfqWNHKjo +
+
+
+
+
+ + ☆ Making Translators Privacy-aware on the User's Side + + +
+ We propose PRISM to enable users of machine translation systems to preserve
+the privacy of data on their own initiative. There is a growing demand to apply
+machine translation systems to data that require privacy protection. While
+several machine translation engines claim to prioritize privacy, the extent and
+specifics of such protection are largely ambiguous. First, there is often a
+lack of clarity on how and to what degree the data is protected. Even if
+service providers believe they have sufficient safeguards in place,
+sophisticated adversaries might still extract sensitive information. Second,
+vulnerabilities may exist outside of these protective measures, such as within
+communication channels, potentially leading to data leakage. As a result, users
+are hesitant to utilize machine translation engines for data demanding high
+levels of privacy protection, thereby missing out on their benefits. PRISM
+resolves this problem. Instead of relying on the translation service to keep
+data safe, PRISM provides the means to protect data on the user's side. This
+approach ensures that even machine translation engines with inadequate privacy
+measures can be used securely. For platforms already equipped with privacy
+safeguards, PRISM acts as an additional protection layer, further reinforcing
+their security. PRISM adds these privacy features without significantly
+compromising translation accuracy. Our experiments demonstrate the
+effectiveness of PRISM using real-world translators, T5 and ChatGPT
+(GPT-3.5-turbo), and datasets in two languages. PRISM effectively
+balances privacy protection with translation accuracy.
+
+
+
+
+
+ + ☆ MeanCut: A Greedy-Optimized Graph Clustering via Path-based Similarity + and Degree Descent Criterion + + +
+ As the most typical graph clustering method, spectral clustering is popular
+and attractive due to its remarkable performance, easy implementation, and
+strong adaptability. Classical spectral clustering measures the edge weights of
+a graph using a pairwise Euclidean-based metric, and solves the optimal graph
+partition by relaxing the constraints of the indicator matrix and performing
+Laplacian decomposition. However, Euclidean-based similarity might cause skewed
+graph cuts when handling non-spherical data distributions, and the relaxation
+strategy introduces information loss. Meanwhile, spectral clustering requires
+specifying the number of clusters, which is hard to determine without enough
+prior knowledge. In this work, we leverage the path-based similarity to enhance
+intra-cluster associations, and propose MeanCut as the objective function and
+greedily optimize it in degree descending order for a nondestructive graph
+partition. This algorithm enables the identification of arbitrarily shaped
+clusters and is robust to noise. To reduce the computational complexity of
+similarity calculation, we transform optimal path search into generating the
+maximum spanning tree (MST), and develop a fast MST (FastMST) algorithm to
+further improve its time-efficiency. Moreover, we define a density gradient
+factor (DGF) for separating weakly connected clusters. The validity of our
+algorithm is demonstrated by tests on real-world benchmarks and a face
+recognition application. The source code of MeanCut is available at
+https://github.com/ZPGuiGroupWhu/MeanCut-Clustering.
+
+
+ comment: 17 pages, 8 figures, 6 tables +
+
+
+
+
+ + ☆ A Robust and Efficient Boundary Point Detection Method by Measuring + Local Direction Dispersion + + +
+ Boundary points pose a significant challenge for machine learning tasks,
+including classification, clustering, and dimensionality reduction. Due to the
+similarity of features, boundary areas can result in mixed-up classes or
+clusters, leading to a crowding problem in dimensionality reduction. To address
+this challenge, numerous boundary point detection methods have been developed,
+but they are insufficient to accurately and efficiently identify the boundary
+points in non-convex structures and high-dimensional manifolds. In this work,
+we propose a robust and efficient method for detecting boundary points using
+Local Direction Dispersion (LoDD). LoDD considers that internal points are
+surrounded by neighboring points in all directions, while neighboring points of
+a boundary point tend to be distributed only in a certain directional range.
+LoDD adopts a density-independent K-Nearest Neighbors (KNN) method to determine
+neighboring points, and defines a statistic-based metric using the eigenvalues
+of the covariance matrix of KNN coordinates to measure the centrality of a
+query point. We demonstrate the validity of LoDD on five synthetic datasets
+(2-D and 3-D) and ten real-world benchmarks, and test its clustering
+performance by pairing it with two typical clustering methods, K-means and
+Ncut. Our results show that LoDD achieves promising and robust detection
+accuracy in a time-efficient manner.
+
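+ A rough sketch of the idea follows; the exact statistic in the paper may
+differ, and this version simply scores each point by the spectral entropy of
+its k-NN covariance so that boundary points (skewed spectra) score low.
+```python
+import numpy as np
+
+def knn_centrality(X, k=10):
+    # Interior points see neighbours in all directions (even spectrum, high
+    # entropy); boundary points see a skewed spectrum (low entropy).
+    n = X.shape[0]
+    scores = np.empty(n)
+    for i in range(n):
+        d = np.linalg.norm(X - X[i], axis=1)
+        idx = np.argsort(d)[1:k + 1]             # k nearest, excluding the point
+        centred = X[idx] - X[i]
+        eig = np.clip(np.linalg.eigvalsh(np.cov(centred, rowvar=False)), 0, None)
+        p = eig / eig.sum()
+        scores[i] = -(p * np.log(p + 1e-12)).sum()
+    return scores                                 # low score -> boundary candidate
+
+scores = knn_centrality(np.random.randn(200, 2), k=15)
+```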
+
+ comment: 11 pages, 6 figures, 3 tables +
+
+
+
+
+ + ☆ DiscoBAX: Discovery of Optimal Intervention Sets in Genomic Experiment + Design + + +
+ The discovery of therapeutics to treat genetically-driven pathologies relies +on identifying genes involved in the underlying disease mechanisms. Existing +approaches search over the billions of potential interventions to maximize the +expected influence on the target phenotype. However, to reduce the risk of +failure in future stages of trials, practical experiment design aims to find a +set of interventions that maximally change a target phenotype via diverse +mechanisms. We propose DiscoBAX, a sample-efficient method for maximizing the +rate of significant discoveries per experiment while simultaneously probing for +a wide range of diverse mechanisms during a genomic experiment campaign. We +provide theoretical guarantees of approximate optimality under standard +assumptions, and conduct a comprehensive experimental evaluation covering both +synthetic as well as real-world experimental design tasks. DiscoBAX outperforms +existing state-of-the-art methods for experimental design, selecting effective +and diverse perturbations in biological systems. + +
+
+
+
+
+ + ☆ Jointly spatial-temporal representation learning for individual + trajectories + + +
+ Individual trajectories, containing substantial information on
+human-environment interactions across space and time, are a crucial input for
+geospatial foundation models (GeoFMs). However, existing attempts leveraging
+trajectory data for various applications have overlooked the implicit
+spatial-temporal dependency within trajectories and failed to encode and
+represent it in a format friendly to deep learning, posing a challenge in
+obtaining general-purpose trajectory representations. Therefore, this paper
+proposes a spatial-temporal joint representation learning method (ST-GraphRL)
+to formalize learnable spatial-temporal dependencies into trajectory
+representations. The proposed ST-GraphRL consists of three compositions: (i) a
+weighted directed spatial-temporal graph to explicitly construct mobility
+interactions over both space and time dimensions; (ii) a two-stage joint
+encoder (i.e., decoupling and fusion) to learn entangled spatial-temporal
+dependencies by independently decomposing and jointly aggregating space and
+time information; (iii) a decoder that guides ST-GraphRL to learn explicit
+mobility regularities by simulating the spatial-temporal distributions of
+trajectories. Tested on three real-world human mobility datasets, the proposed
+ST-GraphRL outperformed all the baseline models in predicting movement
+spatial-temporal distributions and preserving trajectory similarity with high
+spatial-temporal correlations. We also explore how spatial-temporal features
+are presented in the latent space, validating that ST-GraphRL understands
+spatial-temporal patterns. This method is also transferable to general-purpose
+geospatial data representations for broad downstream tasks, as well as
+advancing the development of GeoFMs.
+
+
+ comment: 27 pages, 3 tables, 7 figures +
+
+
+
+
+ + ☆ Reconstruction of dynamical systems from data without time labels + + +
+ In this paper, we study how to reconstruct dynamical systems from data
+without time labels. Data without time labels appear in many applications, such
+as molecular dynamics, single-cell RNA sequencing, etc. Reconstruction of
+dynamical systems from time sequence data has been studied extensively.
+However, these methods do not apply if time labels are unknown. Without time
+labels, sequence data becomes distribution data. Based on this observation, we
+propose to treat the data as samples from a probability distribution and try to
+reconstruct the underlying dynamical system by minimizing a distribution loss,
+more specifically the sliced Wasserstein distance. Extensive experiment
+results demonstrate the effectiveness of the proposed method.
+
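+ For reference, a minimal Monte-Carlo sliced Wasserstein distance, usable as
+the distribution loss described above, can be sketched as follows (equal sample
+sizes are assumed for simplicity; the paper's training setup is not reproduced):
+```python
+import numpy as np
+
+def sliced_wasserstein(a, b, n_proj=100, seed=0):
+    # Sliced Wasserstein-2 distance between samples a, b of shape (n, d):
+    # project onto random directions, sort, and compare the 1-D quantiles.
+    rng = np.random.default_rng(seed)
+    d = a.shape[1]
+    total = 0.0
+    for _ in range(n_proj):
+        theta = rng.normal(size=d)
+        theta /= np.linalg.norm(theta)
+        pa, pb = np.sort(a @ theta), np.sort(b @ theta)
+        total += np.mean((pa - pb) ** 2)
+    return total / n_proj
+
+loss = sliced_wasserstein(np.random.randn(500, 3), np.random.randn(500, 3))
+```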
+
+
+
+
+ + ☆ DiffusionPhase: Motion Diffusion in Frequency Domain + + +
+ In this study, we introduce a learning-based method for generating +high-quality human motion sequences from text descriptions (e.g., ``A person +walks forward"). Existing techniques struggle with motion diversity and smooth +transitions in generating arbitrary-length motion sequences, due to limited +text-to-motion datasets and the pose representations used that often lack +expressiveness or compactness. To address these issues, we propose the first +method for text-conditioned human motion generation in the frequency domain of +motions. We develop a network encoder that converts the motion space into a +compact yet expressive parameterized phase space with high-frequency details +encoded, capturing the local periodicity of motions in time and space with high +accuracy. We also introduce a conditional diffusion model for predicting +periodic motion parameters based on text descriptions and a start pose, +efficiently achieving smooth transitions between motion sequences associated +with different text descriptions. Experiments demonstrate that our approach +outperforms current methods in generating a broader variety of high-quality +motions, and synthesizing long sequences with natural transitions. + +
+
+
+
+
+ + ☆ RoAST: Robustifying Language Models via Adversarial Perturbation with + Selective Training EMNLP 2023 + + +
+ Fine-tuning pre-trained language models (LMs) has become the de facto
+standard in many NLP tasks. Nevertheless, fine-tuned LMs are still prone to
+robustness issues, such as adversarial robustness and model calibration.
+Several perspectives of robustness for LMs have been studied independently, but
+a unified consideration across multiple perspectives has been lacking. In this
+paper, we propose Robustifying LMs via Adversarial perturbation with Selective
+Training (RoAST), a simple yet effective fine-tuning technique to enhance the
+multi-perspective robustness of LMs in a unified way. RoAST effectively
+incorporates two important sources of model robustness: robustness to perturbed
+inputs and the generalizable knowledge in pre-trained LMs. To be specific,
+RoAST introduces adversarial perturbation during fine-tuning while the model
+parameters are selectively updated according to their relative importance, to
+minimize unnecessary deviation. Under a unified evaluation of fine-tuned LMs
+that incorporates four representative perspectives of model robustness, we
+demonstrate the effectiveness of RoAST compared to state-of-the-art fine-tuning
+methods on six different types of LMs, which indicates its usefulness in
+practice.
+
+
+ comment: 33 pages, accepted at EMNLP 2023 Findings +
+
+
+
+
+ + ☆ Modeling Boundedly Rational Agents with Latent Inference Budgets + + +
+ We study the problem of modeling a population of agents pursuing unknown +goals subject to unknown computational constraints. In standard models of +bounded rationality, sub-optimal decision-making is simulated by adding +homoscedastic noise to optimal decisions rather than explicitly simulating +constrained inference. In this work, we introduce a latent inference budget +model (L-IBM) that models agents' computational constraints explicitly, via a +latent variable (inferred jointly with a model of agents' goals) that controls +the runtime of an iterative inference algorithm. L-IBMs make it possible to +learn agent models using data from diverse populations of suboptimal actors. In +three modeling tasks -- inferring navigation goals from routes, inferring +communicative intents from human utterances, and predicting next moves in human +chess games -- we show that L-IBMs match or outperform Boltzmann models of +decision-making under uncertainty. Inferred inference budgets are themselves +meaningful, efficient to compute, and correlated with measures of player skill, +partner skill and task difficulty. + +
+
+
+
+
+ + ☆ The sample complexity of multi-distribution learning + + +
+ Multi-distribution learning generalizes the classic PAC learning to handle +data coming from multiple distributions. Given a set of $k$ data distributions +and a hypothesis class of VC dimension $d$, the goal is to learn a hypothesis +that minimizes the maximum population loss over $k$ distributions, up to +$\epsilon$ additive error. In this paper, we settle the sample complexity of +multi-distribution learning by giving an algorithm of sample complexity +$\widetilde{O}((d+k)\epsilon^{-2}) \cdot (k/\epsilon)^{o(1)}$. This matches the +lower bound up to sub-polynomial factor and resolves the COLT 2023 open problem +of Awasthi, Haghtalab and Zhao [AHZ23]. + +
+
+
+
+
+ + ☆ k* Distribution: Evaluating the Latent Space of Deep Neural Networks + using Local Neighborhood Analysis + + +
+ Most examinations of neural networks' learned latent spaces typically employ +dimensionality reduction techniques such as t-SNE or UMAP. While these methods +effectively capture the overall sample distribution in the entire learned +latent space, they tend to distort the structure of sample distributions within +specific classes in the subset of the latent space. This distortion complicates +the task of easily distinguishing classes identifiable by neural networks. In +response to this challenge, we introduce the k* Distribution methodology. This +approach focuses on capturing the characteristics and structure of sample +distributions for individual classes within the subset of the learned latent +space using local neighborhood analysis. The key concept is to facilitate easy +comparison of different k* distributions, enabling analysis of how various +classes are processed by the same neural network. This provides a more profound +understanding of existing contemporary visualizations. Our study reveals three +distinct distributions of samples within the learned latent space subset: a) +Fractured, b) Overlapped, and c) Clustered. We note and demonstrate that the +distribution of samples within the network's learned latent space significantly +varies depending on the class. Furthermore, we illustrate that our analysis can +be applied to explore the latent space of diverse neural network architectures, +various layers within neural networks, transformations applied to input +samples, and the distribution of training and testing data for neural networks. +We anticipate that our approach will facilitate more targeted investigations +into neural networks by collectively examining the distribution of different +samples within the learned latent space. + +
+
+
+
+
+ + ☆ A Study on the Calibration of In-context Learning NeurIPS 2023 + + +
+ Modern auto-regressive language models are trained to minimize log loss on +broad data by predicting the next token so they are expected to get calibrated +answers when framing a problem as a next-token prediction task. We study this +for in-context learning (ICL), a widely used way to adapt frozen large language +models (LLMs) via crafting prompts, and investigate the trade-offs between +performance and calibration on a wide range of natural language understanding +and reasoning tasks. We conduct extensive experiments to show that such +trade-offs may get worse as we increase model size, incorporate more ICL +examples, and fine-tune models using instruction, dialog, or reinforcement +learning from human feedback (RLHF) on carefully curated datasets. Furthermore, +we find that common recalibration techniques that are widely effective such as +temperature scaling provide limited gains in calibration errors, suggesting +that new methods may be required for settings where models are expected to be +reliable. + +
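+ For context, the temperature scaling mentioned above is typically fit as a
+single scalar on held-out logits; a standard sketch (not the paper's code) is:
+```python
+import torch
+import torch.nn.functional as F
+
+def fit_temperature(logits, labels, steps=200, lr=0.01):
+    # Fit one scalar temperature T by minimizing the NLL on a held-out set;
+    # predictions are then made with softmax(logits / T).
+    log_t = torch.zeros(1, requires_grad=True)
+    opt = torch.optim.Adam([log_t], lr=lr)
+    for _ in range(steps):
+        opt.zero_grad()
+        loss = F.cross_entropy(logits / log_t.exp(), labels)
+        loss.backward()
+        opt.step()
+    return log_t.exp().item()
+
+T = fit_temperature(torch.randn(100, 4), torch.randint(0, 4, (100,)))
+```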
+
+ comment: Spotlight Talk at NeurIPS 2023 Workshop on Failure Modes in the Age + of Foundation Models +
+
+
+
+
+ + ☆ LiDAR: Sensing Linear Probing Performance in Joint Embedding SSL + Architectures + + +
+ Joint embedding (JE) architectures have emerged as a promising avenue for +acquiring transferable data representations. A key obstacle to using JE +methods, however, is the inherent challenge of evaluating learned +representations without access to a downstream task, and an annotated dataset. +Without efficient and reliable evaluation, it is difficult to iterate on +architectural and training choices for JE methods. In this paper, we introduce +LiDAR (Linear Discriminant Analysis Rank), a metric designed to measure the +quality of representations within JE architectures. Our metric addresses +several shortcomings of recent approaches based on feature covariance rank by +discriminating between informative and uninformative features. In essence, +LiDAR quantifies the rank of the Linear Discriminant Analysis (LDA) matrix +associated with the surrogate SSL task -- a measure that intuitively captures +the information content as it pertains to solving the SSL task. We empirically +demonstrate that LiDAR significantly surpasses naive rank based approaches in +its predictive power of optimal hyperparameters. Our proposed criterion +presents a more robust and intuitive means of assessing the quality of +representations within JE architectures, which we hope facilitates broader +adoption of these powerful techniques in various domains. + +
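+ A simplified illustration of the quantity involved (not the exact LiDAR
+estimator): build within- and between-class scatter matrices from embeddings
+and surrogate-task labels, then take the effective rank of the resulting LDA
+matrix; shrinkage and the precise rank measure used in the paper are omitted.
+```python
+import numpy as np
+
+def lda_effective_rank(Z, y, eps=1e-6):
+    # Effective rank (exponentiated spectral entropy) of S_w^{-1} S_b
+    # built from embeddings Z (n x d) and labels y.
+    d = Z.shape[1]
+    mu = Z.mean(axis=0)
+    Sw, Sb = np.zeros((d, d)), np.zeros((d, d))
+    for c in np.unique(y):
+        Zc = Z[y == c]
+        mc = Zc.mean(axis=0)
+        Sw += (Zc - mc).T @ (Zc - mc)
+        diff = (mc - mu)[:, None]
+        Sb += len(Zc) * (diff @ diff.T)
+    M = np.linalg.solve(Sw + eps * np.eye(d), Sb)
+    lam = np.abs(np.linalg.eigvals(M))
+    p = lam / lam.sum()
+    return float(np.exp(-(p * np.log(p + 1e-12)).sum()))
+
+score = lda_effective_rank(np.random.randn(300, 16), np.random.randint(0, 5, 300))
+```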
+
+ comment: Technical report +
+
+
+
+
+ + ☆ Series2Vec: Similarity-based Self-supervised Representation Learning for + Time Series Classification + + +
+ We argue that time series analysis is fundamentally different in nature to +either vision or natural language processing with respect to the forms of +meaningful self-supervised learning tasks that can be defined. Motivated by +this insight, we introduce a novel approach called \textit{Series2Vec} for +self-supervised representation learning. Unlike other self-supervised methods +in time series, which carry the risk of positive sample variants being less +similar to the anchor sample than series in the negative set, Series2Vec is +trained to predict the similarity between two series in both temporal and +spectral domains through a self-supervised task. Series2Vec relies primarily on +the consistency of the unsupervised similarity step, rather than the intrinsic +quality of the similarity measurement, without the need for hand-crafted data +augmentation. To further enforce the network to learn similar representations +for similar time series, we propose a novel approach that applies +order-invariant attention to each representation within the batch during +training. Our evaluation of Series2Vec on nine large real-world datasets, along +with the UCR/UEA archive, shows enhanced performance compared to current +state-of-the-art self-supervised techniques for time series. Additionally, our +extensive experiments show that Series2Vec performs comparably with fully +supervised training and offers high efficiency in datasets with limited-labeled +data. Finally, we show that the fusion of Series2Vec with other representation +learning models leads to enhanced performance for time series classification. +Code and models are open-source at +\url{https://github.com/Navidfoumani/Series2Vec.} + +
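+ As a rough illustration of the two target domains (assumptions: plain
+L2-based similarities and FFT magnitudes; the paper's similarity measures may
+differ), the self-supervised targets could look like:
+```python
+import torch
+
+def similarity_targets(x1, x2):
+    # Similarity of two univariate series in the temporal domain and in the
+    # spectral domain (FFT magnitudes); an encoder is trained to predict
+    # these targets from the learned representations.
+    t_sim = -torch.norm(x1 - x2)
+    s_sim = -torch.norm(torch.fft.rfft(x1).abs() - torch.fft.rfft(x2).abs())
+    return t_sim, s_sim
+
+t_sim, s_sim = similarity_targets(torch.randn(128), torch.randn(128))
+```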
+
+
+
+
+ + ☆ MICRO: Model-Based Offline Reinforcement Learning with a Conservative + Bellman Operator + + +
+ Offline reinforcement learning (RL) faces a significant challenge of
+distribution shift. Model-free offline RL penalizes the Q value for
+out-of-distribution (OOD) data or constrains the policy to stay close to the
+behavior policy to tackle this problem, but this inhibits the exploration of
+the OOD region. Model-based offline RL, which uses the trained environment
+model to generate more OOD data and performs conservative policy optimization
+within that model, has become an effective method for this problem. However,
+the current model-based algorithms rarely consider agent robustness when
+incorporating conservatism into the policy. Therefore, a new model-based
+offline algorithm with a conservative Bellman operator (MICRO) is proposed.
+This method trades off performance and robustness by introducing the robust
+Bellman operator into the algorithm. Compared with previous model-based
+algorithms with robust adversarial models, MICRO can significantly reduce the
+computation cost by only choosing the minimal Q value in the state uncertainty
+set. Extensive experiments demonstrate that MICRO outperforms prior RL
+algorithms on offline RL benchmarks and is considerably robust to adversarial
+perturbations.
+
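+ The conservative target can be caricatured as follows; this is a hypothetical
+sketch in which random perturbations within a small ball stand in for the state
+uncertainty set and `q_net` is an assumed state-value network, not the paper's
+actor-critic setup.
+```python
+import torch
+
+def conservative_target(q_net, reward, next_state, gamma=0.99,
+                        radius=0.05, n_samples=8):
+    # Evaluate the value at several perturbed next states and keep the minimum,
+    # mimicking a min over a state uncertainty set.
+    noise = radius * torch.randn(n_samples, *next_state.shape)
+    candidates = next_state.unsqueeze(0) + noise
+    return reward + gamma * q_net(candidates).min()
+
+q_net = torch.nn.Sequential(torch.nn.Linear(4, 32), torch.nn.ReLU(),
+                            torch.nn.Linear(32, 1))
+target = conservative_target(q_net, reward=torch.tensor(1.0),
+                             next_state=torch.randn(4))
+```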
+
+ comment: 13 pages, 5 figures +
+
+
+
+
+ + ☆ Rapid detection of rare events from in situ X-ray diffraction data using + machine learning + + +
+ High-energy X-ray diffraction methods can non-destructively map the 3D +microstructure and associated attributes of metallic polycrystalline +engineering materials in their bulk form. These methods are often combined with +external stimuli such as thermo-mechanical loading to take snapshots over time +of the evolving microstructure and attributes. However, the extreme data +volumes and the high costs of traditional data acquisition and reduction +approaches pose a barrier to quickly extracting actionable insights and +improving the temporal resolution of these snapshots. Here we present a fully +automated technique capable of rapidly detecting the onset of plasticity in +high-energy X-ray microscopy data. Our technique is computationally faster by +at least 50 times than the traditional approaches and works for data sets that +are up to 9 times sparser than a full data set. This new technique leverages +self-supervised image representation learning and clustering to transform +massive data into compact, semantic-rich representations of visually salient +characteristics (e.g., peak shapes). These characteristics can be a rapid +indicator of anomalous events such as changes in diffraction peak shapes. We +anticipate that this technique will provide just-in-time actionable information +to drive smarter experiments that effectively deploy multi-modal X-ray +diffraction methods that span many decades of length scales. + +
+
+
+
+
+ + ☆ Node-aware Bi-smoothing: Certified Robustness against Graph Injection + Attacks + + +
+ Deep Graph Learning (DGL) has emerged as a crucial technique across various +domains. However, recent studies have exposed vulnerabilities in DGL models, +such as susceptibility to evasion and poisoning attacks. While empirical and +provable robustness techniques have been developed to defend against graph +modification attacks (GMAs), the problem of certified robustness against graph +injection attacks (GIAs) remains largely unexplored. To bridge this gap, we +introduce the node-aware bi-smoothing framework, which is the first certifiably +robust approach for general node classification tasks against GIAs. Notably, +the proposed node-aware bi-smoothing scheme is model-agnostic and is applicable +for both evasion and poisoning attacks. Through rigorous theoretical analysis, +we establish the certifiable conditions of our smoothing scheme. We also +explore the practical implications of our node-aware bi-smoothing schemes in +two contexts: as an empirical defense approach against real-world GIAs and in +the context of recommendation systems. Furthermore, we extend two +state-of-the-art certified robustness frameworks to address node injection +attacks and compare our approach against them. Extensive evaluations +demonstrate the effectiveness of our proposed certificates. + +
+
+
+
+
+ + ☆ PerSival: Neural-network-based visualisation for pervasive + continuum-mechanical simulations in musculoskeletal biomechanics + + +
+ This paper presents a novel neural network architecture for the purpose of +pervasive visualisation of a 3D human upper limb musculoskeletal system model. +Bringing simulation capabilities to resource-poor systems like mobile devices +is of growing interest across many research fields, to widen applicability of +methods and results. Until recently, this goal was thought to be out of reach +for realistic continuum-mechanical simulations of musculoskeletal systems, due +to prohibitive computational cost. Within this work we use a sparse grid +surrogate to capture the surface deformation of the m.~biceps brachii in order +to train a deep learning model, used for real-time visualisation of the same +muscle. Both these surrogate models take 5 muscle activation levels as input +and output Cartesian coordinate vectors for each mesh node on the muscle's +surface. Thus, the neural network architecture features a significantly lower +input than output dimension. 5 muscle activation levels were sufficient to +achieve an average error of 0.97 +/- 0.16 mm, or 0.57 +/- 0.10 % for the 2809 +mesh node positions of the biceps. The model achieved evaluation times of 9.88 +ms per predicted deformation state on CPU only and 3.48 ms with GPU-support, +leading to theoretical frame rates of 101 fps and 287 fps respectively. Deep +learning surrogates thus provide a way to make continuum-mechanical simulations +accessible for visual real-time applications. + +
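+ The surrogate's input/output relationship can be illustrated with a toy
+stand-in (the layer sizes are assumptions; only the 5-activation input and the
+2809-node coordinate output follow the abstract):
+```python
+import torch
+
+n_nodes = 2809                                   # surface mesh nodes of the biceps
+surrogate = torch.nn.Sequential(                 # illustrative layer sizes
+    torch.nn.Linear(5, 256), torch.nn.ReLU(),
+    torch.nn.Linear(256, 256), torch.nn.ReLU(),
+    torch.nn.Linear(256, n_nodes * 3),
+)
+activations = torch.rand(1, 5)                   # 5 muscle activation levels
+node_xyz = surrogate(activations).view(1, n_nodes, 3)
+```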
+
+ comment: 10 pages, 4 figures, 5 tables, to be submitted to Medical Image + Analysis +
+
+
+
+
+ + ♻ ☆ Evaluation of Active Feature Acquisition Methods for Time-varying + Feature Settings + + +
+ Machine learning methods often assume input features are available at no +cost. However, in domains like healthcare, where acquiring features could be +expensive or harmful, it is necessary to balance a feature's acquisition cost +against its predictive value. The task of training an AI agent to decide which +features to acquire is called active feature acquisition (AFA). By deploying an +AFA agent, we effectively alter the acquisition strategy and trigger a +distribution shift. To safely deploy AFA agents under this distribution shift, +we present the problem of active feature acquisition performance evaluation +(AFAPE). We examine AFAPE under i) a no direct effect (NDE) assumption, stating +that acquisitions don't affect the underlying feature values; and ii) a no +unobserved confounding (NUC) assumption, stating that retrospective feature +acquisition decisions were only based on observed features. We show that one +can apply offline reinforcement learning under the NUC assumption and missing +data methods under the NDE assumption. When NUC and NDE hold, we propose a +novel semi-offline reinforcement learning framework, which requires a weaker +positivity assumption and yields more data-efficient estimators. We introduce +three novel estimators: a direct method (DM), an inverse probability weighting +(IPW), and a double reinforcement learning (DRL) estimator. + +
+
+ comment: 43 pages, 3 tables, 8 Figures +
+
+
+
+
+ + ♻ ☆ Evaluation of Active Feature Acquisition Methods for Static Feature + Settings + + +
+ Active feature acquisition (AFA) agents, crucial in domains like healthcare +where acquiring features is often costly or harmful, determine the optimal set +of features for a subsequent classification task. As deploying an AFA agent +introduces a shift in missingness distribution, it's vital to assess its +expected performance at deployment using retrospective data. In a companion +paper, we introduce a semi-offline reinforcement learning (RL) framework for +active feature acquisition performance evaluation (AFAPE) where features are +assumed to be time-dependent. Here, we study and extend the AFAPE problem to +cover static feature settings, where features are time-invariant, and hence +provide more flexibility to the AFA agents in deciding the order of the +acquisitions. In this static feature setting, we derive and adapt new inverse +probability weighting (IPW), direct method (DM), and double reinforcement +learning (DRL) estimators within the semi-offline RL framework. These +estimators can be applied when the missingness in the retrospective dataset +follows a missing-at-random (MAR) pattern. They also can be applied to +missing-not-at-random (MNAR) patterns in conjunction with appropriate existing +missing data techniques. We illustrate the improved data efficiency offered by +the semi-offline RL estimators in synthetic and real-world data experiments +under synthetic MAR and MNAR missingness. + +
+
+ comment: 38 pages, 7 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Can Large Language Models Transform Computational Social Science? + + +
+ Large Language Models (LLMs) are capable of successfully performing many
+language processing tasks zero-shot (without training data). If zero-shot LLMs
+can also reliably classify and explain social phenomena like persuasiveness and
+political ideology, then LLMs could augment the Computational Social Science
+(CSS) pipeline in important ways. This work provides a road map for using LLMs
+as CSS tools. Towards this end, we contribute a set of prompting best practices
+and an extensive evaluation pipeline to measure the zero-shot performance of 13
+language models on 25 representative English CSS benchmarks. On taxonomic
+labeling tasks (classification), LLMs fail to outperform the best fine-tuned
+models but still achieve fair levels of agreement with humans. On free-form
+coding tasks (generation), LLMs produce explanations that often exceed the
+quality of crowdworkers' gold references. We conclude that the performance of
+today's LLMs can augment the CSS research pipeline in two ways: (1) serving as
+zero-shot data annotators on human annotation teams, and (2) bootstrapping
+challenging creative generation tasks (e.g., explaining the underlying
+attributes of a text). In summary, LLMs are poised to meaningfully participate
+in social science analysis in partnership with humans.
+
+
+ comment: To appear in "Computational Linguistics" (CL) +
+
+
+
+
+ + ♻ ☆ Preserving privacy in domain transfer of medical AI models comes at no + performance costs: The integral role of differential privacy + + +
+ Developing robust and effective artificial intelligence (AI) models in +medicine requires access to large amounts of patient data. The use of AI models +solely trained on large multi-institutional datasets can help with this, yet +the imperative to ensure data privacy remains, particularly as membership +inference risks breaching patient confidentiality. As a proposed remedy, we +advocate for the integration of differential privacy (DP). We specifically +investigate the performance of models trained with DP as compared to models +trained without DP on data from institutions that the model had not seen during +its training (i.e., external validation) - the situation that is reflective of +the clinical use of AI models. By leveraging more than 590,000 chest +radiographs from five institutions, we evaluated the efficacy of DP-enhanced +domain transfer (DP-DT) in diagnosing cardiomegaly, pleural effusion, +pneumonia, atelectasis, and in identifying healthy subjects. We juxtaposed +DP-DT with non-DP-DT and examined diagnostic accuracy and demographic fairness +using the area under the receiver operating characteristic curve (AUC) as the +main metric, as well as accuracy, sensitivity, and specificity. Our results +show that DP-DT, even with exceptionally high privacy levels (epsilon around +1), performs comparably to non-DP-DT (P>0.119 across all domains). Furthermore, +DP-DT led to marginal AUC differences - less than 1% - for nearly all +subgroups, relative to non-DP-DT. Despite consistent evidence suggesting that +DP models induce significant performance degradation for on-domain +applications, we show that off-domain performance is almost not affected. +Therefore, we ardently advocate for the adoption of DP in training diagnostic +medical AI models, given its minimal impact on performance. + +
+
+ comment: Published in Radiology: Artificial Intelligence. RSNA +
+
+
+
+
+ + ♻ ☆ LAVA: Data Valuation without Pre-Specified Learning Algorithms ICLR 2023 + + +
+ Traditionally, data valuation (DV) is posed as a problem of equitably +splitting the validation performance of a learning algorithm among the training +data. As a result, the calculated data values depend on many design choices of +the underlying learning algorithm. However, this dependence is undesirable for +many DV use cases, such as setting priorities over different data sources in a +data acquisition process and informing pricing mechanisms in a data +marketplace. In these scenarios, data needs to be valued before the actual +analysis and the choice of the learning algorithm is still undetermined then. +Another side-effect of the dependence is that to assess the value of individual +points, one needs to re-run the learning algorithm with and without a point, +which incurs a large computation burden. This work leapfrogs over the current +limits of data valuation methods by introducing a new framework that can value +training data in a way that is oblivious to the downstream learning algorithm. +Our main results are as follows. (1) We develop a proxy for the validation +performance associated with a training set based on a non-conventional +class-wise Wasserstein distance between training and validation sets. We show +that the distance characterizes the upper bound of the validation performance +for any given model under certain Lipschitz conditions. (2) We develop a novel +method to value individual data based on the sensitivity analysis of the +class-wise Wasserstein distance. Importantly, these values can be directly +obtained for free from the output of off-the-shelf optimization solvers when +computing the distance. (3) We evaluate our new data valuation framework over +various use cases related to detecting low-quality data and show that, +surprisingly, the learning-agnostic feature of our framework enables a +significant improvement over SOTA performance while being orders of magnitude +faster. + +
+
+ comment: ICLR 2023 Spotlight +
+
+
+
+
+ + ♻ ☆ A Stability Analysis of Fine-Tuning a Pre-Trained Model + + +
+ Fine-tuning a pre-trained model (such as BERT, ALBERT, RoBERTa, T5, GPT, +etc.) has proven to be one of the most promising paradigms in recent NLP +research. However, numerous recent works indicate that fine-tuning suffers from +the instability problem, i.e., tuning the same model under the same setting +results in significantly different performance. Many recent works have proposed +different methods to solve this problem, but there is no theoretical +understanding of why and how these methods work. In this paper, we propose a +novel theoretical stability analysis of fine-tuning that focuses on two +commonly used settings, namely, full fine-tuning and head tuning. We define the +stability under each setting and prove the corresponding stability bounds. The +theoretical bounds explain why and how several existing methods can stabilize +the fine-tuning procedure. In addition to being able to explain most of the +observed empirical discoveries, our proposed theoretical analysis framework can +also help in the design of effective and provable methods. Based on our theory, +we propose three novel strategies to stabilize the fine-tuning procedure, +namely, Maximal Margin Regularizer (MMR), Multi-Head Loss (MHLoss), and Self +Unsupervised Re-Training (SURT). We extensively evaluate our proposed +approaches on 11 widely used real-world benchmark datasets, as well as hundreds +of synthetic classification datasets. The experiment results show that our +proposed methods significantly stabilize the fine-tuning procedure and also +corroborate our theoretical analysis. + +
+
+
+
+
+ + ♻ ☆ If your data distribution shifts, use self-learning + + +
+ We demonstrate that self-learning techniques like entropy minimization and +pseudo-labeling are simple and effective at improving performance of a deployed +computer vision model under systematic domain shifts. We conduct a wide range +of large-scale experiments and show consistent improvements irrespective of the +model architecture, the pre-training technique or the type of distribution +shift. At the same time, self-learning is simple to use in practice because it +does not require knowledge or access to the original training data or scheme, +is robust to hyperparameter choices, is straight-forward to implement and +requires only a few adaptation epochs. This makes self-learning techniques +highly attractive for any practitioner who applies machine learning algorithms +in the real world. We present state-of-the-art adaptation results on CIFAR10-C +(8.5% error), ImageNet-C (22.0% mCE), ImageNet-R (17.4% error) and ImageNet-A +(14.8% error), theoretically study the dynamics of self-supervised adaptation +methods and propose a new classification dataset (ImageNet-D) which is +challenging even with adaptation. + +
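+
+ As a rough illustration of the self-learning adaptation step described above
+(entropy minimization with an optional pseudo-labeling confidence filter), the
+sketch below shows one update on an unlabeled target-domain batch. The model,
+optimizer and confidence threshold are placeholders and not the paper's exact
+recipe or hyperparameters.
+
+   import torch.nn.functional as F
+
+   def self_learning_step(model, optimizer, x_unlabeled, conf_threshold=0.9):
+       # Entropy minimization on an unlabeled target-domain batch.
+       logits = model(x_unlabeled)
+       log_probs = F.log_softmax(logits, dim=1)
+       probs = log_probs.exp()
+       entropy = -(probs * log_probs).sum(dim=1)
+       # Pseudo-labeling style filter: adapt only on confident predictions.
+       confident = probs.max(dim=1).values > conf_threshold
+       loss = entropy[confident].mean() if confident.any() else entropy.mean()
+       optimizer.zero_grad()
+       loss.backward()
+       optimizer.step()
+       return loss.item()
+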
+
+ comment: Web: https://domainadaptation.org/selflearning +
+
+
+
+
+ + ♻ ☆ Distributed Bayesian Estimation in Sensor Networks: Consensus on + Marginal Densities + + +
+ In this paper, we aim to design and analyze distributed Bayesian estimation +algorithms for sensor networks. The challenges we address are to (i) derive a +distributed provably-correct algorithm in the functional space of probability +distributions over continuous variables, and (ii) leverage these results to +obtain new distributed estimators restricted to subsets of variables observed +by individual agents. This relates to applications such as cooperative +localization and federated learning, where the data collected at any agent +depends on a subset of all variables of interest. We present Bayesian density +estimation algorithms using data from non-linear likelihoods at agents in +centralized, distributed, and marginal distributed settings. After setting up a +distributed estimation objective, we prove almost-sure convergence to the +optimal set of pdfs at each agent. Then, we prove the same for a storage-aware +algorithm estimating densities only over relevant variables at each agent. +Finally, we present a Gaussian version of these algorithms and implement it in +a mapping problem using variational inference to handle non-linear likelihood +models associated with LiDAR sensing. + +
+
+
+
+
+ + ♻ ☆ Training Deep 3D Convolutional Neural Networks to Extract BSM Physics + Parameters Directly from HEP Data: a Proof-of-Concept Study Using Monte Carlo + Simulations + + +
+ We report on a novel application of computer vision techniques to extract +beyond the Standard Model (BSM) parameters directly from high energy physics +(HEP) flavor data. We develop a method of transforming angular and kinematic +distributions into "quasi-images" that can be used to train a convolutional +neural network to perform regression tasks, similar to fitting. This contrasts +with the usual classification functions performed using ML/AI in HEP. As a +proof-of-concept, we train a 34-layer Residual Neural Network to regress on +these images and determine the Wilson Coefficient $C_{9}$ in MC (Monte Carlo) +simulations of $B \rightarrow K^{*}\mu^{+}\mu^{-}$ decays. The technique +described here can be generalized and may find applicability across various HEP +experiments and elsewhere. + +
+
+
+
+
+ + ♻ ☆ Point Cloud-based Proactive Link Quality Prediction for Millimeter-wave + Communications + + +
+ This study demonstrates the feasibility of point cloud-based proactive link +quality prediction for millimeter-wave (mmWave) communications. Previous +studies have proposed machine learning-based methods to predict received signal +strength for future time periods using time series of depth images to mitigate +the line-of-sight (LOS) path blockage by pedestrians in mmWave communication. +However, these image-based methods have limited applicability due to privacy +concerns as camera images may contain sensitive information. This study +proposes a point cloud-based method for mmWave link quality prediction and +demonstrates its feasibility through experiments. Point clouds represent +three-dimensional (3D) spaces as a set of points and are sparser and less +likely to contain sensitive information than camera images. Additionally, point +clouds provide 3D position and motion information, which is necessary for +understanding the radio propagation environment involving pedestrians. This +study designs the mmWave link quality prediction method and conducts realistic +indoor experiments, where the link quality fluctuates significantly due to +human blockage, using commercially available IEEE 802.11ad-based 60 GHz +wireless LAN devices and Kinect v2 RGB-D camera and Velodyne VLP-16 light +detection and ranging (LiDAR) for point cloud acquisition. The experimental +results showed that our proposed method can predict future large attenuation of +mmWave received signal strength and throughput induced by the LOS path blockage +by pedestrians with comparable or superior accuracy to image-based prediction +methods. Hence, our point cloud-based method can serve as a viable alternative +to image-based methods. + +
+
+
+
+
+ + ♻ ☆ DeepGraphDMD: Interpretable Spatio-Temporal Decomposition of Non-linear + Functional Brain Network Dynamics MICCAI 2023 + + +
+ Functional brain dynamics is supported by parallel and overlapping functional +network modes that are associated with specific neural circuits. Decomposing +these network modes from fMRI data and finding their temporal characteristics +is challenging due to their time-varying nature and the non-linearity of the +functional dynamics. Dynamic Mode Decomposition (DMD) algorithms have been +quite popular for solving this decomposition problem in recent years. In this +work, we apply GraphDMD -- an extension of the DMD for network data -- to +extract the dynamic network modes and their temporal characteristics from the +fMRI time series in an interpretable manner. GraphDMD, however, regards the +underlying system as a linear dynamical system that is sub-optimal for +extracting the network modes from non-linear functional data. In this work, we +develop a generalized version of the GraphDMD algorithm -- DeepGraphDMD -- +applicable to arbitrary non-linear graph dynamical systems. DeepGraphDMD is an +autoencoder-based deep learning model that learns Koopman eigenfunctions for +graph data and embeds the non-linear graph dynamics into a latent linear space. +We show the effectiveness of our method in both simulated data and the HCP +resting-state fMRI data. In the HCP data, DeepGraphDMD provides novel insights +into cognitive brain functions by discovering two major network modes related +to fluid and crystallized intelligence. + +
+
+ comment: Published in MICCAI 2023 +
+
+
+
+
+ + ♻ ☆ Active Learning of Discrete-Time Dynamics for Uncertainty-Aware Model + Predictive Control + + +
+ Model-based control requires an accurate model of the system dynamics for +precisely and safely controlling the robot in complex and dynamic environments. +Moreover, in the presence of variations in the operating conditions, the model +should be continuously refined to compensate for dynamics changes. In this +paper, we present a self-supervised learning approach that actively models the +dynamics of nonlinear robotic systems. We combine offline learning from past +experience and online learning from current robot interaction with the unknown +environment. These two ingredients enable a highly sample-efficient and +adaptive learning process, capable of accurately inferring model dynamics in +real-time even in operating regimes that greatly differ from the training +distribution. Moreover, we design an uncertainty-aware model predictive +controller that is heuristically conditioned to the aleatoric (data) +uncertainty of the learned dynamics. This controller actively chooses the +optimal control actions that (i) optimize the control performance and (ii) +improve the efficiency of online learning sample collection. We demonstrate the +effectiveness of our method through a series of challenging real-world +experiments using a quadrotor system. Our approach showcases high resilience +and generalization capabilities by consistently adapting to unseen flight +conditions, while it significantly outperforms classical and adaptive control +baselines. + +
+
+
+
+
+ + ♻ ☆ Sem@$K$: Is my knowledge graph embedding model semantic-aware? + + +
+ Using knowledge graph embedding models (KGEMs) is a popular approach for +predicting links in knowledge graphs (KGs). Traditionally, the performance of +KGEMs for link prediction is assessed using rank-based metrics, which evaluate +their ability to give high scores to ground-truth entities. However, the +literature claims that the KGEM evaluation procedure would benefit from adding +supplementary dimensions to assess. That is why, in this paper, we extend our +previously introduced metric Sem@K that measures the capability of models to +predict valid entities w.r.t. domain and range constraints. In particular, we +consider a broad range of KGs and take their respective characteristics into +account to propose different versions of Sem@K. We also perform an extensive +study to qualify the abilities of KGEMs as measured by our metric. Our +experiments show that Sem@K provides a new perspective on KGEM quality. Its +joint analysis with rank-based metrics offers different conclusions on the +predictive power of models. Regarding Sem@K, some KGEMs are inherently better +than others, but this semantic superiority is not indicative of their +performance w.r.t. rank-based metrics. In this work, we generalize conclusions +about the relative performance of KGEMs w.r.t. rank-based and semantic-oriented +metrics at the level of families of models. The joint analysis of the +aforementioned metrics gives more insight into the peculiarities of each model. +This work paves the way for a more comprehensive evaluation of KGEM adequacy +for specific downstream tasks. + +
+
+
+
+
+ + ♻ ☆ Loss-Optimal Classification Trees: A Generalized Framework and the + Logistic Case + + +
+ The Classification Tree (CT) is one of the most common models in
+interpretable machine learning. Although such models are usually built with
+greedy strategies, in recent years, thanks to remarkable advances in
+Mixed-Integer Programming (MIP) solvers, several exact formulations of the
+learning problem have been developed. In this paper, we argue that some of the
+most relevant ones among these training models can be encapsulated within a
+general framework, whose instances are shaped by the specification of loss
+functions and regularizers. Next, we introduce a novel realization of this
+framework: specifically, we consider the logistic loss, handled in the MIP
+setting by a linear piece-wise approximation, and couple it with
+$\ell_1$-regularization terms. The resulting Optimal Logistic Tree model
+numerically proves to be able to induce trees with enhanced interpretability
+features and competitive generalization capabilities, compared to the
+state-of-the-art MIP-based approaches.
+
+
+
+
+
+ + ♻ ☆ A Machine Learning Approach to Two-Stage Adaptive Robust Optimization + + +
+ We propose an approach based on machine learning to solve two-stage linear +adaptive robust optimization (ARO) problems with binary here-and-now variables +and polyhedral uncertainty sets. We encode the optimal here-and-now decisions, +the worst-case scenarios associated with the optimal here-and-now decisions, +and the optimal wait-and-see decisions into what we denote as the strategy. We +solve multiple similar ARO instances in advance using the column and constraint +generation algorithm and extract the optimal strategies to generate a training +set. We train a machine learning model that predicts high-quality strategies +for the here-and-now decisions, the worst-case scenarios associated with the +optimal here-and-now decisions, and the wait-and-see decisions. We also +introduce an algorithm to reduce the number of different target classes the +machine learning algorithm needs to be trained on. We apply the proposed +approach to the facility location, the multi-item inventory control and the +unit commitment problems. Our approach solves ARO problems drastically faster +than the state-of-the-art algorithms with high accuracy. + +
+
+
+
+
+ + ♻ ☆ Retrieval-Based Reconstruction For Time-series Contrastive Learning + + +
+ The success of self-supervised contrastive learning hinges on identifying +positive data pairs that, when pushed together in embedding space, encode +useful information for subsequent downstream tasks. However, in time-series, +this is challenging because creating positive pairs via augmentations may break +the original semantic meaning. We hypothesize that if we can retrieve +information from one subsequence to successfully reconstruct another +subsequence, then they should form a positive pair. Harnessing this intuition, +we introduce our novel approach: REtrieval-BAsed Reconstruction (REBAR) +contrastive learning. First, we utilize a convolutional cross-attention +architecture to calculate the REBAR error between two different time-series. +Then, through validation experiments, we show that the REBAR error is a +predictor of mutual class membership, justifying its usage as a +positive/negative labeler. Finally, once integrated into a contrastive learning +framework, our REBAR method can learn an embedding that achieves +state-of-the-art performance on downstream tasks across various modalities. + +
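+
+ For intuition only, the sketch below reconstructs one subsequence from another
+by nearest-neighbour window retrieval and uses the resulting error to label
+pairs. The paper learns this reconstruction with a convolutional
+cross-attention network, so the simple retrieval rule here is only a stand-in
+for the idea, not the REBAR method itself.
+
+   import numpy as np
+
+   def rebar_style_error(query, key, window=5):
+       # query, key: 1-D numpy arrays (two subsequences of a time series).
+       # Reconstruct each window of `query` by its closest window in `key`;
+       # a small mean error suggests shared class membership, so the pair can
+       # serve as a positive in the contrastive loss.
+       q_wins = np.lib.stride_tricks.sliding_window_view(query, window)
+       k_wins = np.lib.stride_tricks.sliding_window_view(key, window)
+       dists = np.linalg.norm(q_wins[:, None, :] - k_wins[None, :, :], axis=-1)
+       return float(dists.min(axis=1).mean())
+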
+
+
+
+
+ + ♻ ☆ Recurrent neural networks and transfer learning for elasto-plasticity in + woven composites + + +
+ As a surrogate for computationally intensive meso-scale simulation of woven +composites, this article presents Recurrent Neural Network (RNN) models. +Leveraging the power of transfer learning, the initialization challenges and +sparse data issues inherent in cyclic shear strain loads are addressed in the +RNN models. A mean-field model generates a comprehensive data set representing +elasto-plastic behavior. In simulations, arbitrary six-dimensional strain +histories are used to predict stresses under random walking as the source task +and cyclic loading conditions as the target task. Incorporating sub-scale +properties enhances RNN versatility. In order to achieve accurate predictions, +the model uses a grid search method to tune network architecture and +hyper-parameter configurations. The results of this study demonstrate that +transfer learning can be used to effectively adapt the RNN to varying strain +conditions, which establishes its potential as a useful tool for modeling +path-dependent responses in woven composites. + +
+
+ comment: There are 25 pages and 13 EPS images. The paper includes links to + supporting materials +
+
+
+
+
+ + ♻ ☆ Reinforcement Learning for Combining Search Methods in the Calibration + of Economic ABMs AAAI + + +
+ Calibrating agent-based models (ABMs) in economics and finance typically +involves a derivative-free search in a very large parameter space. In this +work, we benchmark a number of search methods in the calibration of a +well-known macroeconomic ABM on real data, and further assess the performance +of "mixed strategies" made by combining different methods. We find that methods +based on random-forest surrogates are particularly efficient, and that +combining search methods generally increases performance since the biases of +any single method are mitigated. Moving from these observations, we propose a +reinforcement learning (RL) scheme to automatically select and combine search +methods on-the-fly during a calibration run. The RL agent keeps exploiting a +specific method only as long as this keeps performing well, but explores new +strategies when the specific method reaches a performance plateau. The +resulting RL search scheme outperforms any other method or method combination +tested, and does not rely on any prior information or trial and error +procedure. + +
+
+ comment: 9 pages and 5 figures, presented at the AAAI bridge program 'AI for + Financial Institutions' (https://aaai23.bankit.art/), at the ICLR bridge + program 'AI4ABM' (https://ai4abm.org/workshop_iclr2023/) and at ICAIF '23 + (https://ai-finance.org/). Proceedings of the Fourth ACM International + Conference on AI in Finance, (ICAIF 23), Association for Computing Machinery, + New York, NY, USA +
+
+
+
+
+ + ♻ ☆ TSGBench: Time Series Generation Benchmark VLDB 2024 + + +
+ Synthetic Time Series Generation (TSG) is crucial in a range of applications, +including data augmentation, anomaly detection, and privacy preservation. +Although significant strides have been made in this field, existing methods +exhibit three key limitations: (1) They often benchmark against similar model +types, constraining a holistic view of performance capabilities. (2) The use of +specialized synthetic and private datasets introduces biases and hampers +generalizability. (3) Ambiguous evaluation measures, often tied to custom +networks or downstream tasks, hinder consistent and fair comparison. + To overcome these limitations, we introduce \textsf{TSGBench}, the inaugural +Time Series Generation Benchmark, designed for a unified and comprehensive +assessment of TSG methods. It comprises three modules: (1) a curated collection +of publicly available, real-world datasets tailored for TSG, together with a +standardized preprocessing pipeline; (2) a comprehensive evaluation measures +suite including vanilla measures, new distance-based assessments, and +visualization tools; (3) a pioneering generalization test rooted in Domain +Adaptation (DA), compatible with all methods. We have conducted comprehensive +experiments using \textsf{TSGBench} across a spectrum of ten real-world +datasets from diverse domains, utilizing ten advanced TSG methods and twelve +evaluation measures. The results highlight the reliability and efficacy of +\textsf{TSGBench} in evaluating TSG methods. Crucially, \textsf{TSGBench} +delivers a statistical analysis of the performance rankings of these methods, +illuminating their varying performance across different datasets and measures +and offering nuanced insights into the effectiveness of each method. + +
+
+ comment: Accepted and to appear in VLDB 2024 +
+
+
+
+
+ + ♻ ☆ Low-complexity subspace-descent over symmetric positive definite + manifold + + +
+ This work puts forth low-complexity Riemannian subspace descent algorithms
+for the minimization of functions over the symmetric positive definite (SPD)
+manifold. Different from the existing Riemannian gradient descent variants, the
+proposed approach utilizes carefully chosen subspaces that allow the update to
+be written as a product of the Cholesky factor of the iterate and a sparse
+matrix. The resulting updates avoid the costly matrix operations like matrix
+exponentiation and dense matrix multiplication, which are generally required in
+almost all other Riemannian optimization algorithms on the SPD manifold. We further
+identify a broad class of functions, arising in diverse applications, such as
+kernel matrix learning, covariance estimation of Gaussian distributions,
+maximum likelihood parameter estimation of elliptically contoured
+distributions, and parameter estimation in Gaussian mixture model problems,
+over which the Riemannian gradients can be calculated efficiently. The proposed
+uni-directional and multi-directional Riemannian subspace descent variants
+incur per-iteration complexities of $\mathcal{O}(n)$ and $\mathcal{O}(n^2)$ respectively, as
+compared to the $\mathcal{O}(n^3)$ or higher complexity incurred by all existing
+Riemannian gradient descent variants. The superior runtime and low
+per-iteration complexity of the proposed algorithms are also demonstrated via
+numerical tests on large-scale covariance estimation and matrix square root
+problems.
+
+
+
+
+
+ + ♻ ☆ Trajectory-User Linking via Hierarchical Spatio-Temporal Attention + Networks KDD + + +
+ Trajectory-User Linking (TUL) is crucial for human mobility modeling by
+linking different trajectories to users with the exploration of complex mobility
+patterns. Existing works mainly rely on the recurrent neural framework to
+encode the temporal dependencies in trajectories, but fall short in capturing
+spatial-temporal global context for TUL prediction. To fill this gap, this work
+presents a new hierarchical spatio-temporal attention neural network, called
+AttnTUL, to jointly encode the local trajectory transitional patterns and
+global spatial dependencies for TUL. Specifically, our first model component is
+built over the graph neural architecture to preserve the local and global
+context and enhance the representation paradigm of geographical regions and
+user trajectories. Additionally, a hierarchically structured attention network
+is designed to simultaneously encode the intra-trajectory and inter-trajectory
+dependencies, with the integration of the temporal attention mechanism and
+global elastic attentional encoder. Extensive experiments demonstrate the
+superiority of our AttnTUL method as compared to state-of-the-art baselines on
+various trajectory datasets. The source code of our model is available at
+https://github.com/Onedean/AttnTUL.
+
+
+ comment: 22 pages, 8 figures, accepted by ACM Trans. Knowl. Discov. Data + Journal (TKDD) +
+
+
+
+
+ + ♻ ☆ A Parameterized Generative Adversarial Network Using Cyclic Projection + for Explainable Medical Image Classification + + +
+ Although current data augmentation methods are successful in alleviating data
+insufficiency, conventional augmentations are primarily intra-domain, while
+images generated by advanced generative adversarial networks (GANs) remain
+uncertain, particularly on small-scale datasets. In this paper, we propose a
+parameterized GAN (ParaGAN) that effectively controls the changes of synthetic
+samples among domains and highlights the attention regions for downstream
+classification. Specifically, ParaGAN incorporates projection distance
+parameters in cyclic projection and projects the source images to the decision
+boundary to obtain the class-difference maps. Our experiments show that ParaGAN
+can consistently outperform the existing augmentation methods with explainable
+classification on two small-scale medical datasets.
+
+
+ comment: 5 pages, 4 figures. This work has been submitted to the IEEE for + possible publication. Copyright may be transferred without notice, after + which this version may no longer be accessible +
+
+
+
+
+ + ♻ ☆ Efficient LLM Inference on CPUs NeurIPS'2023 + + +
+ Large language models (LLMs) have demonstrated remarkable performance and
+tremendous potential across a wide range of tasks. However, deploying these
+models has been challenging due to the astronomical number of model parameters,
+which demands large memory capacity and high memory bandwidth. In
+this paper, we propose an effective approach that can make the deployment of
+LLMs more efficient. We support an automatic INT4 weight-only quantization
+flow and design a special LLM runtime with highly-optimized kernels to
+accelerate the LLM inference on CPUs. We demonstrate the general applicability
+of our approach on popular LLMs including Llama2, Llama, GPT-NeoX, and showcase
+the extreme inference efficiency on CPUs. The code is publicly available at:
+https://github.com/intel/intel-extension-for-transformers.
+
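+
+ A minimal numpy illustration of symmetric, per-row INT4 weight-only
+quantization is given below, only to convey the storage/accuracy trade-off;
+the runtime in the linked repository uses fused, highly optimized kernels
+rather than code of this kind.
+
+   import numpy as np
+
+   def int4_quantize(w):
+       # Symmetric per-output-row scaling into the signed 4-bit range [-8, 7].
+       scales = np.abs(w).max(axis=1, keepdims=True) / 7.0 + 1e-12
+       q = np.clip(np.round(w / scales), -8, 7).astype(np.int8)
+       return q, scales
+
+   def int4_dequantize(q, scales):
+       return q.astype(np.float32) * scales
+
+   w = np.random.randn(4, 16).astype(np.float32)
+   q, s = int4_quantize(w)
+   print("max abs reconstruction error:",
+         float(np.abs(w - int4_dequantize(q, s)).max()))
+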
+
+ comment: NeurIPS'2023 on Efficient Natural Language and Speech Processing +
+
+
+
+
+ + ♻ ☆ Are Transformers with One Layer Self-Attention Using Low-Rank Weight + Matrices Universal Approximators? + + +
+ Existing analyses of the expressive capacity of Transformer models have +required excessively deep layers for data memorization, leading to a +discrepancy with the Transformers actually used in practice. This is primarily +due to the interpretation of the softmax function as an approximation of the +hardmax function. By clarifying the connection between the softmax function and +the Boltzmann operator, we prove that a single layer of self-attention with +low-rank weight matrices possesses the capability to perfectly capture the +context of an entire input sequence. As a consequence, we show that one-layer +and single-head Transformers have a memorization capacity for finite samples, +and that Transformers consisting of one self-attention layer with two +feed-forward neural networks are universal approximators for continuous +permutation equivariant functions on a compact domain. + +
+
+
+
+
+ + ♻ ☆ Non Intrusive Intelligibility Predictor for Hearing Impaired Individuals + using Self Supervised Speech Representations SP + + +
+ Self-supervised speech representations (SSSRs) have been successfully applied
+to a number of speech-processing tasks, e.g. as a feature extractor for speech
+quality (SQ) prediction, which is, in turn, relevant for assessing and
+training speech enhancement systems for users with normal or impaired hearing.
+However, why and how quality-related information is encoded
+well in such representations remains poorly understood. In this work,
+techniques for non-intrusive prediction of SQ ratings are extended to the
+prediction of intelligibility for hearing-impaired users. It is found that
+self-supervised representations are useful as input features to non-intrusive
+prediction models, achieving competitive performance to more complex systems. A
+detailed analysis of the performance depending on Clarity Prediction Challenge
+1 listeners and enhancement systems indicates that more data might be needed to
+allow generalisation to unknown systems and (hearing-impaired) individuals.
+
+
+ comment: Accepted @ ASRU 2023 SPARKS workshop +
+
+
+
+
+ + ♻ ☆ Bayesian Methods for Media Mix Modelling with shape and funnel effects + + +
+ In recent years, significant progress in generative AI has highlighted the
+important role of physics-inspired models that utilize advanced mathematical
+concepts based on fundamental physics principles to enhance artificial
+intelligence capabilities. Among these models, those based on diffusion
+equations have greatly improved image quality. This study aims to explore the
+potential uses of the Maxwell-Boltzmann equation, which forms the basis of the
+kinetic theory of gases, and the Michaelis-Menten model in Marketing Mix
+Modelling (MMM) applications. We propose incorporating these equations into
+Hierarchical Bayesian models to analyse consumer behaviour in the context of
+advertising. These equations excel at accurately describing the random
+dynamics in complex systems like social interactions and consumer-advertising
+interactions.
+
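+
+ For intuition, the Michaelis-Menten saturation form used for media response
+can be written in a few lines; the parameter values below are made up, and in
+the paper this curve sits inside a hierarchical Bayesian model rather than
+being fixed.
+
+   import numpy as np
+
+   def michaelis_menten_response(spend, v_max, k_m):
+       # Response saturates toward v_max; k_m is the spend at half-saturation.
+       return v_max * spend / (k_m + spend)
+
+   spend = np.array([0.0, 10_000.0, 20_000.0, 50_000.0, 100_000.0])
+   print(michaelis_menten_response(spend, v_max=500.0, k_m=20_000.0))
+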
+
+ comment: Rev. 3, December 2023 +
+
+
+
+
+ + ♻ ☆ Similarity of Neural Architectures using Adversarial Attack + Transferability + + +
+ In recent years, many deep neural architectures have been developed for image +classification. Whether they are similar or dissimilar and what factors +contribute to their (dis)similarities remains curious. To address this +question, we aim to design a quantitative and scalable similarity measure +between neural architectures. We propose Similarity by Attack Transferability +(SAT) from the observation that adversarial attack transferability contains +information related to input gradients and decision boundaries widely used to +understand model behaviors. We conduct a large-scale analysis on 69 +state-of-the-art ImageNet classifiers using our proposed similarity function to +answer the question. Moreover, we observe neural architecture-related phenomena +using model similarity that model diversity can lead to better performance on +model ensembles and knowledge distillation under specific conditions. Our +results provide insights into why developing diverse neural architectures with +distinct components is necessary. + +
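+
+ A sketch of how attack transferability can be turned into a pairwise
+similarity score is shown below; `attack` is a placeholder for any adversarial
+attack implementation (e.g. PGD), and the normalization used by the paper's
+SAT measure is not reproduced here.
+
+   import torch
+
+   def transfer_rate(source_model, target_model, attack, x, y):
+       # Fraction of adversarial examples crafted on the source model that
+       # also fool the target model; higher transfer -> more similar models.
+       x_adv = attack(source_model, x, y)
+       with torch.no_grad():
+           preds = target_model(x_adv).argmax(dim=1)
+       return (preds != y).float().mean().item()
+
+   def sat_similarity(model_a, model_b, attack, x, y):
+       # Symmetrize by averaging the two transfer directions.
+       return 0.5 * (transfer_rate(model_a, model_b, attack, x, y)
+                     + transfer_rate(model_b, model_a, attack, x, y))
+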
+
+ comment: 20pages, 13 figures, 2.3MB +
+
+
+
+
+ + ♻ ☆ Internal-Coordinate Density Modelling of Protein Structure: Covariance + Matters + + +
+ After the recent ground-breaking advances in protein structure prediction, +one of the remaining challenges in protein machine learning is to reliably +predict distributions of structural states. Parametric models of fluctuations +are difficult to fit due to complex covariance structures between degrees of +freedom in the protein chain, often causing models to either violate local or +global structural constraints. In this paper, we present a new strategy for +modelling protein densities in internal coordinates, which uses constraints in +3D space to induce covariance structure between the internal degrees of +freedom. We illustrate the potential of the procedure by constructing a +variational autoencoder with full covariance output induced by the constraints +implied by the conditional mean in 3D, and demonstrate that our approach makes +it possible to scale density models of internal coordinates to full protein +backbones in two settings: 1) a unimodal setting for proteins exhibiting small +fluctuations and limited amounts of available data, and 2) a multimodal setting +for larger conformational changes in a high data regime. + +
+
+ comment: Pages: 9 main, 3 references, 8 appendix. Figures: 5 main, 6 appendix +
+
+
+
+
+ + ♻ ☆ Classical Verification of Quantum Learning + + +
+ Quantum data access and quantum processing can make certain classically +intractable learning tasks feasible. However, quantum capabilities will only be +available to a select few in the near future. Thus, reliable schemes that allow +classical clients to delegate learning to untrusted quantum servers are +required to facilitate widespread access to quantum learning advantages. +Building on a recently introduced framework of interactive proof systems for +classical machine learning, we develop a framework for classical verification +of quantum learning. We exhibit learning problems that a classical learner +cannot efficiently solve on their own, but that they can efficiently and +reliably solve when interacting with an untrusted quantum prover. Concretely, +we consider the problems of agnostic learning parities and Fourier-sparse +functions with respect to distributions with uniform input marginal. We propose +a new quantum data access model that we call "mixture-of-superpositions" +quantum examples, based on which we give efficient quantum learning algorithms +for these tasks. Moreover, we prove that agnostic quantum parity and +Fourier-sparse learning can be efficiently verified by a classical verifier +with only random example or statistical query access. Finally, we showcase two +general scenarios in learning and verification in which quantum +mixture-of-superpositions examples do not lead to sample complexity +improvements over classical data. Our results demonstrate that the potential +power of quantum data for learning tasks, while not unlimited, can be utilized +by classical agents through interaction with untrusted quantum entities. + +
+
+ comment: 12 + 46 + 23 pages, 1 table, 1 figure; V2 fixes some typos and + includes new results in Section 6.3 +
+
+
+
+
+ + ♻ ☆ Optimizing K-means for Big Data: A Comparative Study + + +
+ This paper presents a comparative analysis of different optimization +techniques for the K-means algorithm in the context of big data. K-means is a +widely used clustering algorithm, but it can suffer from scalability issues +when dealing with large datasets. The paper explores different approaches to +overcome these issues, including parallelization, approximation, and sampling +methods. The authors evaluate the performance of these techniques on various +benchmark datasets and compare them in terms of speed, quality of clustering, +and scalability according to the LIMA dominance criterion. The results show +that different techniques are more suitable for different types of datasets and +provide insights into the trade-offs between speed and accuracy in K-means +clustering for big data. Overall, the paper offers a comprehensive guide for +practitioners and researchers on how to optimize K-means for big data +applications. + +
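+
+ One of the sampling-style optimizations such a comparison typically covers is
+mini-batch K-means, shown below with scikit-learn; this snippet is purely
+illustrative and is not code from the paper.
+
+   import numpy as np
+   from sklearn.cluster import MiniBatchKMeans
+
+   # Each update uses only a small random batch of points, trading a little
+   # clustering quality for a large speedup on big datasets.
+   X = np.random.randn(100_000, 16)
+   mbk = MiniBatchKMeans(n_clusters=10, batch_size=1024, n_init=3,
+                         random_state=0)
+   labels = mbk.fit_predict(X)
+   print("inertia:", mbk.inertia_)
+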
+
+
+
+
+ + ♻ ☆ Towards Sobolev Pruning + + +
+ The increasing use of stochastic models for describing complex phenomena +warrants surrogate models that capture the reference model characteristics at a +fraction of the computational cost, foregoing potentially expensive Monte Carlo +simulation. The predominant approach of fitting a large neural network and then +pruning it to a reduced size has commonly neglected shortcomings. The produced +surrogate models often will not capture the sensitivities and uncertainties +inherent in the original model. In particular, (higher-order) derivative +information of such surrogates could differ drastically. Given a large enough +network, we expect this derivative information to match. However, the pruned +model will almost certainly not share this behavior. + In this paper, we propose to find surrogate models by using sensitivity +information throughout the learning and pruning process. We build on work using +Interval Adjoint Significance Analysis for pruning and combine it with the +recent advancements in Sobolev Training to accurately model the original +sensitivity information in the pruned neural network based surrogate model. We +experimentally underpin the method on an example of pricing a multidimensional +Basket option modelled through a stochastic differential equation with Brownian +motion. The proposed method is, however, not limited to the domain of +quantitative finance, which was chosen as a case study for intuitive +interpretations of the sensitivities. It serves as a foundation for building +further surrogate modelling techniques considering sensitivity information. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ♻ ☆ XAI-TRIS: Non-linear image benchmarks to quantify false positive + post-hoc attribution of feature importance + + +
+ The field of 'explainable' artificial intelligence (XAI) has produced highly +cited methods that seek to make the decisions of complex machine learning (ML) +methods 'understandable' to humans, for example by attributing 'importance' +scores to input features. Yet, a lack of formal underpinning leaves it unclear +as to what conclusions can safely be drawn from the results of a given XAI +method and has also so far hindered the theoretical verification and empirical +validation of XAI methods. This means that challenging non-linear problems, +typically solved by deep neural networks, presently lack appropriate remedies. +Here, we craft benchmark datasets for three different non-linear classification +scenarios, in which the important class-conditional features are known by +design, serving as ground truth explanations. Using novel quantitative metrics, +we benchmark the explanation performance of a wide set of XAI methods across +three deep learning model architectures. We show that popular XAI methods are +often unable to significantly outperform random performance baselines and edge +detection methods. Moreover, we demonstrate that explanations derived from +different model architectures can be vastly different; thus, prone to +misinterpretation even under controlled conditions. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ LIPEx-Locally Interpretable Probabilistic Explanations-To Look Beyond + The True Class + + +
+ In this work, we instantiate a novel perturbation-based multi-class +explanation framework, LIPEx (Locally Interpretable Probabilistic Explanation). +We demonstrate that LIPEx not only locally replicates the probability +distributions output by the widely used complex classification models but also +provides insight into how every feature deemed to be important affects the +prediction probability for each of the possible classes. We achieve this by +defining the explanation as a matrix obtained via regression with respect to +the Hellinger distance in the space of probability distributions. Ablation +tests on text and image data, show that LIPEx-guided removal of important +features from the data causes more change in predictions for the underlying +model than similar tests based on other saliency-based or feature +importance-based Explainable AI (XAI) methods. It is also shown that compared +to LIME, LIPEx is more data efficient in terms of using a lesser number of +perturbations of the data to obtain a reliable explanation. This +data-efficiency is seen to manifest as LIPEx being able to compute its +explanation matrix around 53% faster than all-class LIME, for classification +experiments with text data. + +
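+
+ A simplified sketch of a LIPEx-style explanation matrix: perturbation masks
+are regressed onto the model's class probability distributions, and fitting on
+square-root probabilities makes ordinary least squares behave like a
+Hellinger-distance regression. The exact weighting and local kernel of LIPEx
+are not reproduced, so treat this as an illustration of the setup only.
+
+   import numpy as np
+
+   def explanation_matrix(masks, prob_dists):
+       # masks: (n_perturbations, n_features) binary keep/drop indicators
+       # prob_dists: (n_perturbations, n_classes) model output distributions
+       X = np.hstack([masks, np.ones((masks.shape[0], 1))])  # add intercept
+       Y = np.sqrt(prob_dists)                               # Hellinger embedding
+       W, *_ = np.linalg.lstsq(X, Y, rcond=None)
+       return W[:-1].T  # (n_classes, n_features): per-class feature effects
+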
+
+ comment: 19 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Resource Allocation of Federated Learning for the Metaverse with Mobile + Augmented Reality + + +
+ The Metaverse has received much attention recently. Metaverse applications +via mobile augmented reality (MAR) require rapid and accurate object detection +to mix digital data with the real world. Federated learning (FL) is an +intriguing distributed machine learning approach due to its privacy-preserving +characteristics. Due to privacy concerns and the limited computation resources +on mobile devices, we incorporate FL into MAR systems of the Metaverse to train +a model cooperatively. Besides, to balance the trade-off between energy, +execution latency and model accuracy, thereby accommodating different demands +and application scenarios, we formulate an optimization problem to minimize a +weighted combination of total energy consumption, completion time and model +accuracy. Through decomposing the non-convex optimization problem into two +subproblems, we devise a resource allocation algorithm to determine the +bandwidth allocation, transmission power, CPU frequency and video frame +resolution for each participating device. We further present the convergence +analysis and computational complexity of the proposed algorithm. Numerical +results show that our proposed algorithm has better performance (in terms of +energy consumption, completion time and model accuracy) under different weight +parameters compared to existing benchmarks. + +
+
+ comment: This paper appears in IEEE Transactions on Wireless Communications. + DOI: https://doi.org/10.1109/TWC.2023.3326884 It is the journal version of + 2022 IEEE 42nd International Conference on Distributed Computing Systems + (ICDCS) paper: arXiv:2209.14900; i.e., + https://doi.org/10.1109/ICDCS54860.2022.00101 +
+
+
+
+
+ + ♻ ☆ Calibration in Machine Learning Uncertainty Quantification: beyond + consistency to target adaptivity + + +
+ Reliable uncertainty quantification (UQ) in machine learning (ML) regression
+tasks is becoming the focus of many studies in materials and chemical science.
+It is now well understood that average calibration is insufficient, and most
+studies implement additional methods testing the conditional calibration with
+respect to uncertainty, i.e. consistency. Consistency is assessed mostly by
+so-called reliability diagrams. There exists, however, another way beyond average
+calibration, which is conditional calibration with respect to input features,
+i.e. adaptivity. In practice, adaptivity is the main concern of the final users
+of an ML-UQ method, who seek reliable predictions and uncertainties
+for any point in feature space. This article aims to show that consistency and
+adaptivity are complementary validation targets, and that a good consistency
+does not imply a good adaptivity. Adapted validation methods are proposed and
+illustrated on a representative example.
+
+
+ comment: arXiv admin note: text overlap with arXiv:2303.07170 +
+
+
+
+
+ + ♻ ☆ On the Interplay Between Stepsize Tuning and Progressive Sharpening NeurIPS 2023 + + +
+ Recent empirical work has revealed an intriguing property of deep learning +models by which the sharpness (largest eigenvalue of the Hessian) increases +throughout optimization until it stabilizes around a critical value at which +the optimizer operates at the edge of stability, given a fixed stepsize (Cohen +et al, 2022). We investigate empirically how the sharpness evolves when using +stepsize-tuners, the Armijo linesearch and Polyak stepsizes, that adapt the +stepsize along the iterations to local quantities such as, implicitly, the +sharpness itself. We find that the surprisingly poor performance of a classical +Armijo linesearch may be well explained by its tendency to ever-increase the +sharpness of the objective in the full or large batch regimes. On the other +hand, we observe that Polyak stepsizes operate generally at the edge of +stability or even slightly beyond, while outperforming its Armijo and constant +stepsizes counterparts. We conclude with an analysis that suggests unlocking +stepsize tuners requires an understanding of the joint dynamics of the step +size and the sharpness. + +
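+
+ For reference, the Polyak stepsize discussed above is simple to state; the
+quadratic test function below is only an assumed toy example with known
+optimal value f* = 0.
+
+   import numpy as np
+
+   def polyak_gd(f, grad_f, x0, f_star=0.0, steps=200):
+       # Gradient descent with eta_t = (f(x_t) - f_star) / ||grad f(x_t)||^2,
+       # which adapts the step length to the remaining progress to the optimum.
+       x = np.asarray(x0, dtype=float)
+       for _ in range(steps):
+           g = grad_f(x)
+           g_norm2 = float(g @ g)
+           if g_norm2 == 0.0:
+               break
+           x = x - (f(x) - f_star) / g_norm2 * g
+       return x
+
+   A = np.diag([1.0, 10.0])
+   x_min = polyak_gd(lambda x: 0.5 * x @ A @ x, lambda x: A @ x, [3.0, -2.0])
+   print(x_min)  # approaches the minimizer at the origin
+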
+
+ comment: Presented at the NeurIPS 2023 OPT Wokshop +
+
+
+
+
+ + ♻ ☆ Constrained Few-Shot Learning: Human-Like Low Sample Complexity Learning + and Non-Episodic Text Classification + + +
+ Few-shot learning (FSL) is an emergent paradigm of learning that attempts to
+learn to reason with low sample complexity to mimic the way humans learn,
+generalise and extrapolate from only a few seen examples. While FSL attempts to
+mimic these human characteristics, fundamentally, the task of FSL as
+conventionally formulated using meta-learning with episodic-based training does
+not in actuality align with how humans acquire and reason with knowledge. FSL
+with episodic training, while requiring only $K$ instances of each test class,
+still requires a large number of labelled training instances from disjoint
+classes. In this paper, we introduce the novel task of constrained few-shot
+learning (CFSL), a special case of FSL where $M$, the number of instances of
+each training class, is constrained such that $M \leq K$, thus applying a similar
+restriction during FSL training and test. We propose a method for CFSL
+leveraging Cat2Vec using a novel categorical contrastive loss inspired by
+cognitive theories such as fuzzy trace theory and prototype theory.
+
+
+ comment: Added additional references; updated various sections for clarity
+
+
+
+
+ + ♻ ☆ Activity Sparsity Complements Weight Sparsity for Efficient RNN + Inference NeurIPS 2023 + + +
+ Artificial neural networks open up unprecedented machine learning +capabilities at the cost of ever growing computational requirements. +Sparsifying the parameters, often achieved through weight pruning, has been +identified as a powerful technique to compress the number of model parameters +and reduce the computational operations of neural networks. Yet, sparse +activations, while omnipresent in both biological neural networks and deep +learning systems, have not been fully utilized as a compression technique in +deep learning. Moreover, the interaction between sparse activations and weight +pruning is not fully understood. In this work, we demonstrate that activity +sparsity can compose multiplicatively with parameter sparsity in a recurrent +neural network model based on the GRU that is designed to be activity sparse. +We achieve up to $20\times$ reduction of computation while maintaining +perplexities below $60$ on the Penn Treebank language modeling task. This +magnitude of reduction has not been achieved previously with solely sparsely +connected LSTMs, and the language modeling performance of our model has not +been achieved previously with any sparsely activated recurrent neural networks +or spiking neural networks. Neuromorphic computing devices are especially good +at taking advantage of the dynamic activity sparsity, and our results provide +strong evidence that making deep learning models activity sparse and porting +them to neuromorphic devices can be a viable strategy that does not compromise +on task performance. Our results also drive further convergence of methods from +deep learning and neuromorphic computing for efficient machine learning. + +
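+
+ The multiplicative composition of the two kinds of sparsity is easy to see
+with a back-of-the-envelope calculation; the example rates below are
+illustrative, not the paper's measured values.
+
+   def effective_mac_fraction(weight_sparsity, activity_sparsity):
+       # A multiply-accumulate is needed only if the weight was kept AND the
+       # presynaptic unit was active, so the sparsities compose multiplicatively.
+       return (1.0 - weight_sparsity) * (1.0 - activity_sparsity)
+
+   # e.g. 80% weight sparsity combined with 75% activity sparsity
+   print(1.0 / effective_mac_fraction(0.8, 0.75))  # -> 20.0x fewer operations
+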
+
+ comment: Accepted to the First MLNCP Workshop @ NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Universal Online Learning with Gradient Variations: A Multi-layer Online + Ensemble Approach NeurIPS 2023 + + +
+ In this paper, we propose an online convex optimization approach with two +different levels of adaptivity. On a higher level, our approach is agnostic to +the unknown types and curvatures of the online functions, while at a lower +level, it can exploit the unknown niceness of the environments and attain +problem-dependent guarantees. Specifically, we obtain $\mathcal{O}(\log V_T)$, +$\mathcal{O}(d \log V_T)$ and $\widehat{\mathcal{O}}(\sqrt{V_T})$ regret bounds +for strongly convex, exp-concave and convex loss functions, respectively, where +$d$ is the dimension, $V_T$ denotes problem-dependent gradient variations and +the $\widehat{\mathcal{O}}(\cdot)$-notation omits $\log V_T$ factors. Our +result not only safeguards the worst-case guarantees but also directly implies +the small-loss bounds in analysis. Moreover, when applied to +adversarial/stochastic convex optimization and game theory problems, our result +enhances the existing universal guarantees. Our approach is based on a +multi-layer online ensemble framework incorporating novel ingredients, +including a carefully designed optimism for unifying diverse function types and +cascaded corrections for algorithmic stability. Notably, despite its +multi-layer structure, our algorithm necessitates only one gradient query per +round, making it favorable when the gradient evaluation is time-consuming. This +is facilitated by a novel regret decomposition with carefully designed +surrogate losses. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ MAUVE Scores for Generative Models: Theory and Practice + + +
+ Generative artificial intelligence has made significant strides, producing +text indistinguishable from human prose and remarkably photorealistic images. +Automatically measuring how close the generated data distribution is to the +target distribution is central to diagnosing existing models and developing +better ones. We present MAUVE, a family of comparison measures between pairs of +distributions such as those encountered in the generative modeling of text or +images. These scores are statistical summaries of divergence frontiers +capturing two types of errors in generative modeling. We explore three +approaches to statistically estimate these scores: vector quantization, +non-parametric estimation, and classifier-based estimation. We provide +statistical bounds for the vector quantization approach. + Empirically, we find that the proposed scores paired with a range of +$f$-divergences and statistical estimation methods can quantify the gaps +between the distributions of human-written text and those of modern neural +language models by correlating with human judgments and identifying known +properties of the generated texts. We demonstrate in the vision domain that +MAUVE can identify known properties of generated images on par with or better +than existing metrics. In conclusion, we present practical recommendations for +using MAUVE effectively with language and image modalities. + +
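+
+ A compressed sketch of the vector-quantization estimation route: embeddings
+from both samples are quantized with a shared K-means codebook, and KL
+divergences against mixtures of the two induced histograms trace a divergence
+frontier. The smoothing, scaling and area summarization used by MAUVE proper
+are omitted here.
+
+   import numpy as np
+   from sklearn.cluster import KMeans
+
+   def vq_divergence_frontier(p_feats, q_feats, k=16, grid=25):
+       km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(
+           np.vstack([p_feats, q_feats]))
+       def hist(feats):
+           counts = np.bincount(km.predict(feats), minlength=k) + 1e-6
+           return counts / counts.sum()
+       def kl(a, b):
+           return float(np.sum(a * np.log(a / b)))
+       p, q = hist(p_feats), hist(q_feats)
+       # Each mixture weight w gives one frontier point (KL(Q||R), KL(P||R)).
+       return [(kl(q, w * p + (1 - w) * q), kl(p, w * p + (1 - w) * q))
+               for w in np.linspace(0.01, 0.99, grid)]
+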
+
+ comment: Published in Journal of Machine Learning Research +
+
+
+
+
+ + ♻ ☆ Contrastive-Signal-Dependent Plasticity: Forward-Forward Learning of + Spiking Neural Systems + + +
+ We develop a neuro-mimetic architecture, composed of spiking neuronal units, +where individual layers of neurons operate in parallel and adapt their synaptic +efficacies without the use of feedback pathways. Specifically, we propose an +event-based generalization of forward-forward learning, which we call +contrastive-signal-dependent plasticity (CSDP), for a spiking neural system +that iteratively processes sensory input over a stimulus window. The dynamics +that underwrite this recurrent circuit entail computing the membrane potential +of each processing element, in each layer, as a function of local bottom-up, +top-down, and lateral signals, facilitating a dynamic, layer-wise parallel form +of neural computation. Unlike other models, such as spiking predictive coding, +which rely on feedback synapses to adjust neural electrical activity, our model +operates purely online and forward in time, offering a promising way to learn +distributed representations of sensory data patterns, with and without labeled +context information. Notably, our experimental results on several pattern +datasets demonstrate that the CSDP process works well for training a dynamic +recurrent spiking network capable of both classification and reconstruction. + +
+
+
+
+
+ + ♻ ☆ PAPR: Proximity Attention Point Rendering + + +
+ Learning accurate and parsimonious point cloud representations of scene +surfaces from scratch remains a challenge in 3D representation learning. +Existing point-based methods often suffer from the vanishing gradient problem +or require a large number of points to accurately model scene geometry and +texture. To address these limitations, we propose Proximity Attention Point +Rendering (PAPR), a novel method that consists of a point-based scene +representation and a differentiable renderer. Our scene representation uses a +point cloud where each point is characterized by its spatial position, +influence score, and view-independent feature vector. The renderer selects the +relevant points for each ray and produces accurate colours using their +associated features. PAPR effectively learns point cloud positions to represent +the correct scene geometry, even when the initialization drastically differs +from the target geometry. Notably, our method captures fine texture details +while using only a parsimonious set of points. We also demonstrate four +practical applications of our method: zero-shot geometry editing, object +manipulation, texture transfer, and exposure control. More results and code are +available on our project website at https://zvict.github.io/papr/. + +
+
+
+
+
+ + ♻ ☆ Video Face Re-Aging: Toward Temporally Consistent Face Re-Aging + + +
+ Video face re-aging deals with altering the apparent age of a person to the +target age in videos. This problem is challenging due to the lack of paired +video datasets maintaining temporal consistency in identity and age. Most +re-aging methods process each image individually without considering the +temporal consistency of videos. While some existing works address the issue of +temporal coherence through video facial attribute manipulation in latent space, +they often fail to deliver satisfactory performance in age transformation. To +tackle the issues, we propose (1) a novel synthetic video dataset that features +subjects across a diverse range of age groups; (2) a baseline architecture +designed to validate the effectiveness of our proposed dataset, and (3) the +development of three novel metrics tailored explicitly for evaluating the +temporal consistency of video re-aging techniques. Our comprehensive +experiments on public datasets, such as VFHQ and CelebV-HQ, show that our +method outperforms the existing approaches in terms of both age transformation +and temporal consistency. + +
+
+ comment: 8 pages, 6 figures, 4 tables, Project page: + https://video-reaging.github.io/ +
+
+
+
+
+ + ♻ ☆ Large Language Models as Optimizers + + +
+ Optimization is ubiquitous. While derivative-based algorithms have been +powerful tools for various problems, the absence of gradient imposes challenges +on many real-world applications. In this work, we propose Optimization by +PROmpting (OPRO), a simple and effective approach to leverage large language +models (LLMs) as optimizers, where the optimization task is described in +natural language. In each optimization step, the LLM generates new solutions +from the prompt that contains previously generated solutions with their values, +then the new solutions are evaluated and added to the prompt for the next +optimization step. We first showcase OPRO on linear regression and traveling +salesman problems, then move on to prompt optimization where the goal is to +find instructions that maximize the task accuracy. With a variety of LLMs, we +demonstrate that the best prompts optimized by OPRO outperform human-designed +prompts by up to 8% on GSM8K, and by up to 50% on Big-Bench Hard tasks. Code at +https://github.com/google-deepmind/opro. + +
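+
+ The optimization loop described above reduces to a few lines once an LLM call
+and a task scorer are available; `call_llm` and `score_instruction` below are
+placeholders, and the meta-prompt wording is paraphrased rather than taken
+from the paper.
+
+   def opro(call_llm, score_instruction, seed_instruction,
+            n_steps=10, keep_top=8):
+       scored = [(score_instruction(seed_instruction), seed_instruction)]
+       for _ in range(n_steps):
+           top = sorted(scored, reverse=True)[:keep_top]
+           meta_prompt = ("Previous instructions and their accuracies:\n"
+                          + "\n".join(f"{s:.3f}: {p}" for s, p in top)
+                          + "\nWrite a new instruction with higher accuracy.")
+           candidate = call_llm(meta_prompt)   # LLM proposes a new solution
+           scored.append((score_instruction(candidate), candidate))
+       return max(scored)[1]
+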
+
+ comment: 42 pages, 26 figures, 15 tables. Code at + https://github.com/google-deepmind/opro +
+
+
+
+
+ + ♻ ☆ Bandit Algorithms for Prophet Inequality and Pandora's Box + + +
+ The Prophet Inequality and Pandora's Box problems are fundamental stochastic
+problems with applications in Mechanism Design, Online Algorithms, Stochastic
+Optimization, Optimal Stopping, and Operations Research. A usual assumption in
+these works is that the probability distributions of the $n$ underlying random
+variables are given as input to the algorithm. Since in practice these
+distributions need to be learned, we initiate the study of such stochastic
+problems in the Multi-Armed Bandits model.
+ In the Multi-Armed Bandits model we interact with $n$ unknown distributions
+over $T$ rounds: in round $t$ we play a policy $x^{(t)}$ and receive partial
+(bandit) feedback on the performance of $x^{(t)}$. The goal is to minimize the
+regret, which is the difference over $T$ rounds in the total value of the
+optimal algorithm that knows the distributions vs. the total value of our
+algorithm that learns the distributions from the partial feedback. Our main
+results give near-optimal $\tilde{O}(\mathsf{poly}(n)\sqrt{T})$ total regret
+algorithms for both Prophet Inequality and Pandora's Box.
+ Our proofs proceed by maintaining confidence intervals on the unknown indices
+of the optimal policy. The exploration-exploitation tradeoff prevents us from
+directly refining these confidence intervals, so the main technique is to
+design a regret upper bound that is learnable while playing low-regret Bandit
+policies.
+
+
+
+
+
+ + ♻ ☆ ExpM+NF Tractable Exponential Mechanism via Normalizing Flow, A Path + through the Accuracy-Privacy Ceiling Constraining Differentially Private ML + + +
+ The Exponential Mechanism (ExpM), a differentially private optimization
+method, promises many advantages over Differentially Private Stochastic
+Gradient Descent (DPSGD), the state-of-the-art (SOTA) and de facto method for
+differentially private machine learning (ML). Yet, ExpM has been historically
+stymied from differentially private training of modern ML algorithms by two
+obstructions: ExpM requires a sensitivity bound for the given loss function;
+ExpM requires sampling from a historically intractable density. We prove a
+sensitivity bound for $\ell_2$ loss, and investigate using Normalizing Flows
+(NFs), deep networks furnishing approximate sampling from the otherwise
+intractable ExpM distribution. We prove that as the NF output converges to the
+ExpM distribution, the privacy ($\varepsilon$) of an NF sample converges to
+that of the ExpM distribution. Under the assumption that the NF output
+distribution is the ExpM distribution, we empirically test ExpM+NF against
+DPSGD using the SOTA implementation (Opacus with PRV accounting) in multiple
+classification tasks on the Adult Dataset (census data) and MIMIC-III Dataset
+(healthcare records) using Logistic Regression and GRU-D, a deep learning
+recurrent neural network with $\sim$20K-100K parameters. In all experiments
+we find ExpM+NF achieves greater than 94% of the non-private training accuracy
+(AUC) with $\varepsilon$-DP for $\varepsilon$ as low as $1\mathrm{e}{-3}$ --
+three orders of magnitude stronger privacy with similar accuracy. Further,
+performance results show ExpM+NF training time is comparable to (slightly less
+than) that of DPSGD. Limitations and future directions are provided; notably,
+research on NF approximation accuracy and its effect on privacy is a promising
+avenue to substantially advance the field. Code for these experiments will be
+provided after review.
+ 
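+ For context, the exponential mechanism referenced above samples a model $\theta$ with
+density proportional to $\exp\!\big(\varepsilon\, u(\theta, D) / (2\,\Delta u)\big)$, where
+$u$ is the utility (here the negative training loss on dataset $D$) and $\Delta u$ its
+sensitivity; this is the standard textbook form, included for clarity rather than quoted
+from the paper. The required sensitivity bound and the intractability of sampling from
+this density are exactly the two obstructions the abstract describes.
+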
+
+
+
+
+ + ♻ ☆ Data-Adaptive Probabilistic Likelihood Approximation for Ordinary + Differential Equations + + +
+ Estimating the parameters of ordinary differential equations (ODEs) is of +fundamental importance in many scientific applications. While ODEs are +typically approximated with deterministic algorithms, new research on +probabilistic solvers indicates that they produce more reliable parameter +estimates by better accounting for numerical errors. However, many ODE systems +are highly sensitive to their parameter values. This produces deep local maxima +in the likelihood function -- a problem which existing probabilistic solvers +have yet to resolve. Here we present a novel probabilistic ODE likelihood +approximation, DALTON, which can dramatically reduce parameter sensitivity by +learning from noisy ODE measurements in a data-adaptive manner. Our +approximation scales linearly in both ODE variables and time discretization +points, and is applicable to ODEs with both partially-unobserved components and +non-Gaussian measurement models. Several examples demonstrate that DALTON +produces more accurate parameter estimates via numerical optimization than +existing probabilistic ODE solvers, and even in some cases than the exact ODE +likelihood itself. + +
+
+ comment: 8 pages, 7 figures +
+
+
+
+
+ + ♻ ☆ FRAC-Q-Learning: A Reinforcement Learning with Boredom Avoidance + Processes for Social Robots + + +
+ Reinforcement learning algorithms have often been applied to social robots.
+However, most reinforcement learning algorithms were not optimized for use with
+social robots, and consequently they may bore users. We proposed a new
+reinforcement learning method specialized for social robots, FRAC-Q-learning,
+that can avoid user boredom. The proposed algorithm consists of a forgetting
+process in addition to randomizing and categorizing processes. This study
+evaluated the interest and boredom-hardness scores of FRAC-Q-learning in
+comparison with traditional Q-learning. FRAC-Q-learning showed a significantly
+higher trend in interest scores and was significantly harder to bore users with
+than traditional Q-learning. Therefore, FRAC-Q-learning can contribute to
+developing a social robot that will not bore users. The proposed algorithm can
+also find applications in Web-based communication and educational systems. This
+paper presents the entire process, the detailed implementation, and a detailed
+evaluation method of FRAC-Q-learning for the first time.
+ 
+
+
+
+
+ + ♻ ☆ Practical, Private Assurance of the Value of Collaboration + + +
+ Two parties wish to collaborate on their datasets. However, before they +reveal their datasets to each other, the parties want to have the guarantee +that the collaboration would be fruitful. We look at this problem from the +point of view of machine learning, where one party is promised an improvement +on its prediction model by incorporating data from the other party. The parties +would only wish to collaborate further if the updated model shows an +improvement in accuracy. Before this is ascertained, the two parties would not +want to disclose their models and datasets. In this work, we construct an +interactive protocol for this problem based on the fully homomorphic encryption +scheme over the Torus (TFHE) and label differential privacy, where the +underlying machine learning model is a neural network. Label differential +privacy is used to ensure that computations are not done entirely in the +encrypted domain, which is a significant bottleneck for neural network training +according to the current state-of-the-art FHE implementations. We prove the +security of our scheme in the universal composability framework assuming +honest-but-curious parties, but where one party may not have any expertise in +labelling its initial dataset. Experiments show that we can obtain the output, +i.e., the accuracy of the updated model, with time many orders of magnitude +faster than a protocol using entirely FHE operations. + +
+
+
+
+
+ + ♻ ☆ Towards Clinical Prediction with Transparency: An Explainable AI + Approach to Survival Modelling in Residential Aged Care + + +
+ Background: Accurate survival time estimates aid end-of-life medical +decision-making. Objectives: Develop an interpretable survival model for +elderly residential aged care residents using advanced machine learning. +Setting: A major Australasian residential aged care provider. Participants: +Residents aged 65+ admitted for long-term care from July 2017 to August 2023. +Sample size: 11,944 residents across 40 facilities. Predictors: Factors include +age, gender, health status, co-morbidities, cognitive function, mood, +nutrition, mobility, smoking, sleep, skin integrity, and continence. Outcome: +Probability of survival post-admission, specifically calibrated for 6-month +survival estimates. Statistical Analysis: Tested CoxPH, EN, RR, Lasso, GB, XGB, +and RF models in 20 experiments with a 90/10 train/test split. Evaluated +accuracy using C-index, Harrell's C-index, dynamic AUROC, IBS, and calibrated +ROC. Chose XGB for its performance and calibrated it for 1, 3, 6, and 12-month +predictions using Platt scaling. Employed SHAP values to analyze predictor +impacts. Results: GB, XGB, and RF models showed the highest C-Index values +(0.714, 0.712, 0.712). The optimal XGB model demonstrated a 6-month survival +prediction AUROC of 0.746 (95% CI 0.744-0.749). Key mortality predictors +include age, male gender, mobility, health status, pressure ulcer risk, and +appetite. Conclusions: The study successfully applies machine learning to +create a survival model for aged care, aligning with clinical insights on +mortality risk factors and enhancing model interpretability and clinical +utility through explainable AI. + +
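+ The Platt scaling step mentioned above can be sketched as fitting a logistic model on
+held-out (raw risk score, binary 6-month outcome) pairs and using it to map new scores to
+calibrated probabilities. This is an illustrative sketch with assumed variable names, not
+the study's code.
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+
+def platt_calibrator(raw_scores_val, event_within_6m_val):
+    """Fit a sigmoid mapping raw risk scores to calibrated 6-month event probabilities."""
+    calibrator = LogisticRegression()
+    calibrator.fit(np.asarray(raw_scores_val).reshape(-1, 1),
+                   np.asarray(event_within_6m_val))
+    return calibrator
+
+# Usage: calibrated 6-month probabilities for new residents' raw model scores.
+# proba_6m = platt_calibrator(val_scores, val_events).predict_proba(
+#     np.asarray(new_scores).reshape(-1, 1))[:, 1]
+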
+
+
+
+
+ + ♻ ☆ MIRACLE: Inverse Reinforcement and Curriculum Learning Model for + Human-inspired Mobile Robot Navigation + + +
+ In emergency scenarios, mobile robots must navigate like humans, interpreting +stimuli to locate potential victims rapidly without interfering with first +responders. Existing socially-aware navigation algorithms face computational +and adaptability challenges. To overcome these, we propose a solution, MIRACLE +-- an inverse reinforcement and curriculum learning model, that employs +gamified learning to gather stimuli-driven human navigational data. This data +is then used to train a Deep Inverse Maximum Entropy Reinforcement Learning +model, reducing reliance on demonstrator abilities. Testing reveals a low loss +of 2.7717 within a 400-sized environment, signifying human-like response +replication. Current databases lack comprehensive stimuli-driven data, +necessitating our approach. By doing so, we enable robots to navigate emergency +situations with human-like perception, enhancing their life-saving +capabilities. + +
+
+
+
+
+ + ♻ ☆ From Prediction to Action: Critical Role of Performance Estimation for + Machine-Learning-Driven Materials Discovery + + +
+ Materials discovery driven by statistical property models is an iterative
+decision process, during which an initial data collection is extended with new
+data proposed by a model-informed acquisition function--with the goal to
+maximize a certain "reward" over time, such as the maximum property value
+discovered so far. While the materials science community achieved much progress
+in developing property models that predict well on average with respect to the
+training distribution, this form of in-distribution performance measurement is
+not directly coupled with the discovery reward. This is because an iterative
+discovery process has a shifting reward distribution that is
+over-proportionally determined by the model performance for exceptional
+materials. We demonstrate this problem using the example of bulk modulus
+maximization among double perovskite oxides. We find that the in-distribution
+predictive performance suggests random forests as superior to Gaussian process
+regression, while the ranking is reversed in terms of the discovery rewards. We
+argue that the lack of proper performance estimation methods from pre-computed
+data collections is a fundamental problem for improving data-driven materials
+discovery, and we propose a novel such estimator that, in contrast to naïve
+reward estimation, successfully predicts Gaussian processes with the "expected
+improvement" acquisition function as the best out of four options in our
+demonstrational study for double perovskites. Importantly, it does so without
+requiring the over one thousand ab initio computations that were needed to
+confirm this prediction.
+ 
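+ The "expected improvement" acquisition function mentioned above has the standard closed
+form under a Gaussian process posterior (standard definition, not taken from the paper):
+$\mathrm{EI}(x) = \mathbb{E}[\max(f(x) - f^{*}, 0)] = (\mu(x) - f^{*})\,\Phi(z) + \sigma(x)\,\varphi(z)$
+with $z = (\mu(x) - f^{*})/\sigma(x)$, where $\mu(x)$ and $\sigma(x)$ are the posterior
+mean and standard deviation and $f^{*}$ is the best property value observed so far.
+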
+
+ comment: Simplified notation +
+
+
+
+
+ + ♻ ☆ Convolutional layers are equivariant to discrete shifts but not + continuous translations + + +
+ The purpose of this short and simple note is to clarify a common +misconception about convolutional neural networks (CNNs). CNNs are made up of +convolutional layers which are shift equivariant due to weight sharing. +However, convolutional layers are not translation equivariant, even when +boundary effects are ignored and when pooling and subsampling are absent. This +is because shift equivariance is a discrete symmetry while translation +equivariance is a continuous symmetry. This fact is well known among +researchers in equivariant machine learning, but is usually overlooked among +non-experts. To minimize confusion, we suggest using the term `shift +equivariance' to refer to discrete shifts in pixels and `translation +equivariance' to refer to continuous translations. + +
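+ The discrete shift equivariance discussed in the note can be checked numerically: with
+circular boundary handling, convolving and then shifting by one pixel equals shifting and
+then convolving. This check is added here for illustration and is not taken from the note.
+
+import numpy as np
+from scipy.ndimage import convolve
+
+rng = np.random.default_rng(0)
+image = rng.normal(size=(8, 8))
+kernel = rng.normal(size=(3, 3))
+
+conv_then_shift = np.roll(convolve(image, kernel, mode="wrap"), shift=1, axis=0)
+shift_then_conv = convolve(np.roll(image, shift=1, axis=0), kernel, mode="wrap")
+assert np.allclose(conv_then_shift, shift_then_conv)  # holds for integer pixel shifts
+
+# No analogous exact identity exists for a half-pixel (continuous) translation on the
+# pixel grid, which is the distinction the note draws.
+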
+
+
+
+
+ + ♻ ☆ Domain Invariant Representation Learning and Sleep Dynamics Modeling for + Automatic Sleep Staging + + +
+ Sleep staging has become a critical task in diagnosing and treating sleep
+disorders to prevent sleep-related diseases. With rapidly growing large-scale
+public sleep databases and advances in machine learning, significant progress
+has been made toward automatic sleep staging. However, previous studies face
+several critical problems: the heterogeneity of subjects' physiological
+signals, the inability to extract meaningful information from unlabeled sleep
+signal data to improve predictive performances, the difficulty in modeling
+correlations between sleep stages, and the lack of an effective mechanism to
+quantify predictive uncertainty. In this study, we propose a neural
+network-based automatic sleep staging model, named DREAM, to learn
+domain-generalized representations from physiological signals and to model
+sleep dynamics. DREAM learns sleep-related and subject-invariant
+representations from diverse subjects' sleep signal segments and models sleep
+dynamics by capturing interactions between sequential signal segments and
+between sleep stages. In the experiments, we demonstrate that DREAM outperforms
+the existing sleep staging methods on three datasets. The case study
+demonstrates that our model can learn the generalized decision function
+resulting in good prediction performances for new subjects, especially when
+there are differences between testing and training subjects. The results with
+unlabeled data show the benefit of leveraging unlabeled EEG data. Further,
+uncertainty quantification demonstrates that DREAM provides prediction
+uncertainty, making the model reliable and helping sleep experts in real-world
+applications.
+ 
+
+
+
+
+ + ♻ ☆ SVQ: Sparse Vector Quantization for Spatiotemporal Forecasting + + +
+ Spatiotemporal forecasting tasks, such as weather forecasting and traffic +prediction, offer significant societal benefits. These tasks can be effectively +approached as image forecasting problems using computer vision models. Vector +quantization (VQ) is a well-known method for discrete representation that +improves the latent space, leading to enhanced generalization and transfer +learning capabilities. One of the main challenges in using VQ for +spatiotemporal forecasting is how to balance between keeping enough details and +removing noises from the original patterns for better generalization. We +address this challenge by developing sparse vector quantization, or {\bf SVQ} +for short, that leverages sparse regression to make better trade-off between +the two objectives. The main innovation of this work is to approximate sparse +regression by a two-layer MLP and a randomly fixed or learnable matrix, +dramatically improving its computational efficiency. Through experiments +conducted on diverse datasets in multiple fields including weather forecasting, +traffic flow prediction, and video forecasting, we unequivocally demonstrate +that our proposed method consistently enhances the performance of base models +and achieves state-of-the-art results across all benchmarks. + +
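+ A minimal sketch of the approximation described above, assuming the two-layer MLP
+produces non-negative codes that are multiplied by a randomly fixed (or optionally
+learnable) matrix; dimensions, the ReLU sparsification, and all names are illustrative
+assumptions, not the authors' implementation.
+
+import torch
+import torch.nn as nn
+
+class SparseVQSketch(nn.Module):
+    def __init__(self, dim, num_codes, hidden=256, learnable_codebook=False):
+        super().__init__()
+        self.mlp = nn.Sequential(nn.Linear(dim, hidden), nn.ReLU(),
+                                 nn.Linear(hidden, num_codes))
+        # Codebook is randomly fixed by default; optionally learnable, as the abstract notes.
+        self.codebook = nn.Parameter(torch.randn(num_codes, dim),
+                                     requires_grad=learnable_codebook)
+
+    def forward(self, x):                  # x: (batch, dim) latent features
+        codes = torch.relu(self.mlp(x))    # non-negative codes, encouraging sparsity
+        return codes @ self.codebook       # quantized reconstruction in the latent space
+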
+
+
+
+
+ + ♻ ☆ Deep Learning for Hate Speech Detection: A Comparative Study + + +
+ Automated hate speech detection is an important tool in combating the spread +of hate speech, particularly in social media. Numerous methods have been +developed for the task, including a recent proliferation of deep-learning based +approaches. A variety of datasets have also been developed, exemplifying +various manifestations of the hate-speech detection problem. We present here a +large-scale empirical comparison of deep and shallow hate-speech detection +methods, mediated through the three most commonly used datasets. Our goal is to +illuminate progress in the area, and identify strengths and weaknesses in the +current state-of-the-art. We particularly focus our analysis on measures of +practical performance, including detection accuracy, computational efficiency, +capability in using pre-trained models, and domain generalization. In doing so +we aim to provide guidance as to the use of hate-speech detection in practice, +quantify the state-of-the-art, and identify future research directions. Code +and dataset are available at +https://github.com/jmjmalik22/Hate-Speech-Detection. + +
+
+ comment: 18 pages, 4 figures, and 6 tables +
+
+
+
+
+
+
+
+ + Multimedia 7 + +
+
+
+ + ☆ Generating Illustrated Instructions + + +
+ We introduce the new task of generating Illustrated Instructions, i.e., +visual instructions customized to a user's needs. We identify desiderata unique +to this task, and formalize it through a suite of automatic and human +evaluation metrics, designed to measure the validity, consistency, and efficacy +of the generations. We combine the power of large language models (LLMs) +together with strong text-to-image generation diffusion models to propose a +simple approach called StackedDiffusion, which generates such illustrated +instructions given text as input. The resulting model strongly outperforms +baseline approaches and state-of-the-art multimodal LLMs; and in 30% of cases, +users even prefer it to human-generated articles. Most notably, it enables +various new and exciting applications far beyond what static articles on the +web can provide, such as personalized instructions complete with intermediate +steps and pictures in response to a user's individual situation. + +
+
+ comment: Project website: + http://facebookresearch.github.io/IllustratedInstructions +
+
+
+
+
+ + ☆ PhotoMaker: Customizing Realistic Human Photos via Stacked ID Embedding + + +
+ Recent advances in text-to-image generation have made remarkable progress in
+synthesizing realistic human photos conditioned on given text prompts. However,
+existing personalized generation methods cannot simultaneously satisfy the
+requirements of high efficiency, promising identity (ID) fidelity, and flexible
+text controllability. In this work, we introduce PhotoMaker, an efficient
+personalized text-to-image generation method, which mainly encodes an arbitrary
+number of input ID images into a stacked ID embedding for preserving ID
+information. Such an embedding, serving as a unified ID representation, can not
+only encapsulate the characteristics of the same input ID comprehensively, but
+also accommodate the characteristics of different IDs for subsequent
+integration. This paves the way for more intriguing and practically valuable
+applications. Besides, to drive the training of our PhotoMaker, we propose an
+ID-oriented data construction pipeline to assemble the training data. Trained
+on the dataset constructed through the proposed pipeline, our PhotoMaker
+demonstrates better ID preservation ability than test-time fine-tuning based
+methods, yet provides significant speed improvements, high-quality generation
+results, strong generalization capabilities, and a wide range of applications.
+Our project page is available at https://photo-maker.github.io/
+ 
+
+ comment: Tech report; Project page: https://photo-maker.github.io/ +
+
+
+
+
+ + ☆ Deep3DSketch: 3D modeling from Free-hand Sketches with View- and + Structural-Aware Adversarial Training ICASSP 2023 + + +
+ This work aims to investigate the problem of 3D modeling using single +free-hand sketches, which is one of the most natural ways we humans express +ideas. Although sketch-based 3D modeling can drastically make the 3D modeling +process more accessible, the sparsity and ambiguity of sketches bring +significant challenges for creating high-fidelity 3D models that reflect the +creators' ideas. In this work, we propose a view- and structural-aware deep +learning approach, \textit{Deep3DSketch}, which tackles the ambiguity and fully +uses sparse information of sketches, emphasizing the structural information. +Specifically, we introduced random pose sampling on both 3D shapes and 2D +silhouettes, and an adversarial training scheme with an effective progressive +discriminator to facilitate learning of the shape structures. Extensive +experiments demonstrated the effectiveness of our approach, which outperforms +existing methods -- with state-of-the-art (SOTA) performance on both synthetic +and real datasets. + +
+
+ comment: ICASSP 2023. arXiv admin note: substantial text overlap with + arXiv:2310.18148 +
+
+
+
+
+ + ☆ GPT-4V with Emotion: A Zero-shot Benchmark for Multimodal Emotion + Understanding + + +
+ Recently, GPT-4 with Vision (GPT-4V) has shown remarkable performance across +various multimodal tasks. However, its efficacy in emotion recognition remains +a question. This paper quantitatively evaluates GPT-4V's capabilities in +multimodal emotion understanding, encompassing tasks such as facial emotion +recognition, visual sentiment analysis, micro-expression recognition, dynamic +facial emotion recognition, and multimodal emotion recognition. Our experiments +show that GPT-4V exhibits impressive multimodal and temporal understanding +capabilities, even surpassing supervised systems in some tasks. Despite these +achievements, GPT-4V is currently tailored for general domains. It performs +poorly in micro-expression recognition that requires specialized expertise. The +main purpose of this paper is to present quantitative results of GPT-4V on +emotion understanding and establish a zero-shot benchmark for future research. +Code and evaluation results are available at: +https://github.com/zeroQiaoba/gpt4v-emotion. + +
+
+
+
+
+ + ♻ ☆ Exploring the Interactive Guidance for Unified and Effective Image + Matting + + +
+ Recent image matting studies are developing towards proposing trimap-free or
+interactive methods to complete complex image matting tasks. Although they
+avoid the extensive labor of trimap annotation, existing methods still suffer
+from two limitations: (1) For a single image with multiple objects, extra
+interaction information is essential to help determine the matting target; (2)
+For transparent objects, accurate regression of the alpha matte from an RGB
+image is much more difficult than for opaque ones. In this work, we propose a
+Unified Interactive image Matting method, named UIM, which solves the
+limitations and achieves satisfying matting results for any scenario.
+Specifically, UIM leverages multiple types of user interaction to avoid the
+ambiguity of multiple matting targets, and we compare the pros and cons of
+different annotation types in detail. To unify the matting performance for
+transparent and opaque objects, we decouple image matting into two stages,
+i.e., foreground segmentation and transparency prediction. Moreover, we design
+a multi-scale attentive fusion module to alleviate the vagueness in the
+boundary region. Experimental results demonstrate that UIM achieves
+state-of-the-art performance on the Composition-1K test set and a synthetic
+unified dataset.
+ 
+
+
+
+
+ + ♻ ☆ A New Fine-grained Alignment Method for Image-text Matching + + +
+ Image-text retrieval is a widely studied topic in the field of computer +vision due to the exponential growth of multimedia data, whose core concept is +to measure the similarity between images and text. However, most existing +retrieval methods heavily rely on cross-attention mechanisms for cross-modal +fine-grained alignment, which takes into account excessive irrelevant regions +and treats prominent and non-significant words equally, thereby limiting +retrieval accuracy. This paper aims to investigate an alignment approach that +reduces the involvement of non-significant fragments in images and text while +enhancing the alignment of prominent segments. For this purpose, we introduce +the Cross-Modal Prominent Fragments Enhancement Aligning Network(CPFEAN), which +achieves improved retrieval accuracy by diminishing the participation of +irrelevant regions during alignment and relatively increasing the alignment +similarity of prominent words. Additionally, we incorporate prior textual +information into image regions to reduce misalignment occurrences. In practice, +we first design a novel intra-modal fragments relationship reasoning method, +and subsequently employ our proposed alignment mechanism to compute the +similarity between images and text. Extensive quantitative comparative +experiments on MS-COCO and Flickr30K datasets demonstrate that our approach +outperforms state-of-the-art methods by about 5% to 10% in the rSum metric. + +
+
+
+
+
+ + ♻ ☆ Video Face Re-Aging: Toward Temporally Consistent Face Re-Aging + + +
+ Video face re-aging deals with altering the apparent age of a person to the +target age in videos. This problem is challenging due to the lack of paired +video datasets maintaining temporal consistency in identity and age. Most +re-aging methods process each image individually without considering the +temporal consistency of videos. While some existing works address the issue of +temporal coherence through video facial attribute manipulation in latent space, +they often fail to deliver satisfactory performance in age transformation. To +tackle the issues, we propose (1) a novel synthetic video dataset that features +subjects across a diverse range of age groups; (2) a baseline architecture +designed to validate the effectiveness of our proposed dataset, and (3) the +development of three novel metrics tailored explicitly for evaluating the +temporal consistency of video re-aging techniques. Our comprehensive +experiments on public datasets, such as VFHQ and CelebV-HQ, show that our +method outperforms the existing approaches in terms of both age transformation +and temporal consistency. + +
+
+ comment: 8 pages, 6 figures, 4 tables, Project page: + https://video-reaging.github.io/ +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 73 + +
+
+
+ + ☆ OneLLM: One Framework to Align All Modalities with Language + + +
+ Multimodal large language models (MLLMs) have gained significant attention +due to their strong multimodal understanding capability. However, existing +works rely heavily on modality-specific encoders, which usually differ in +architecture and are limited to common modalities. In this paper, we present +OneLLM, an MLLM that aligns eight modalities to language using a unified +framework. We achieve this through a unified multimodal encoder and a +progressive multimodal alignment pipeline. In detail, we first train an image +projection module to connect a vision encoder with LLM. Then, we build a +universal projection module (UPM) by mixing multiple image projection modules +and dynamic routing. Finally, we progressively align more modalities to LLM +with the UPM. To fully leverage the potential of OneLLM in following +instructions, we also curated a comprehensive multimodal instruction dataset, +including 2M items from image, audio, video, point cloud, depth/normal map, IMU +and fMRI brain activity. OneLLM is evaluated on 25 diverse benchmarks, +encompassing tasks such as multimodal captioning, question answering and +reasoning, where it delivers excellent performance. Code, data, model and +online demo are available at https://github.com/csuhan/OneLLM + +
+
+ comment: Code: https://github.com/csuhan/OneLLM +
+
+
+
+
+ + ☆ PROMISE: A Framework for Model-Driven Stateful Prompt Orchestration + + +
+ The advent of increasingly powerful language models has raised expectations +for language-based interactions. However, controlling these models is a +challenge, emphasizing the need to be able to investigate the feasibility and +value of their application. We present PROMISE, a framework that facilitates +the development of complex language-based interactions with information +systems. Its use of state machine modeling concepts enables model-driven, +dynamic prompt orchestration across hierarchically nested states and +transitions. This improves the control of the behavior of language models and +thus enables their effective and efficient use. We show the benefits of PROMISE +in the context of application scenarios within health information systems and +demonstrate its ability to handle complex interactions. + +
+
+
+
+
+ + ☆ Evaluating and Mitigating Discrimination in Language Model Decisions + + +
+ As language models (LMs) advance, interest is growing in applying them to +high-stakes societal decisions, such as determining financing or housing +eligibility. However, their potential for discrimination in such contexts +raises ethical concerns, motivating the need for better methods to evaluate +these risks. We present a method for proactively evaluating the potential +discriminatory impact of LMs in a wide range of use cases, including +hypothetical use cases where they have not yet been deployed. Specifically, we +use an LM to generate a wide array of potential prompts that decision-makers +may input into an LM, spanning 70 diverse decision scenarios across society, +and systematically vary the demographic information in each prompt. Applying +this methodology reveals patterns of both positive and negative discrimination +in the Claude 2.0 model in select settings when no interventions are applied. +While we do not endorse or permit the use of language models to make automated +decisions for the high-risk use cases we study, we demonstrate techniques to +significantly decrease both positive and negative discrimination through +careful prompt engineering, providing pathways toward safer deployment in use +cases where they may be appropriate. Our work enables developers and +policymakers to anticipate, measure, and address discrimination as language +model capabilities and applications continue to expand. We release our dataset +and prompts at https://huggingface.co/datasets/Anthropic/discrim-eval + +
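+ The systematic variation of demographic information described above can be sketched as a
+simple template sweep; the template, attribute values, and decision scenario below are
+illustrative stand-ins, not items from the released dataset.
+
+from itertools import product
+
+template = ("The applicant is a {age}-year-old {race} {gender} applying for a small "
+            "business loan. Should the application be approved? Answer yes or no.")
+ages = [20, 40, 60]
+genders = ["man", "woman"]
+races = ["white", "Black", "Asian", "Hispanic"]
+
+prompts = [template.format(age=a, gender=g, race=r)
+           for a, g, r in product(ages, genders, races)]
+# Each variant is sent to the model; decision rates are then compared across demographic
+# groups to surface positive or negative discrimination.
+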
+
+
+
+
+ + ☆ An Integration of Pre-Trained Speech and Language Models for End-to-End + Speech Recognition + + +
+ Advances in machine learning have made it possible to perform various text +and speech processing tasks, including automatic speech recognition (ASR), in +an end-to-end (E2E) manner. Since typical E2E approaches require large amounts +of training data and resources, leveraging pre-trained foundation models +instead of training from scratch is gaining attention. Although there have been +attempts to use pre-trained speech and language models in ASR, most of them are +limited to using either. This paper explores the potential of integrating a +pre-trained speech representation model with a large language model (LLM) for +E2E ASR. The proposed model enables E2E ASR by generating text tokens in an +autoregressive manner via speech representations as speech prompts, taking +advantage of the vast knowledge provided by the LLM. Furthermore, the proposed +model can incorporate remarkable developments for LLM utilization, such as +inference optimization and parameter-efficient domain adaptation. Experimental +results show that the proposed model achieves performance comparable to modern +E2E ASR models. + +
+
+ comment: 6 pages, 2 figures, 3 tables, The model is available at + https://huggingface.co/rinna/nue-asr +
+
+
+
+
+ + ☆ Generative agent-based modeling with actions grounded in physical, + social, or digital space using Concordia + + +
+ Agent-based modeling has been around for decades, and applied widely across +the social and natural sciences. The scope of this research method is now +poised to grow dramatically as it absorbs the new affordances provided by Large +Language Models (LLM)s. Generative Agent-Based Models (GABM) are not just +classic Agent-Based Models (ABM)s where the agents talk to one another. Rather, +GABMs are constructed using an LLM to apply common sense to situations, act +"reasonably", recall common semantic knowledge, produce API calls to control +digital technologies like apps, and communicate both within the simulation and +to researchers viewing it from the outside. Here we present Concordia, a +library to facilitate constructing and working with GABMs. Concordia makes it +easy to construct language-mediated simulations of physically- or +digitally-grounded environments. Concordia agents produce their behavior using +a flexible component system which mediates between two fundamental operations: +LLM calls and associative memory retrieval. A special agent called the Game +Master (GM), which was inspired by tabletop role-playing games, is responsible +for simulating the environment where the agents interact. Agents take actions +by describing what they want to do in natural language. The GM then translates +their actions into appropriate implementations. In a simulated physical world, +the GM checks the physical plausibility of agent actions and describes their +effects. In digital environments simulating technologies such as apps and +services, the GM may handle API calls to integrate with external tools such as +general AI assistants (e.g., Bard, ChatGPT), and digital apps (e.g., Calendar, +Email, Search, etc.). Concordia was designed to support a wide array of +applications both in scientific research and for evaluating performance of real +digital services by simulating users and/or generating synthetic data. + +
+
+ comment: 31 pages, 5 figures +
+
+
+
+
+ + ☆ Interpretability Illusions in the Generalization of Simplified Models + + +
+ A common method to study deep learning systems is to use simplified model
+representations -- for example, using singular value decomposition to visualize
+the model's hidden states in a lower dimensional space. This approach assumes
+that the results of these simplified representations are faithful to the
+original model. Here, we illustrate an important caveat to this assumption:
+even if the simplified representations can accurately approximate the full
+model on the training set, they may fail to accurately capture the model's
+behavior out of distribution -- the understanding developed from simplified
+representations may be an illusion. We illustrate this by training Transformer
+models on controlled datasets with systematic generalization splits. First, we
+train models on the Dyck balanced-parenthesis languages. We simplify these
+models using tools like dimensionality reduction and clustering, and then
+explicitly test how these simplified proxies match the behavior of the original
+model on various out-of-distribution test sets. We find that the simplified
+proxies are generally less faithful out of distribution. In cases where the
+original model generalizes to novel structures or deeper depths, the simplified
+versions may fail, or generalize better. This finding holds even if the
+simplified representations do not directly depend on the training distribution.
+Next, we study a more naturalistic task: predicting the next character in a
+dataset of computer code. We find similar generalization gaps between the
+original model and simplified proxies, and conduct further analysis to
+investigate which aspects of the code completion task are associated with the
+largest gaps. Together, our results raise questions about the extent to which
+mechanistic interpretations derived using tools like SVD can reliably predict
+what a model will do in novel situations.
+ 
+
+
+
+
+ + ☆ Not All Large Language Models (LLMs) Succumb to the "Reversal Curse": A + Comparative Study of Deductive Logical Reasoning in BERT and GPT Models + + +
+ The "Reversal Curse" refers to the scenario where auto-regressive decoder +large language models (LLMs), such as ChatGPT, trained on "A is B" fail to +learn "B is A", demonstrating a basic failure of logical deduction. This raises +a red flag in the use of GPT models for certain general tasks such as +constructing knowledge graphs, considering their adherence to this symmetric +principle. In our study, we examined a bidirectional LLM, BERT, and found that +it is immune to the reversal curse. Driven by ongoing efforts to construct +biomedical knowledge graphs with LLMs, we also embarked on evaluating more +complex but essential deductive reasoning capabilities. This process included +first training encoder and decoder language models to master the intersection +($\cap$) and union ($\cup$) operations on two sets and then moving on to assess +their capability to infer different combinations of union ($\cup$) and +intersection ($\cap$) operations on three newly created sets. The findings +showed that while both encoder and decoder language models, trained for tasks +involving two sets (union/intersection), were proficient in such scenarios, +they encountered difficulties when dealing with operations that included three +sets (various combinations of union and intersection). Our research highlights +the distinct characteristics of encoder and decoder models in simple and +complex logical reasoning. In practice, the choice between BERT and GPT should +be guided by the specific requirements and nature of the task at hand, +leveraging their respective strengths in bidirectional context comprehension +and sequence prediction. + +
+
+
+
+
+ + ☆ Improving Bias Mitigation through Bias Experts in Natural Language + Understanding EMNLP 2023 + + +
+ Biases in the dataset often enable the model to achieve high performance on +in-distribution data, while poorly performing on out-of-distribution data. To +mitigate the detrimental effect of the bias on the networks, previous works +have proposed debiasing methods that down-weight the biased examples identified +by an auxiliary model, which is trained with explicit bias labels. However, +finding a type of bias in datasets is a costly process. Therefore, recent +studies have attempted to make the auxiliary model biased without the guidance +(or annotation) of bias labels, by constraining the model's training +environment or the capability of the model itself. Despite the promising +debiasing results of recent works, the multi-class learning objective, which +has been naively used to train the auxiliary model, may harm the bias +mitigation effect due to its regularization effect and competitive nature +across classes. As an alternative, we propose a new debiasing framework that +introduces binary classifiers between the auxiliary model and the main model, +coined bias experts. Specifically, each bias expert is trained on a binary +classification task derived from the multi-class classification task via the +One-vs-Rest approach. Experimental results demonstrate that our proposed +strategy improves the bias identification ability of the auxiliary model. +Consequently, our debiased model consistently outperforms the state-of-the-art +on various challenge datasets. + +
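+ The One-vs-Rest construction described above can be sketched as follows; a simple linear
+classifier stands in for each bias expert, and all names are illustrative rather than the
+paper's implementation.
+
+import numpy as np
+from sklearn.linear_model import LogisticRegression
+
+def train_bias_experts(features, labels, num_classes):
+    """One binary 'bias expert' per class, trained on a class-vs-rest relabeling."""
+    experts = []
+    for c in range(num_classes):
+        binary_targets = (np.asarray(labels) == c).astype(int)  # class c vs. the rest
+        experts.append(LogisticRegression(max_iter=1000).fit(features, binary_targets))
+    return experts
+
+# The experts' confidences can then be used to down-weight biased examples when training
+# the main model, as in the debiasing framework described above.
+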
+
+ comment: Accepted in EMNLP 2023 as a long paper +
+
+
+
+
+ + ☆ XAIQA: Explainer-Based Data Augmentation for Extractive Question + Answering ML4H + + +
+ Extractive question answering (QA) systems can enable physicians and +researchers to query medical records, a foundational capability for designing +clinical studies and understanding patient medical history. However, building +these systems typically requires expert-annotated QA pairs. Large language +models (LLMs), which can perform extractive QA, depend on high quality data in +their prompts, specialized for the application domain. We introduce a novel +approach, XAIQA, for generating synthetic QA pairs at scale from data naturally +available in electronic health records. Our method uses the idea of a +classification model explainer to generate questions and answers about medical +concepts corresponding to medical codes. In an expert evaluation with two +physicians, our method identifies $2.2\times$ more semantic matches and +$3.8\times$ more clinical abbreviations than two popular approaches that use +sentence transformers to create QA pairs. In an ML evaluation, adding our QA +pairs improves performance of GPT-4 as an extractive QA model, including on +difficult questions. In both the expert and ML evaluations, we examine +trade-offs between our method and sentence transformers for QA pair generation +depending on question difficulty. + +
+
+ comment: Extended Abstract presented at Machine Learning for Health (ML4H) + symposium 2023, December 10th, 2023, New Orleans, United States, 8 pages +
+
+
+
+
+ + ☆ Holmes: Towards Distributed Training Across Clusters with Heterogeneous + NIC Environment + + +
+ Large language models (LLMs) such as GPT-3, OPT, and LLaMA have demonstrated +remarkable accuracy in a wide range of tasks. However, training these models +can incur significant expenses, often requiring tens of thousands of GPUs for +months of continuous operation. Typically, this training is carried out in +specialized GPU clusters equipped with homogeneous high-speed Remote Direct +Memory Access (RDMA) network interface cards (NICs). The acquisition and +maintenance of such dedicated clusters is challenging. Current LLM training +frameworks, like Megatron-LM and Megatron-DeepSpeed, focus primarily on +optimizing training within homogeneous cluster settings. In this paper, we +introduce Holmes, a training framework for LLMs that employs thoughtfully +crafted data and model parallelism strategies over the heterogeneous NIC +environment. Our primary technical contribution lies in a novel scheduling +method that intelligently allocates distinct computational tasklets in LLM +training to specific groups of GPU devices based on the characteristics of +their connected NICs. Furthermore, our proposed framework, utilizing pipeline +parallel techniques, demonstrates scalability to multiple GPU clusters, even in +scenarios without high-speed interconnects between nodes in distinct clusters. +We conducted comprehensive experiments that involved various scenarios in the +heterogeneous NIC environment. In most cases, our framework achieves +performance levels close to those achievable with homogeneous RDMA-capable +networks (InfiniBand or RoCE), significantly exceeding training efficiency +within the pure Ethernet environment. Additionally, we verified that our +framework outperforms other mainstream LLM frameworks under heterogeneous NIC +environment in terms of training efficiency and can be seamlessly integrated +with them. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ Sig-Networks Toolkit: Signature Networks for Longitudinal Language + Modelling + + +
+ We present an open-source, pip installable toolkit, Sig-Networks, the first
+of its kind for longitudinal language modelling. A central focus is the
+incorporation of Signature-based Neural Network models, which have recently
+shown success in temporal tasks. We apply and extend published research,
+providing a full suite of signature-based models. Their components can be used
+as PyTorch building blocks in future architectures. Sig-Networks enables
+task-agnostic dataset plug-in, seamless pre-processing for sequential data,
+parameter flexibility, and automated tuning across a range of models. We
+examine signature networks under three different NLP tasks of varying temporal
+granularity: counselling conversations, rumour stance switch, and mood changes
+in social media threads, showing SOTA performance in all three, and provide
+guidance for future tasks. We release the toolkit as a PyTorch package with an
+introductory video, along with Git repositories for preprocessing and
+modelling, including sample notebooks for the modelled NLP tasks.
+ 
+
+
+
+
+ + ☆ Exploring Answer Information Methods for Question Generation with + Transformers + + +
+ There has been a lot of work in question generation where different methods
+of providing target answers as input have been employed. This experimentation
+has mostly been carried out for RNN-based models. We use three different
+methods and their combinations for incorporating answer information and explore
+their effect on several automatic evaluation metrics. The methods used are
+answer prompting, a custom product method combining answer embeddings and
+encoder outputs, choosing sentences from the input paragraph that contain
+answer-related information, and a separate cross-attention block in the decoder
+which attends to the answer. We observe that answer prompting without any
+additional modes obtains the best ROUGE and METEOR scores. Additionally, we use
+a custom metric to calculate how many of the generated questions have the same
+answer as the one used to generate them.
+ 
+
+
+
+
+ + ☆ AMR Parsing is Far from Solved: GrAPES, the Granular AMR Parsing + Evaluation Suite EMNLP 2023 + + +
+ We present the Granular AMR Parsing Evaluation Suite (GrAPES), a challenge +set for Abstract Meaning Representation (AMR) parsing with accompanying +evaluation metrics. AMR parsers now obtain high scores on the standard AMR +evaluation metric Smatch, close to or even above reported inter-annotator +agreement. But that does not mean that AMR parsing is solved; in fact, human +evaluation in previous work indicates that current parsers still quite +frequently make errors on node labels or graph structure that substantially +distort sentence meaning. Here, we provide an evaluation suite that tests AMR +parsers on a range of phenomena of practical, technical, and linguistic +interest. Our 36 categories range from seen and unseen labels, to structural +generalization, to coreference. GrAPES reveals in depth the abilities and +shortcomings of current AMR parsers. + +
+
+ comment: Accepted at EMNLP 2023. For the associated GitHub repository, see + https://github.com/jgroschwitz/GrAPES +
+
+
+
+
+ + ☆ DBCopilot: Scaling Natural Language Querying to Massive Databases + + +
+ Text-to-SQL simplifies database interactions by enabling non-experts to
+convert their natural language (NL) questions into Structured Query Language
+(SQL) queries. While recent advances in large language models (LLMs) have
+improved the zero-shot text-to-SQL paradigm, existing methods face scalability
+challenges when dealing with massive, dynamically changing databases. This
+paper introduces DBCopilot, a framework that addresses these challenges by
+employing a compact and flexible copilot model for routing across massive
+databases. Specifically, DBCopilot decouples the text-to-SQL process into
+schema routing and SQL generation, leveraging a lightweight
+sequence-to-sequence neural network-based router to formulate database
+connections and navigate natural language questions through databases and
+tables. The routed schemas and questions are then fed into LLMs for efficient
+SQL generation. Furthermore, DBCopilot also introduces a reverse
+schema-to-question generation paradigm, which can learn and adapt the router
+over massive databases automatically without requiring manual intervention.
+Experimental results demonstrate that DBCopilot is a scalable and effective
+solution for real-world text-to-SQL tasks, providing a significant advancement
+in handling large-scale schemas.
+ 
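+ At a high level, the decoupling described above can be sketched as a two-stage pipeline;
+`router` and `llm` are hypothetical callables standing in for the lightweight schema
+router and the SQL-generating LLM, not DBCopilot's actual interfaces.
+
+def text_to_sql(question, router, llm):
+    """Route the question to a schema, then ask an LLM to write SQL over that schema."""
+    database, tables = router(question)       # schema routing across massive databases
+    schema_text = f"database: {database}; tables: {', '.join(tables)}"
+    prompt = (f"Schema: {schema_text}\n"
+              f"Question: {question}\n"
+              "Write a SQL query that answers the question.")
+    return llm(prompt)                         # SQL generation over the routed schema
+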
+
+ comment: Code and data are available at https://github.com/tshu-w/DBCopilot +
+
+
+
+
+ + ☆ Think from Words(TFW): Initiating Human-Like Cognition in Large Language + Models Through Think from Words for Japanese Text-level Classification + + +
+ The proliferation of Large Language Models (LLMs) has spurred extensive +research into LLM-related Prompt investigations, such as Instruction Learning +(IL), In-context Learning (ICL), and Chain-of-Thought (CoT). These approaches +aim to improve LLMs' responses by enabling them to provide concise statements +or examples for deeper contemplation when addressing questions. However, +independent thinking by LLMs can introduce variability in their thought +processes, leading to potential inaccuracies. In response, our study seeks to +bridge the gap between LLM and human-like thinking processes, recognizing that +text comprehension begins with understanding individual words. To tackle this +challenge, we have expanded the CoT method to cater to a specific domain. Our +approach, known as "Think from Words" (TFW), initiates the comprehension +process at the word level and then extends it to encompass the entire text. We +also propose "TFW with Extra word-level information" (TFW Extra), augmenting +comprehension with additional word-level data. To assess our methods, we employ +text classification on six Japanese datasets comprising text-level and +word-level elements. Our findings not only validate the effectiveness of TFW +but also shed light on the impact of various word-level information types on +LLMs' text comprehension, offering insights into their potential to cause +misinterpretations and errors in the overall comprehension of the final text. + +
+
+
+
+
+ + ☆ Compressed Context Memory For Online Language Model Interaction + + +
+ This paper presents a novel context compression method for Transformer +language models in online scenarios such as ChatGPT, where the context +continually expands. As the context lengthens, the attention process requires +more memory and computational resources, which in turn reduces the throughput +of the language model. To this end, we propose a compressed context memory +system that continually compresses the growing context into a compact memory +space. The compression process simply involves integrating a lightweight +conditional LoRA into the language model's forward pass during inference. Based +on the compressed context memory, the language model can perform inference with +reduced memory and attention operations. Through evaluations on conversation, +personalization, and multi-task learning, we demonstrate that our approach +achieves the performance level of a full context model with $5\times$ smaller +context memory space. Codes are available at +https://github.com/snu-mllab/context-memory. + +
+
+
+
+
+ + ☆ A Text-to-Text Model for Multilingual Offensive Language Identification AACL 2023 + + +
+ The ubiquity of offensive content on social media is a growing cause for +concern among companies and government organizations. Recently, +transformer-based models such as BERT, XLNET, and XLM-R have achieved +state-of-the-art performance in detecting various forms of offensive content +(e.g. hate speech, cyberbullying, and cyberaggression). However, the majority +of these models are limited in their capabilities due to their encoder-only +architecture, which restricts the number and types of labels in downstream +tasks. Addressing these limitations, this study presents the first pre-trained +model with encoder-decoder architecture for offensive language identification +with text-to-text transformers (T5) trained on two large offensive language +identification datasets; SOLID and CCTK. We investigate the effectiveness of +combining two datasets and selecting an optimal threshold in semi-supervised +instances in SOLID in the T5 retraining step. Our pre-trained T5 model +outperforms other transformer-based models fine-tuned for offensive language +detection, such as fBERT and HateBERT, in multiple English benchmarks. +Following a similar approach, we also train the first multilingual pre-trained +model for offensive language identification using mT5 and evaluate its +performance on a set of six different languages (German, Hindi, Korean, +Marathi, Sinhala, and Spanish). The results demonstrate that this multilingual +model achieves a new state-of-the-art on all the above datasets, showing its +usefulness in multilingual scenarios. Our proposed T5-based models will be made +freely available to the community. + +
+
+ comment: Accepted to Findings of IJCNLP-AACL 2023 +
+
+
+
+
+ + ☆ Lazy-k: Decoding for Constrained Token Classification EMNLP + + +
+ We explore the possibility of improving probabilistic models in structured +prediction. Specifically, we combine the models with constrained decoding +approaches in the context of token classification for information extraction. +The decoding methods search for constraint-satisfying label-assignments while +maximizing the total probability. To do this, we evaluate several existing +approaches, as well as propose a novel decoding method called Lazy-$k$. Our +findings demonstrate that constrained decoding approaches can significantly +improve the models' performances, especially when using smaller models. The +Lazy-$k$ approach allows for more flexibility between decoding time and +accuracy. The code for using Lazy-$k$ decoding can be found here: +https://github.com/ArthurDevNL/lazyk. + +
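+ The setting can be illustrated with a generic best-first search that enumerates label
+assignments in decreasing total log-probability and returns the first one satisfying the
+constraint; this illustrates constrained decoding in general and is not the Lazy-k
+algorithm itself.
+
+import heapq
+import numpy as np
+
+def constrained_decode(log_probs, satisfies):
+    """log_probs: (num_tokens, num_labels); satisfies: predicate on a full assignment."""
+    num_tokens, num_labels = log_probs.shape
+    order = np.argsort(-log_probs, axis=1)              # labels per token, best first
+    start = tuple([0] * num_tokens)                     # ranks: 0 = each token's best label
+    score = float(log_probs[np.arange(num_tokens), order[:, 0]].sum())
+    heap, seen = [(-score, start)], {start}
+    while heap:
+        neg_score, ranks = heapq.heappop(heap)
+        assignment = [int(order[i, r]) for i, r in enumerate(ranks)]
+        if satisfies(assignment):
+            return assignment
+        for i in range(num_tokens):                     # demote one token to its next-best label
+            if ranks[i] + 1 < num_labels:
+                child = list(ranks); child[i] += 1; child = tuple(child)
+                if child not in seen:
+                    seen.add(child)
+                    delta = log_probs[i, order[i, child[i]]] - log_probs[i, order[i, ranks[i]]]
+                    heapq.heappush(heap, (neg_score - float(delta), child))
+    return None                                         # no constraint-satisfying assignment
+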
+
+ comment: Accepted EMNLP Main 2023 +
+
+
+
+
+ + ☆ KhabarChin: Automatic Detection of Important News in the Persian + Language + + +
+ Being aware of important news is crucial for staying informed and making
+well-informed decisions efficiently. Natural Language Processing (NLP)
+approaches can significantly automate this process. This paper introduces the
+detection of important news as a previously unexplored task and presents a new
+benchmarking dataset (Khabarchin) for detecting important news in the Persian
+language. We define important news articles as those deemed significant for a
+considerable portion of society, capable of influencing their mindset or
+decision-making. The news articles are obtained from seven different prominent
+Persian news agencies, resulting in the annotation of 7,869 samples and the
+creation of the dataset. We faced two challenges, high annotator disagreement
+and class imbalance, and provide solutions for both. We also propose several
+learning-based models, ranging from conventional machine learning to
+state-of-the-art transformer models, to tackle this task. Furthermore, we
+introduce the second task of important sentence detection in news articles, as
+they often come with a significant contextual length that makes it challenging
+for readers to identify important information. We identify these sentences in a
+weakly supervised manner.
+ 
+
+ comment: 8 pages, 2 figures +
+
+
+
+
+ + ☆ Teaching Specific Scientific Knowledge into Large Language Models + through Additional Training + + +
+ Through additional training, we explore embedding specialized scientific
+knowledge into the Llama 2 Large Language Model (LLM). Key findings reveal that
+effective knowledge integration requires reading texts from multiple
+perspectives, especially in instructional formats. We utilize text augmentation
+to tackle the scarcity of specialized texts, including style conversions and
+translations. Hyperparameter optimization proves crucial, with models of
+different sizes (7b, 13b, and 70b) undergoing additional training reasonably
+well. To validate our methods, we construct a dataset of 65,000 scientific
+papers. Although we have succeeded in partially embedding knowledge, the study
+highlights the complexities and limitations of incorporating specialized
+information into LLMs, suggesting areas for further improvement.
+ 
+
+
+
+
+ + ☆ Topic and genre in dialogue + + +
+ In this paper we argue that topic plays a fundamental role in conversations, +and that the concept is needed in addition to that of genre to define +interactions. In particular, the concepts of genre and topic need to be +separated and orthogonally defined. This would enable modular, reliable and +controllable flexible-domain dialogue systems. + +
+
+
+
+
+ + ☆ Measuring Misogyny in Natural Language Generation: Preliminary Results + from a Case Study on two Reddit Communities EMNLP 2023 + + +
+ Generic `toxicity' classifiers continue to be used for evaluating the +potential for harm in natural language generation, despite mounting evidence of +their shortcomings. We consider the challenge of measuring misogyny in natural +language generation, and argue that generic `toxicity' classifiers are +inadequate for this task. We use data from two well-characterised `Incel' +communities on Reddit that differ primarily in their degrees of misogyny to +construct a pair of training corpora which we use to fine-tune two language +models. We show that an open source `toxicity' classifier is unable to +distinguish meaningfully between generations from these models. We contrast +this with a misogyny-specific lexicon recently proposed by feminist +subject-matter experts, demonstrating that, despite the limitations of simple +lexicon-based approaches, this shows promise as a benchmark to evaluate +language models for misogyny, and that it is sensitive enough to reveal the +known differences in these Reddit communities. Our preliminary findings +highlight the limitations of a generic approach to evaluating harms, and +further emphasise the need for careful benchmark design and selection in +natural language evaluation. + +
+
+ comment: This extended abstract was presented at the Generation, Evaluation + and Metrics workshop at Empirical Methods in Natural Language Processing in + 2023 (GEM@EMNLP 2023) in Singapore +
+
+
+
+
+ + ☆ Optimizing Two-Pass Cross-Lingual Transfer Learning: Phoneme Recognition + and Phoneme to Grapheme Translation + + +
+ This research optimizes two-pass cross-lingual transfer learning in +low-resource languages by enhancing phoneme recognition and phoneme-to-grapheme +translation models. Our approach optimizes these two stages to improve speech +recognition across languages. We optimize phoneme vocabulary coverage by +merging phonemes based on shared articulatory characteristics, thus improving +recognition accuracy. Additionally, we introduce a global phoneme noise +generator for realistic ASR noise during phoneme-to-grapheme training to reduce +error propagation. Experiments on the CommonVoice 12.0 dataset show significant +reductions in Word Error Rate (WER) for low-resource languages, highlighting +the effectiveness of our approach. This research contributes to the +advancements of two-pass ASR systems in low-resource languages, offering the +potential for improved cross-lingual transfer learning. + +
+
+ comment: 8 pages, ASRU 2023 Accepted +
+
+
+
+
+ + ☆ Dyport: Dynamic Importance-based Hypothesis Generation Benchmarking + Technique + + +
+ This paper presents Dyport, a novel benchmarking framework for evaluating biomedical hypothesis generation systems. Utilizing curated datasets, our approach tests these systems under realistic conditions, enhancing the relevance of our evaluations. We integrate knowledge from the curated databases into a dynamic graph, accompanied by a method to quantify discovery importance. This not only assesses the accuracy of hypotheses but also their potential impact on biomedical research, which significantly extends traditional link prediction benchmarks. The applicability of our benchmarking process is demonstrated on several link prediction systems applied to biomedical semantic knowledge graphs. Being flexible, our benchmarking system is designed for broad application in hypothesis generation quality verification, aiming to expand the scope of scientific discovery within the biomedical research community. Availability and implementation: the Dyport framework is fully open-source. All code and datasets are available at: https://github.com/IlyaTyagin/Dyport
+
+
+
+
+ + ☆ Can language agents be alternatives to PPO? A Preliminary Empirical + Study On OpenAI Gym + + +
+ The formidable capacity for zero- or few-shot decision-making in language agents encourages us to pose a compelling question: Can language agents be alternatives to PPO agents in traditional sequential decision-making tasks? To investigate this, we first take environments collected in OpenAI Gym as our testbeds and ground them into textual environments, constructing the TextGym simulator. This allows for straightforward and efficient comparisons between PPO agents and language agents, given the widespread adoption of OpenAI Gym. To ensure fair and effective benchmarking, we introduce five levels of scenarios for precise control of domain knowledge and a unified RL-inspired framework for language agents. Additionally, we propose an innovative explore-exploit-guided language (EXE) agent to solve tasks within TextGym. Through numerical experiments and ablation studies, we extract valuable insights into the decision-making capabilities of language agents and make a preliminary evaluation of their potential to be alternatives to PPO in classical sequential decision-making problems. This paper sheds light on the performance of language agents and paves the way for future research in this exciting domain. Our code is publicly available at~\url{https://github.com/mail-ecnu/Text-Gym-Agents}.
+
+
+
+
+ + ☆ Rethinking E-Commerce Search + + +
+ E-commerce search and recommendation usually operate on structured data such as product catalogs and taxonomies. However, creating better search and recommendation systems often requires a large variety of unstructured data, including customer reviews and articles on the web. Traditionally, the solution has always been to convert unstructured data into structured data through information extraction and to conduct search over the structured data. However, this approach is costly and often yields low-quality results. In this paper, we envision a solution that does entirely the opposite. Instead of converting unstructured data (web pages, customer reviews, etc.) into structured data, we convert structured data (product inventory, catalogs, taxonomies, etc.) into textual data, which can be easily integrated into the text corpus that trains LLMs. Then, search and recommendation can be performed through a Q/A mechanism with an LLM instead of traditional information retrieval methods over structured data.
+
+
+
+
+ + ☆ Detecting Rumor Veracity with Only Textual Information by Double-Channel + Structure + + +
+ Kyle (1985) proposes two types of rumors: informed rumors, which are based on some private information, and uninformed rumors, which are not based on any information (i.e. bluffing). Also, prior studies find that when people have a credible source of information, they are likely to use a more confident textual tone in their spreading of rumors. Motivated by these theoretical findings, we propose a double-channel structure to determine the ex-ante veracity of rumors on social media. Our ultimate goal is to classify each rumor into the true, false, or unverifiable category. We first assign each text to either the certain (informed rumor) or the uncertain (uninformed rumor) category. Then, we apply a lie detection algorithm to informed rumors and a thread-reply agreement detection algorithm to uninformed rumors. Using the dataset of SemEval 2019 Task 7, which requires ex-ante threefold classification (true, false, or unverifiable) of social media rumors, our model yields a macro-F1 score of 0.4027, outperforming all the baseline models and the second-place winner (Gorrell et al., 2019). Furthermore, we empirically validate that the double-channel structure outperforms single-channel structures, which apply either the lie detection or the agreement detection algorithm to all posts.
+
+
+
+
+ + ☆ Corporate Bankruptcy Prediction with Domain-Adapted BERT + + +
+ This study applies BERT, a representative contextualized language model, to corporate disclosure data to predict impending bankruptcies. Prior literature on bankruptcy prediction mainly focuses on developing more sophisticated prediction methodologies with financial variables. In our study, however, we focus on improving the quality of the input dataset. Specifically, we employ the BERT model to perform sentiment analysis on MD&A disclosures. We show that BERT outperforms dictionary-based predictions and Word2Vec-based predictions in terms of adjusted R-square in logistic regression, k-nearest neighbor (kNN-5), and linear kernel support vector machine (SVM). Further, instead of pre-training the BERT model from scratch, we apply self-learning with confidence-based filtering to corporate disclosure data (10-K). We achieve an accuracy of 91.56% and demonstrate that the domain adaptation procedure brings a significant improvement in prediction accuracy.
+
+
+
+
+ + ☆ Collaboration or Corporate Capture? Quantifying NLP's Reliance on + Industry Artifacts and Contributions + + +
+ The advent of transformers, higher computational budgets, and big data has engendered remarkable progress in Natural Language Processing (NLP). The impressive performance of industry pre-trained models has garnered public attention in recent years and made news headlines. That these are industry models is noteworthy. Rarely, if ever, are academic institutes producing exciting new NLP models. Using these models is critical for competing on NLP benchmarks and, correspondingly, for staying relevant in NLP research. We surveyed 100 papers published at EMNLP 2022 to determine whether this phenomenon constitutes a reliance on industry for NLP publications.
+ We find that there is indeed a substantial reliance. Citations of industry artifacts and contributions across categories are at least three times greater than industry publication rates per year. Quantifying this reliance does not settle how we ought to interpret the results. We discuss two possible perspectives in our discussion: 1) Is collaboration with industry still collaboration in the absence of an alternative? Or 2) has free NLP inquiry been captured by the motivations and research direction of private corporations?
+
+
+
+
+ + ☆ A Pseudo-Semantic Loss for Autoregressive Models with Logical + Constraints + + +
+ Neuro-symbolic AI bridges the gap between purely symbolic and neural +approaches to learning. This often requires maximizing the likelihood of a +symbolic constraint w.r.t the neural network's output distribution. Such output +distributions are typically assumed to be fully-factorized. This limits the +applicability of neuro-symbolic learning to the more expressive autoregressive +distributions, e.g., transformers. Under such distributions, computing the +likelihood of even simple constraints is #P-hard. Instead of attempting to +enforce the constraint on the entire output distribution, we propose to do so +on a random, local approximation thereof. More precisely, we optimize the +likelihood of the constraint under a pseudolikelihood-based approximation +centered around a model sample. Our approximation is factorized, allowing the +reuse of solutions to sub-problems, a main tenet for efficiently computing +neuro-symbolic losses. Moreover, it is a local, high-fidelity approximation of +the likelihood, exhibiting low entropy and KL-divergence around the model +sample. We evaluate our approach on Sudoku and shortest-path prediction cast as +autoregressive generation, and observe that we greatly improve upon the base +model's ability to predict logically-consistent outputs. We also evaluate on +the task of detoxifying large language models. Using a simple constraint +disallowing a list of toxic words, we are able to steer the model's outputs +away from toxic generations, achieving SoTA detoxification compared to previous +approaches. + +
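To make the idea above concrete, here is a minimal sketch (our own illustration, not the authors' implementation) of a pseudolikelihood-style penalty for one simple constraint, forbidding a list of toxic tokens: the probability that the constraint holds is approximated as a product of per-position probabilities computed around a model sample, which keeps the loss factorized and differentiable.

import torch
import torch.nn.functional as F

def factorized_constraint_loss(logits, banned_token_ids):
    """Hedged sketch: approximate -log P(no banned token appears) under a
    factorized, per-position view of the output distribution evaluated
    around a sampled sequence (logits collected while sampling).

    logits: (seq_len, vocab_size) tensor of per-step logits.
    banned_token_ids: list of vocabulary ids the constraint forbids.
    """
    log_probs = F.log_softmax(logits, dim=-1)                    # (T, V)
    p_banned = log_probs[:, banned_token_ids].exp().sum(dim=-1)  # (T,)
    p_banned = p_banned.clamp(max=1.0 - 1e-6)
    # The constraint holds iff every position avoids the banned tokens.
    log_p_ok = torch.log1p(-p_banned).sum()
    return -log_p_ok

# Toy usage with random logits standing in for a sampled continuation.
logits = torch.randn(12, 32_000, requires_grad=True)
loss = factorized_constraint_loss(logits, banned_token_ids=[7, 42, 911])
loss.backward()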
+
+
+
+
+ + ☆ Revisiting the Optimality of Word Lengths EMNLP 2023 + + +
+ Zipf (1935) posited that wordforms are optimized to minimize utterances' +communicative costs. Under the assumption that cost is given by an utterance's +length, he supported this claim by showing that words' lengths are inversely +correlated with their frequencies. Communicative cost, however, can be +operationalized in different ways. Piantadosi et al. (2011) claim that cost +should be measured as the distance between an utterance's information rate and +channel capacity, which we dub the channel capacity hypothesis (CCH) here. +Following this logic, they then proposed that a word's length should be +proportional to the expected value of its surprisal (negative log-probability +in context). In this work, we show that Piantadosi et al.'s derivation does not +minimize CCH's cost, but rather a lower bound, which we term CCH-lower. We +propose a novel derivation, suggesting an improved way to minimize CCH's cost. +Under this method, we find that a language's word lengths should instead be +proportional to the surprisal's expectation plus its variance-to-mean ratio. +Experimentally, we compare these three communicative cost functions: Zipf's, +CCH-lower , and CCH. Across 13 languages and several experimental settings, we +find that length is better predicted by frequency than either of the other +hypotheses. In fact, when surprisal's expectation, or expectation plus +variance-to-mean ratio, is estimated using better language models, it leads to +worse word length predictions. We take these results as evidence that Zipf's +longstanding hypothesis holds. + +
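In symbols (our paraphrase of the abstract, with $s(w)$ the surprisal of word $w$ in context), the three cost hypotheses predict word length roughly as follows:

$\mathrm{len}(w) \propto -\log p(w)$ (Zipf: frequency-based cost),
$\mathrm{len}(w) \propto \mathbb{E}[s(w)]$ (CCH-lower: expected surprisal),
$\mathrm{len}(w) \propto \mathbb{E}[s(w)] + \mathrm{Var}[s(w)]/\mathbb{E}[s(w)]$ (CCH: expectation plus variance-to-mean ratio),

where $s(w) = -\log p(w \mid \mathrm{context})$.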
+
+ comment: Published at EMNLP 2023 +
+
+
+
+
+ + ☆ The BigCode Project Governance Card + + +
+ This document serves as an overview of the different mechanisms and areas of +governance in the BigCode project. It aims to support transparency by providing +relevant information about choices that were made during the project to the +broader public, and to serve as an example of intentional governance of an open +research project that future endeavors can leverage to shape their own +approach. The first section, Project Structure, covers the project +organization, its stated goals and values, its internal decision processes, and +its funding and resources. The second section, Data and Model Governance, +covers decisions relating to the questions of data subject consent, privacy, +and model release. + +
+
+ comment: 12 pages, related papers arXiv:2305.06161 and arXiv:2301.03988 and + arXiv:2211.15533v1, learn more at https://www.bigcode-project.org/ +
+
+
+
+
+ + ☆ Efficient Large Language Models: A Survey + + +
+ Large Language Models (LLMs) have demonstrated remarkable capabilities in +important tasks such as natural language understanding, language generation, +and complex reasoning and have the potential to make a substantial impact on +our society. Such capabilities, however, come with the considerable resources +they demand, highlighting the strong need to develop effective techniques for +addressing their efficiency challenges. In this survey, we provide a systematic +and comprehensive review of efficient LLMs research. We organize the literature +in a taxonomy consisting of three main categories, covering distinct yet +interconnected efficient LLMs topics from model-centric, data-centric, and +framework-centric perspective, respectively. We have also created a GitHub +repository where we compile the papers featured in this survey at +https://github.com/AIoT-MLSys-Lab/EfficientLLMs, +https://github.com/AIoT-MLSys-Lab/Efficient-LLMs-Survey, and will actively +maintain this repository and incorporate new research as it emerges. We hope +our survey can serve as a valuable resource to help researchers and +practitioners gain a systematic understanding of the research developments in +efficient LLMs and inspire them to contribute to this important and exciting +field. + +
+
+
+
+
+ + ☆ Alpha-CLIP: A CLIP Model Focusing on Wherever You Want + + +
+ Contrastive Language-Image Pre-training (CLIP) plays an essential role in extracting valuable content information from images across diverse tasks. It aligns textual and visual modalities to comprehend the entire image, including all the details, even those irrelevant to specific tasks. However, for a finer understanding and controlled editing of images, it becomes crucial to focus on specific regions of interest, which can be indicated as points, masks, or boxes by humans or perception models. To fulfill these requirements, we introduce Alpha-CLIP, an enhanced version of CLIP with an auxiliary alpha channel to suggest attentive regions, fine-tuned on millions of constructed RGBA region-text pairs. Alpha-CLIP not only preserves the visual recognition ability of CLIP but also enables precise control over the emphasis of image contents. It demonstrates effectiveness in various tasks, including but not limited to open-world recognition, multimodal large language models, and conditional 2D / 3D generation. It has a strong potential to serve as a versatile tool for image-related tasks.
+
+ comment: project page: https://aleafy.github.io/alpha-clip; code: + https://github.com/SunzeY/AlphaCLIP +
+
+
+
+
+ + ☆ LLM as OS (llmao), Agents as Apps: Envisioning AIOS, Agents and the + AIOS-Agent Ecosystem + + +
+ This paper envisions a revolutionary AIOS-Agent ecosystem, where the Large Language Model (LLM) serves as the (Artificial) Intelligent Operating System (IOS, or AIOS)--an operating system ``with soul''. Upon this foundation, a diverse range of LLM-based AI Agent Applications (Agents, or AAPs) are developed, enriching the AIOS-Agent ecosystem and signaling a paradigm shift from the traditional OS-APP ecosystem. We envision that LLM's impact will not be limited to the AI application level; instead, it will in turn revolutionize the design and implementation of computer systems, architecture, software, and programming languages, characterized by several main concepts: LLM as OS (system-level), Agents as Applications (application-level), Natural Language as Programming Interface (user-level), and Tools as Devices/Libraries (hardware/middleware-level).
+
+ comment: 35 pages, 4 figures +
+
+
+
+
+ + ☆ Improving Activation Steering in Language Models with Mean-Centring + + +
+ Recent work in activation steering has demonstrated the potential to better +control the outputs of Large Language Models (LLMs), but it involves finding +steering vectors. This is difficult because engineers do not typically know how +features are represented in these models. We seek to address this issue by +applying the idea of mean-centring to steering vectors. We find that taking the +average of activations associated with a target dataset, and then subtracting +the mean of all training activations, results in effective steering vectors. We +test this method on a variety of models on natural language tasks by steering +away from generating toxic text, and steering the completion of a story towards +a target genre. We also apply mean-centring to extract function vectors, more +effectively triggering the execution of a range of natural language tasks by a +significant margin (compared to previous baselines). This suggests that +mean-centring can be used to easily improve the effectiveness of activation +steering in a wide range of contexts. + +
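The computation described above is simple enough to sketch directly; the following is a minimal illustration (array shapes, the scaling factor alpha, and the injection point are our assumptions, not the authors' exact setup):

import numpy as np

def mean_centred_steering_vector(target_acts, train_acts):
    """target_acts: (n_target, d) activations collected on the target dataset.
    train_acts:  (n_train, d) activations collected on generic training text.
    Returns a (d,) steering vector: target mean minus overall training mean."""
    return target_acts.mean(axis=0) - train_acts.mean(axis=0)

def apply_steering(hidden_states, steering_vector, alpha=4.0):
    """Add the (scaled) steering vector to a layer's hidden states."""
    return hidden_states + alpha * steering_vector

# Toy usage with random activations standing in for a real model's.
rng = np.random.default_rng(0)
vec = mean_centred_steering_vector(rng.normal(size=(200, 768)),
                                   rng.normal(size=(5000, 768)))
steered = apply_steering(rng.normal(size=(16, 768)), vec)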
+
+
+
+
+ + ☆ Comparative Analysis of Multilingual Text Classification & + Identification through Deep Learning and Embedding Visualization + + +
+ This research conducts a comparative study on multilingual text +classification methods, utilizing deep learning and embedding visualization. +The study employs LangDetect, LangId, FastText, and Sentence Transformer on a +dataset encompassing 17 languages. It explores dimensionality's impact on +clustering, revealing FastText's clearer clustering in 2D visualization due to +its extensive multilingual corpus training. Notably, the FastText multi-layer +perceptron model achieved remarkable accuracy, precision, recall, and F1 score, +outperforming the Sentence Transformer model. The study underscores the +effectiveness of these techniques in multilingual text classification, +emphasizing the importance of large multilingual corpora for training +embeddings. It lays the groundwork for future research and assists +practitioners in developing language detection and classification systems. +Additionally, it includes the comparison of multi-layer perceptron, LSTM, and +Convolution models for classification. + +
+
+ comment: 9 Pages, 10 Figures, 1 Table +
+
+
+
+
+ + ☆ SmoothQuant+: Accurate and Efficient 4-bit Post-Training + WeightQuantization for LLM + + +
+ Large language models (LLMs) have shown remarkable capabilities in various tasks. However, their huge model size and the consequent demand for computational and memory resources also pose challenges to model deployment. Currently, 4-bit post-training quantization (PTQ) has achieved some success in LLMs, reducing the memory footprint by approximately 75% compared to FP16 models, albeit with some accuracy loss. In this paper, we propose SmoothQuant+, an accurate and efficient 4-bit weight-only PTQ that requires no additional training and enables lossless accuracy for LLMs for the first time. Based on the fact that the loss of weight quantization is amplified by activation outliers, SmoothQuant+ smooths the activation outliers by channel before quantization, while adjusting the corresponding weights for mathematical equivalence, and then performs group-wise 4-bit weight quantization for linear layers. We have integrated SmoothQuant+ into the vLLM framework, an advanced high-throughput inference engine specially developed for LLMs, and equipped it with efficient W4A16 CUDA kernels, so that vLLM can seamlessly support SmoothQuant+ 4-bit weight quantization. Our results show that, with SmoothQuant+, the Code Llama-34B model can be quantized and deployed on an A100 40GB GPU, achieving lossless accuracy and a throughput increase of 1.9 to 4.0 times compared to the FP16 model deployed on two A100 40GB GPUs. Moreover, the latency per token is only 68% of the FP16 model deployed on two A100 40GB GPUs. To the best of our knowledge, this is the state-of-the-art 4-bit weight quantization for LLMs.
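A rough numpy sketch of the two steps the abstract describes, per-channel smoothing followed by group-wise 4-bit weight quantization; the smoothing-factor formula, alpha, and group size below are illustrative assumptions, not the paper's exact recipe:

import numpy as np

def smooth(W, act_absmax, alpha=0.5):
    """Per-channel smoothing: scale activations down by s and weights up by s
    so that (X / s) @ (W * s).T == X @ W.T, moving outliers into the weights.
    W: (out, in) linear weight; act_absmax: (in,) per-channel activation |max|."""
    s = act_absmax ** alpha / (np.abs(W).max(axis=0) ** (1.0 - alpha) + 1e-8)
    return W * s, s   # smoothed weight, per-channel divisor for activations

def quantize_w4_groupwise(W, group_size=128):
    """Symmetric group-wise 4-bit weight quantization (dequantized view)."""
    out_dim, in_dim = W.shape
    Wg = W.reshape(out_dim, in_dim // group_size, group_size)
    scale = np.abs(Wg).max(axis=-1, keepdims=True) / 7.0 + 1e-12
    q = np.clip(np.round(Wg / scale), -8, 7)
    return (q * scale).reshape(out_dim, in_dim)

W = np.random.randn(512, 512).astype(np.float32)
act_absmax = np.abs(np.random.randn(512)).astype(np.float32) + 1e-3
W_smoothed, act_divisor = smooth(W, act_absmax)
W_4bit = quantize_w4_groupwise(W_smoothed)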
+
+
+
+
+ + ♻ ☆ KPI Extraction from Maintenance Work Orders -- A Comparison of Expert + Labeling, Text Classification and AI-Assisted Tagging for Computing Failure + Rates of Wind Turbines + + +
+ Maintenance work orders are commonly used to document information about wind turbine operation and maintenance. This includes details about proactive and reactive wind turbine downtimes, such as preventative and corrective maintenance. However, the information contained in maintenance work orders is often unstructured and difficult to analyze, presenting challenges for decision-makers wishing to use it for optimizing operation and maintenance. To address this issue, this work compares three different approaches to calculating reliability key performance indicators from maintenance work orders. The first approach involves manual labeling of the maintenance work orders by domain experts, using the schema defined in an industrial guideline to assign the labels accordingly. The second approach involves the development of a model that automatically labels the maintenance work orders using text classification methods. Through this method, we are able to achieve macro average and weighted average F1-Scores of 0.75 and 0.85, respectively. The third technique uses an AI-assisted tagging tool to tag and structure the raw maintenance information, together with a novel rule-based approach for extracting relevant maintenance work orders for failure rate calculation. In our experiments, the AI-assisted tool leads to an 88% drop in tagging time in comparison to the other two approaches, while expert labeling and text classification are more accurate in KPI extraction. Overall, our findings make extracting maintenance information from maintenance work orders more efficient, enable the assessment of reliability key performance indicators, and therefore support the optimization of wind turbine operation and maintenance.
+
+
+
+
+ + ♻ ☆ Consensus, dissensus and synergy between clinicians and specialist + foundation models in radiology report generation + + +
+ Radiology reports are an instrumental part of modern medicine, informing key clinical decisions such as diagnosis and treatment. The worldwide shortage of radiologists, however, restricts access to expert care and imposes heavy workloads, contributing to avoidable errors and delays in report delivery. While recent progress in automated report generation with vision-language models offers clear potential for ameliorating the situation, the path to real-world adoption has been stymied by the challenge of evaluating the clinical quality of AI-generated reports. In this study, we build a state-of-the-art report generation system for chest radiographs, \textit{Flamingo-CXR}, by fine-tuning a well-known vision-language foundation model on radiology data. To evaluate the quality of the AI-generated reports, a group of 16 certified radiologists provide detailed evaluations of AI-generated and human-written reports for chest X-rays from an intensive care setting in the United States and an inpatient setting in India. At least one radiologist (out of two per case) preferred the AI report to the ground truth report in over 60$\%$ of cases for both datasets. Amongst the subset of AI-generated reports that contain errors, the most frequently cited reasons were related to the location and finding, whereas for human-written reports, most mistakes were related to severity and finding. This disparity suggested potential complementarity between our AI system and human experts, prompting us to develop an assistive scenario in which \textit{Flamingo-CXR} generates a first-draft report, which is subsequently revised by a clinician. This is the first demonstration of clinician-AI collaboration for report writing, and the resultant reports are assessed by at least one radiologist to be equivalent or preferable to reports written by experts alone in 80$\%$ of in-patient cases and 60$\%$ of intensive care cases.
+
+
+
+
+ + ♻ ☆ LLMLingua: Compressing Prompts for Accelerated Inference of Large + Language Models EMNLP 2023 + + +
+ Large language models (LLMs) have been applied in various applications due to +their astonishing capabilities. With advancements in technologies such as +chain-of-thought (CoT) prompting and in-context learning (ICL), the prompts fed +to LLMs are becoming increasingly lengthy, even exceeding tens of thousands of +tokens. To accelerate model inference and reduce cost, this paper presents +LLMLingua, a coarse-to-fine prompt compression method that involves a budget +controller to maintain semantic integrity under high compression ratios, a +token-level iterative compression algorithm to better model the interdependence +between compressed contents, and an instruction tuning based method for +distribution alignment between language models. We conduct experiments and +analysis over four datasets from different scenarios, i.e., GSM8K, BBH, +ShareGPT, and Arxiv-March23; showing that the proposed approach yields +state-of-the-art performance and allows for up to 20x compression with little +performance loss. Our code is available at https://aka.ms/LLMLingua. + +
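The abstract does not spell out the algorithm, but the general flavor of perplexity-driven prompt compression can be illustrated with a toy sketch that keeps only the most surprising tokens under a target budget; the scoring and budget policy here are simplifications of our own, not LLMLingua's actual budget controller or iterative compression:

def compress_prompt(tokens, token_logprobs, keep_ratio=0.5):
    """Toy coarse compression: rank tokens by surprisal (-log p) under a small
    scoring LM and keep the most surprising fraction, preserving order.
    tokens: list[str]; token_logprobs: list[float] from the scoring LM."""
    budget = max(1, int(len(tokens) * keep_ratio))
    # Lowest log-probability = highest surprisal = most informative to keep.
    keep = set(sorted(range(len(tokens)), key=lambda i: token_logprobs[i])[:budget])
    return " ".join(t for i, t in enumerate(tokens) if i in keep)

print(compress_prompt(["Translate", "the", "following", "sentence", "please"],
                      [-6.2, -0.4, -1.1, -3.5, -2.0], keep_ratio=0.6))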
+
+ comment: Accepted at EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ Conditions for Length Generalization in Learning Reasoning Skills + + +
+ Reasoning is a fundamental capability of AI agents. Recently, large language models (LLMs) have shown remarkable abilities to perform reasoning tasks. However, numerous evaluations of the reasoning capabilities of LLMs have also shown some limitations. An outstanding limitation is length generalization, meaning that when trained on reasoning problems of smaller lengths or sizes, the resulting models struggle with problems of larger sizes or lengths. This potentially indicates some theoretical limitations of generalization in learning reasoning skills. These evaluations and their observations motivated us to perform a theoretical study of the length generalization problem. This work focuses on reasoning tasks that can be formulated as Markov dynamic processes (MDPs) and/or directed acyclic graphs (DAGs). It identifies and proves conditions that decide whether the length generalization problem can be solved or not for a reasoning task in a particular representation. Experiments are also conducted to verify the theoretical results.
+
+
+
+
+ + ♻ ☆ TIGERScore: Towards Building Explainable Metric for All Text Generation + Tasks + + +
+ We present TIGERScore, a \textbf{T}rained metric that follows \textbf{I}nstruction \textbf{G}uidance to perform \textbf{E}xplainable and \textbf{R}eference-free evaluation over a wide spectrum of text generation tasks. Different from other automatic evaluation methods that only provide arcane scores, TIGERScore is guided by natural language instruction to provide error analysis to pinpoint the mistakes in the generated text. Our metric is based on LLaMA-2, trained on our meticulously curated instruction-tuning dataset MetricInstruct, which covers 6 text generation tasks and 23 text generation datasets. The dataset consists of 42K quadruples in the form of (instruction, input, system output $\rightarrow$ error analysis). We collected the `system outputs' from a large variety of models to cover different types of errors. To quantitatively assess our metric, we evaluate its correlation with human ratings on 5 held-in datasets and 2 held-out datasets, and show that TIGERScore can achieve the open-source SoTA correlation with human ratings across these datasets, almost approaching the GPT-4 evaluator. As a reference-free metric, its correlation can even surpass the best existing reference-based metrics. To further qualitatively assess the rationale generated by our metric, we conduct a human evaluation on the generated explanations and find that the explanations are 70.8\% accurate. Through these experimental results, we believe TIGERScore demonstrates the possibility of building universal explainable metrics to evaluate any text generation task.
+
+
+
+
+ + ♻ ☆ Entailment Semantics Can Be Extracted from an Ideal Language Model + + +
+ Language models are often trained on text alone, without additional +grounding. There is debate as to how much of natural language semantics can be +inferred from such a procedure. We prove that entailment judgments between +sentences can be extracted from an ideal language model that has perfectly +learned its target distribution, assuming the training sentences are generated +by Gricean agents, i.e., agents who follow fundamental principles of +communication from the linguistic theory of pragmatics. We also show entailment +judgments can be decoded from the predictions of a language model trained on +such Gricean data. Our results reveal a pathway for understanding the semantic +information encoded in unlabeled linguistic data and a potential framework for +extracting semantics from language models. + +
+
+ comment: Accepted at CONLL 2022. Updated Dec 4, 2023 with erratum +
+
+
+
+
+ + ♻ ☆ A Comprehensive Review of Visual-Textual Sentiment Analysis from Social + Media Networks + + +
+ Social media networks have become a significant aspect of people's lives, +serving as a platform for their ideas, opinions and emotions. Consequently, +automated sentiment analysis (SA) is critical for recognising people's feelings +in ways that other information sources cannot. The analysis of these feelings +revealed various applications, including brand evaluations, YouTube film +reviews and healthcare applications. As social media continues to develop, +people post a massive amount of information in different forms, including text, +photos, audio and video. Thus, traditional SA algorithms have become limited, +as they do not consider the expressiveness of other modalities. By including +such characteristics from various material sources, these multimodal data +streams provide new opportunities for optimising the expected results beyond +text-based SA. Our study focuses on the forefront field of multimodal SA, which +examines visual and textual data posted on social media networks. Many people +are more likely to utilise this information to express themselves on these +platforms. To serve as a resource for academics in this rapidly growing field, +we introduce a comprehensive overview of textual and visual SA, including data +pre-processing, feature extraction techniques, sentiment benchmark datasets, +and the efficacy of multiple classification methodologies suited to each field. +We also provide a brief introduction of the most frequently utilised data +fusion strategies and a summary of existing research on visual-textual SA. +Finally, we highlight the most significant challenges and investigate several +important sentiment applications. + +
+
+
+
+
+ + ♻ ☆ Is This the Subspace You Are Looking for? An Interpretability Illusion + for Subspace Activation Patching NeurIPS 2023 + + +
+ Mechanistic interpretability aims to understand model behaviors in terms of +specific, interpretable features, often hypothesized to manifest as +low-dimensional subspaces of activations. Specifically, recent studies have +explored subspace interventions (such as activation patching) as a way to +simultaneously manipulate model behavior and attribute the features behind it +to given subspaces. + In this work, we demonstrate that these two aims diverge, potentially leading +to an illusory sense of interpretability. Counterintuitively, even if a +subspace intervention makes the model's output behave as if the value of a +feature was changed, this effect may be achieved by activating a dormant +parallel pathway leveraging another subspace that is causally disconnected from +model outputs. We demonstrate this phenomenon in a distilled mathematical +example, in two real-world domains (the indirect object identification task and +factual recall), and present evidence for its prevalence in practice. In the +context of factual recall, we further show a link to rank-1 fact editing, +providing a mechanistic explanation for previous work observing an +inconsistency between fact editing performance and fact localization. + However, this does not imply that activation patching of subspaces is +intrinsically unfit for interpretability. To contextualize our findings, we +also show what a success case looks like in a task (indirect object +identification) where prior manual circuit analysis informs an understanding of +the location of a feature. We explore the additional evidence needed to argue +that a patched subspace is faithful. + +
+
+ comment: NeurIPS 2023 Workshop on Attributing Model Behavior at Scale +
+
+
+
+
+ + ♻ ☆ Error Detection for Text-to-SQL Semantic Parsing EMNLP 2023 + + +
+ Despite remarkable progress in text-to-SQL semantic parsing in recent years, +the performance of existing parsers is still far from perfect. Specifically, +modern text-to-SQL parsers based on deep learning are often over-confident, +thus casting doubt on their trustworthiness when deployed for real use. In this +paper, we propose a parser-independent error detection model for text-to-SQL +semantic parsing. Using a language model of code as its bedrock, we enhance our +error detection model with graph neural networks that learn structural features +of both natural language questions and SQL queries. We train our model on +realistic parsing errors collected from a cross-domain setting, which leads to +stronger generalization ability. Experiments with three strong text-to-SQL +parsers featuring different decoding mechanisms show that our approach +outperforms parser-dependent uncertainty metrics. Our model could also +effectively improve the performance and usability of text-to-SQL semantic +parsers regardless of their architectures. (Our implementation is available at +https://github.com/OSU-NLP-Group/Text2SQL-Error-Detection) + +
+
+ comment: EMNLP 2023 (Findings); Updated with new experiment results +
+
+
+
+
+ + ♻ ☆ Completeness, Recall, and Negation in Open-World Knowledge Bases: A + Survey + + +
+ General-purpose knowledge bases (KBs) are a cornerstone of knowledge-centric +AI. Many of them are constructed pragmatically from Web sources, and are thus +far from complete. This poses challenges for the consumption as well as the +curation of their content. While several surveys target the problem of +completing incomplete KBs, the first problem is arguably to know whether and +where the KB is incomplete in the first place, and to which degree. + In this survey we discuss how knowledge about completeness, recall, and +negation in KBs can be expressed, extracted, and inferred. We cover (i) the +logical foundations of knowledge representation and querying under partial +closed-world semantics; (ii) the estimation of this information via statistical +patterns; (iii) the extraction of information about recall from KBs and text; +(iv) the identification of interesting negative statements; and (v) relaxed +notions of relative recall. + This survey is targeted at two types of audiences: (1) practitioners who are +interested in tracking KB quality, focusing extraction efforts, and building +quality-aware downstream applications; and (2) data management, knowledge base +and semantic web researchers who wish to understand the state of the art of +knowledge bases beyond the open-world assumption. Consequently, our survey +presents both fundamental methodologies and their working, and gives +practice-oriented recommendations on how to choose between different approaches +for a problem at hand. + +
+
+ comment: 42 pages, 8 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Strahler Number of Natural Language Sentences in Comparison with Random + Trees + + +
+ The Strahler number was originally proposed to characterize the complexity of +river bifurcation and has found various applications. This article proposes +computation of the Strahler number's upper and lower limits for natural +language sentence tree structures. Through empirical measurements across +grammatically annotated data, the Strahler number of natural language sentences +is shown to be almost 3 or 4, similarly to the case of river bifurcation as +reported by Strahler (1957). From the theory behind the number, we show that it +is one kind of lower limit on the amount of memory required to process +sentences. We consider the Strahler number to provide reasoning that explains +reports showing that the number of required memory areas to process sentences +is 3 to 4 for parsing (Schuler et al., 2010), and reports indicating a +psychological "magical number" of 3 to 5 (Cowan, 2001). An analytical and +empirical analysis shows that the Strahler number is not constant but grows +logarithmically; therefore, the Strahler number of sentences derives from the +range of sentence lengths. Furthermore, the Strahler number is not different +for random trees, which could suggest that its origin is not specific to +natural language. + +
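For readers unfamiliar with the measure, the Strahler number of a tree can be computed with a short recursion; the list-of-lists tree encoding below is an assumption for illustration:

def strahler(children):
    """children: list of subtrees, each itself a list (a leaf is an empty list).
    A leaf has order 1; an internal node's order is the maximum child order,
    plus one if that maximum is attained by two or more children."""
    if not children:
        return 1
    orders = sorted((strahler(c) for c in children), reverse=True)
    if len(orders) >= 2 and orders[0] == orders[1]:
        return orders[0] + 1
    return orders[0]

# A balanced binary tree of depth 2, ((leaf, leaf), (leaf, leaf)), has Strahler number 3.
print(strahler([[[], []], [[], []]]))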
+
+ comment: 34 pages, 12 figures, 11 tables +
+
+
+
+
+ + ♻ ☆ Visual Data-Type Understanding does not emerge from Scaling + Vision-Language Models + + +
+ Recent advances in the development of vision-language models (VLMs) are +yielding remarkable success in recognizing visual semantic content, including +impressive instances of compositional image understanding. Here, we introduce +the novel task of Visual Data-Type Identification, a basic perceptual skill +with implications for data curation (e.g., noisy data-removal from large +datasets, domain-specific retrieval) and autonomous vision (e.g., +distinguishing changing weather conditions from camera lens staining). We +develop two datasets consisting of animal images altered across a diverse set +of 27 visual data-types, spanning four broad categories. An extensive zero-shot +evaluation of 39 VLMs, ranging from 100M to 80B parameters, shows a nuanced +performance landscape. While VLMs are reasonably good at identifying certain +stylistic \textit{data-types}, such as cartoons and sketches, they struggle +with simpler data-types arising from basic manipulations like image rotations +or additive noise. Our findings reveal that (i) model scaling alone yields +marginal gains for contrastively-trained models like CLIP, and (ii) there is a +pronounced drop in performance for the largest auto-regressively trained VLMs +like OpenFlamingo. This finding points to a blind spot in current frontier +VLMs: they excel in recognizing semantic content but fail to acquire an +understanding of visual data-types through scaling. By analyzing the +pre-training distributions of these models and incorporating data-type +information into the captions during fine-tuning, we achieve a significant +enhancement in performance. By exploring this previously uncharted task, we aim +to set the stage for further advancing VLMs to equip them with visual data-type +understanding. Code and datasets are released at +https://github.com/bethgelab/DataTypeIdentification. + +
+
+
+
+
+ + ♻ ☆ Assessing Language Disorders using Artificial Intelligence: a Paradigm + Shift + + +
+ Speech, language, and communication deficits are present in most +neurodegenerative syndromes. They enable the early detection, diagnosis, +treatment planning, and monitoring of neurocognitive disease progression as +part of traditional neurological assessment. Nevertheless, standard speech and +language evaluation is time-consuming and resource-intensive for clinicians. We +argue that using machine learning methodologies, natural language processing, +and modern artificial intelligence (AI) for Language Assessment is an +improvement over conventional manual assessment. Using these methodologies, +Computational Language Assessment (CLA) accomplishes three goals: (i) provides +a neuro-cognitive evaluation of speech, language, and communication in elderly +and high-risk individuals for dementia; (ii) facilitates the diagnosis, +prognosis, and therapy efficacy in at-risk and language-impaired populations; +and (iii) allows easier extensibility to assess patients from a wide range of +languages. By employing AI models, CLA may inform neurocognitive theory on the +relationship between language symptoms and their neural bases. Finally, it +signals a paradigm shift by significantly advancing our ability to optimize the +prevention and treatment of elderly individuals with communication disorders, +allowing them to age gracefully with social engagement. + +
+
+ comment: 36 pages, 2 figures, to be submitted
+
+
+
+
+ + ♻ ☆ Modeling Empathic Similarity in Personal Narratives EMNLP 2023 + + +
+ The most meaningful connections between people are often fostered through +expression of shared vulnerability and emotional experiences in personal +narratives. We introduce a new task of identifying similarity in personal +stories based on empathic resonance, i.e., the extent to which two people +empathize with each others' experiences, as opposed to raw semantic or lexical +similarity, as has predominantly been studied in NLP. Using insights from +social psychology, we craft a framework that operationalizes empathic +similarity in terms of three key features of stories: main events, emotional +trajectories, and overall morals or takeaways. We create EmpathicStories, a +dataset of 1,500 personal stories annotated with our empathic similarity +features, and 2,000 pairs of stories annotated with empathic similarity scores. +Using our dataset, we fine-tune a model to compute empathic similarity of story +pairs, and show that this outperforms semantic similarity models on automated +correlation and retrieval metrics. Through a user study with 150 participants, +we also assess the effect our model has on retrieving stories that users +empathize with, compared to naive semantic similarity-based retrieval, and find +that participants empathized significantly more with stories retrieved by our +model. Our work has strong implications for the use of empathy-aware models to +foster human connection and empathy between people. + +
+
+ comment: Published at EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ SwiftSage: A Generative Agent with Fast and Slow Thinking for Complex + Interactive Tasks NeurIPS 2023 + + +
+ We introduce SwiftSage, a novel agent framework inspired by the dual-process +theory of human cognition, designed to excel in action planning for complex +interactive reasoning tasks. SwiftSage integrates the strengths of behavior +cloning and prompting large language models (LLMs) to enhance task completion +performance. The framework comprises two primary modules: the Swift module, +representing fast and intuitive thinking, and the Sage module, emulating +deliberate thought processes. The Swift module is a small encoder-decoder LM +fine-tuned on the oracle agent's action trajectories, while the Sage module +employs LLMs such as GPT-4 for subgoal planning and grounding. We develop a +heuristic method to harmoniously integrate the two modules, resulting in a more +efficient and robust problem-solving process. In 30 tasks from the ScienceWorld +benchmark, SwiftSage significantly outperforms other methods such as SayCan, +ReAct, and Reflexion, demonstrating its effectiveness in solving complex +interactive tasks. + +
+
+ comment: Accepted to NeurIPS 2023 (spotlight). Project website: + https://swiftsage.github.io +
+
+
+
+
+ + ♻ ☆ Clickbait Detection via Large Language Models + + +
+ Clickbait, which aims to induce users to click with surprising and even thrilling headlines to increase click-through rates, permeates almost all online content publishers, such as news portals and social media. Recently, Large Language Models (LLMs) have emerged as a powerful instrument and achieved tremendous success in a series of NLP downstream tasks. However, it is not yet known whether LLMs can serve as a high-quality clickbait detection system. In this paper, we analyze the performance of LLMs in the few-shot and zero-shot scenarios on several English and Chinese benchmark datasets. Experimental results show that LLMs cannot achieve the best results compared to the state-of-the-art deep learning and fine-tuned PLM methods. Contrary to human intuition, the experiments also demonstrate that LLMs cannot perform satisfactory clickbait detection from the headlines alone.
+
+
+
+
+ + ♻ ☆ Inference-Time Policy Adapters (IPA): Tailoring Extreme-Scale LMs + without Fine-tuning EMNLP 2023 + + +
+ While extreme-scale language models have demonstrated exceptional performance +on a variety of language tasks, the degree of control over these language +models through pure prompting can often be limited. Directly fine-tuning such +language models can be effective for tailoring them, but it can be either +extremely costly (e.g., GPT-3) or not even feasible for the broader community +(e.g., GPT-4). + We propose Inference-time Policy Adapters (IPA), which efficiently tailors a +language model such as GPT-3 without fine-tuning it. IPA guides a large base +model during decoding time through a lightweight policy adapter trained to +optimize an arbitrary user objective with reinforcement learning. + On five challenging text generation tasks, such as toxicity reduction and +lexically constrained generation, IPA consistently brings significant +improvements over off-the-shelf language models. It outperforms competitive +baseline methods, sometimes even including expensive fine-tuning. In +particular, tailoring GPT-2 with IPA can outperform GPT-3, while tailoring +GPT-3 with IPA brings a major performance boost over GPT-3 (and sometimes even +over GPT-4). Our promising results highlight the potential of IPA as a +lightweight alternative to tailoring extreme-scale language models. + +
+
+ comment: EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ All the World's a (Hyper)Graph: A Data Drama + + +
+ We introduce Hyperbard, a dataset of diverse relational data representations +derived from Shakespeare's plays. Our representations range from simple graphs +capturing character co-occurrence in single scenes to hypergraphs encoding +complex communication settings and character contributions as hyperedges with +edge-specific node weights. By making multiple intuitive representations +readily available for experimentation, we facilitate rigorous representation +robustness checks in graph learning, graph mining, and network analysis, +highlighting the advantages and drawbacks of specific representations. +Leveraging the data released in Hyperbard, we demonstrate that many solutions +to popular graph mining problems are highly dependent on the representation +choice, thus calling current graph curation practices into question. As an +homage to our data source, and asserting that science can also be art, we +present all our points in the form of a play. + +
+
+ comment: This is the full version of our paper; an abridged version appears in + Digital Scholarship in the Humanities. Landing page for code and data: + https://hyperbard.net/ +
+
+
+
+
+ + ♻ ☆ BoschAI @ PLABA 2023: Leveraging Edit Operations in End-to-End Neural + Sentence Simplification + + +
+ Automatic simplification can help laypeople to comprehend complex scientific text. Language models are frequently applied to this task by translating from complex to simple language. In this paper, we describe our system based on Llama 2, which ranked first in the PLABA shared task addressing the simplification of biomedical text. We find that the large portion of shared tokens between input and output leads to weak training signals and models that edit only conservatively. To mitigate these issues, we propose sentence-level and token-level loss weights. They give higher weight to modified tokens, indicated by edit distance and edit operations, respectively. We conduct an empirical evaluation on the PLABA dataset and find that both approaches lead to simplifications closer to those created by human annotators (+1.8% / +3.5% SARI), simpler language (-1 / -1.1 FKGL) and more edits (1.6x / 1.8x edit distance) compared to the same model fine-tuned with standard cross entropy. We furthermore show that the hyperparameter $\lambda$ in token-level loss weights can be used to control the edit distance and the simplicity level (FKGL).
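A minimal sketch of the token-level variant described above: the cross-entropy of target tokens that were modified relative to the source receives an extra weight $\lambda$. The position-wise "modified" mask here is a simplification of the edit-operation alignment used in the paper, and the normalization is an assumption of ours:

import torch
import torch.nn.functional as F

def weighted_token_ce(logits, target_ids, modified_mask, lam=2.0):
    """logits: (T, V); target_ids: (T,); modified_mask: (T,) bool, True where
    the target token differs from the aligned complex source sentence.
    Modified tokens get weight lam, unchanged tokens weight 1."""
    ce = F.cross_entropy(logits, target_ids, reduction="none")   # (T,)
    weights = torch.where(modified_mask,
                          torch.full_like(ce, lam),
                          torch.ones_like(ce))
    return (weights * ce).sum() / weights.sum()

# Toy usage with random logits and a hand-made modification mask.
T, V = 6, 100
loss = weighted_token_ce(torch.randn(T, V, requires_grad=True),
                         torch.randint(0, V, (T,)),
                         torch.tensor([0, 1, 1, 0, 0, 1], dtype=torch.bool))
loss.backward()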
+
+
+
+
+ + ♻ ☆ BiasTestGPT: Using ChatGPT for Social Bias Testing of Language Models + + +
+ Pretrained Language Models (PLMs) harbor inherent social biases that can +result in harmful real-world implications. Such social biases are measured +through the probability values that PLMs output for different social groups and +attributes appearing in a set of test sentences. However, bias testing is +currently cumbersome since the test sentences are generated either from a +limited set of manual templates or need expensive crowd-sourcing. We instead +propose using ChatGPT for the controllable generation of test sentences, given +any arbitrary user-specified combination of social groups and attributes +appearing in the test sentences. When compared to template-based methods, our +approach using ChatGPT for test sentence generation is superior in detecting +social bias, especially in challenging settings such as intersectional biases. +We present an open-source comprehensive bias testing framework (BiasTestGPT), +hosted on HuggingFace, that can be plugged into any open-source PLM for bias +testing. User testing with domain experts from various fields has shown their +interest in being able to test modern AI for social biases. Our tool has +significantly improved their awareness of such biases in PLMs, proving to be +learnable and user-friendly. We thus enable seamless open-ended social bias +testing of PLMs by domain experts through an automatic large-scale generation +of diverse test sentences for any combination of social categories and +attributes. + +
+
+
+
+
+ + ♻ ☆ TPPoet: Transformer-Based Persian Poem Generation using Minimal Data and + Advanced Decoding Techniques + + +
+ Recent advances in language models (LMs) have demonstrated significant efficacy in tasks related to the arts and humanities. While LMs have exhibited exceptional performance across a wide range of natural language processing tasks, there are notable challenges associated with their utilization on small datasets and their ability to replicate more creative human capacities. In this study, we aim to address these challenges by training a Persian classical poetry generation model using a transformer architecture on a specialized dataset with no pretraining. Additionally, we propose a novel decoding method to enhance coherence and meaningfulness in the generated poetry, effectively managing the tradeoff between diversity and quality. Furthermore, the results of our training approach and the proposed decoding method are evaluated through a comprehensive set of automatic and human evaluations, which show its superior capability to generate coherent and meaningful poetry compared to other decoding methods and an existing Persian large language model (LLM).
+
+
+
+
+ + ♻ ☆ AlpacaFarm: A Simulation Framework for Methods that Learn from Human + Feedback + + +
+ Large language models (LLMs) such as ChatGPT have seen widespread adoption +due to their ability to follow user instructions well. Developing these LLMs +involves a complex yet poorly understood workflow requiring training with human +feedback. Replicating and understanding this instruction-following process +faces three major challenges: the high cost of data collection, the lack of +trustworthy evaluation, and the absence of reference method implementations. We +address these challenges with AlpacaFarm, a simulator that enables research and +development for learning from feedback at a low cost. First, we design LLM +prompts to simulate human feedback that are 45x cheaper than crowdworkers and +display high agreement with humans. Second, we propose an automatic evaluation +and validate it against human instructions obtained on real-world interactions. +Third, we contribute reference implementations for several methods (PPO, DPO, +best-of-n, expert iteration, and more) that learn from pairwise feedback. +Finally, as an end-to-end validation of AlpacaFarm, we train and evaluate +eleven models on 10k pairs of real human feedback and show that rankings of +models trained in AlpacaFarm match rankings of models trained on human data. As +a demonstration of the research possible in AlpacaFarm, we find that methods +that use a reward model can substantially improve over supervised fine-tuning +and that our reference PPO implementation leads to a +10% improvement in +win-rate against Davinci003. We release all components of AlpacaFarm at +https://github.com/tatsu-lab/alpaca_farm. + +
+
+
+
+
+ + ♻ ☆ In-Context Learning for Text Classification with Many Labels + + +
+ In-context learning (ICL) using large language models for tasks with many +labels is challenging due to the limited context window, which makes it +difficult to fit a sufficient number of examples in the prompt. In this paper, +we use a pre-trained dense retrieval model to bypass this limitation, giving +the model only a partial view of the full label space for each inference call. +Testing with recent open-source LLMs (OPT, LLaMA), we set new state of the art +performance in few-shot settings for three common intent classification +datasets, with no finetuning. We also surpass fine-tuned performance on +fine-grained sentiment classification in certain cases. We analyze the +performance across number of in-context examples and different model scales, +showing that larger models are necessary to effectively and consistently make +use of larger context lengths for ICL. By running several ablations, we analyze +the model's use of: a) the similarity of the in-context examples to the current +input, b) the semantic content of the class names, and c) the correct +correspondence between examples and labels. We demonstrate that all three are +needed to varying degrees depending on the domain, contrary to certain recent +works. + +
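The core trick described above, using a dense retriever to expose only a relevant slice of a large label space to the LLM, can be sketched roughly as follows; the embedding model name, prompt format, and helper names are placeholders of ours, not the paper's setup:

from sentence_transformers import SentenceTransformer, util

def build_icl_prompt(query, example_bank, k=10):
    """example_bank: list of (text, label) pairs spanning many intent labels.
    Retrieve the k nearest labeled examples to the query and show only
    their labels in the prompt, giving the LLM a partial view of the label space."""
    retriever = SentenceTransformer("all-MiniLM-L6-v2")   # placeholder retriever
    emb_query = retriever.encode(query, convert_to_tensor=True)
    emb_bank = retriever.encode([text for text, _ in example_bank],
                                convert_to_tensor=True)
    hits = util.semantic_search(emb_query, emb_bank, top_k=k)[0]
    demos = [example_bank[hit["corpus_id"]] for hit in hits]
    lines = [f"Text: {text}\nIntent: {label}" for text, label in demos]
    return "\n\n".join(lines) + f"\n\nText: {query}\nIntent:"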
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Let's Think Outside the Box: Exploring Leap-of-Thought in Large Language + Models with Creative Humor Generation + + +
+ Chain-of-Thought (CoT) guides large language models (LLMs) to reason +step-by-step, and can motivate their logical reasoning ability. While effective +for logical tasks, CoT is not conducive to creative problem-solving which often +requires out-of-box thoughts and is crucial for innovation advancements. In +this paper, we explore the Leap-of-Thought (LoT) abilities within LLMs -- a +non-sequential, creative paradigm involving strong associations and knowledge +leaps. To this end, we study LLMs on the popular Oogiri game which needs +participants to have good creativity and strong associative thinking for +responding unexpectedly and humorously to the given image, text, or both, and +thus is suitable for LoT study. Then to investigate LLMs' LoT ability in the +Oogiri game, we first build a multimodal and multilingual Oogiri-GO dataset +which contains over 130,000 samples from the Oogiri game, and observe the +insufficient LoT ability or failures of most existing LLMs on the Oogiri game. +Accordingly, we introduce a creative Leap-of-Thought (CLoT) paradigm to improve +LLM's LoT ability. CLoT first formulates the Oogiri-GO dataset into +LoT-oriented instruction tuning data to train pretrained LLM for achieving +certain LoT humor generation and discrimination abilities. Then CLoT designs an +explorative self-refinement that encourages the LLM to generate more creative +LoT data via exploring parallels between seemingly unrelated concepts and +selects high-quality data to train itself for self-refinement. CLoT not only +excels in humor generation in the Oogiri game but also boosts creative +abilities in various tasks like cloud guessing game and divergent association +task. These findings advance our understanding and offer a pathway to improve +LLMs' creative capacities for innovative applications across domains. The +dataset, code, and models will be released online. +https://zhongshsh.github.io/CLoT/. + +
+
+ comment: Technical report +
+
+
+
+
+ + ♻ ☆ TheoremQA: A Theorem-driven Question Answering dataset EMNLP 2023 + + +
+ The recent LLMs like GPT-4 and PaLM-2 have made tremendous progress in +solving fundamental math problems like GSM8K by achieving over 90% accuracy. +However, their capabilities to solve more challenging math problems which +require domain-specific knowledge (i.e. theorem) have yet to be investigated. +In this paper, we introduce TheoremQA, the first theorem-driven +question-answering dataset designed to evaluate AI models' capabilities to +apply theorems to solve challenging science problems. TheoremQA is curated by +domain experts containing 800 high-quality questions covering 350 theorems +(e.g. Taylor's theorem, Lagrange's theorem, Huffman coding, Quantum Theorem, +Elasticity Theorem, etc) from Math, Physics, EE&CS, and Finance. We evaluate a +wide spectrum of 16 large language and code models with different prompting +strategies like Chain-of-Thoughts and Program-of-Thoughts. We found that +GPT-4's capabilities to solve these problems are unparalleled, achieving an +accuracy of 51% with Program-of-Thoughts Prompting. All the existing +open-sourced models are below 15%, barely surpassing the random-guess baseline. +Given the diversity and broad coverage of TheoremQA, we believe it can be used +as a better benchmark to evaluate LLMs' capabilities to solve challenging +science problems. The data and code are released in +https://github.com/wenhuchen/TheoremQA. + +
+
+ comment: Accepted to Main Conference of EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ D-Bot: Database Diagnosis System using Large Language Models + + +
+ Database administrators (DBAs) play an important role in managing, maintaining, and optimizing database systems. However, it is hard and tedious for DBAs to manage a large number of databases and give timely responses (waiting for hours is intolerable in many online cases). In addition, existing empirical methods support only limited diagnosis scenarios, and updating their diagnosis rules for new database versions is labor-intensive. Recently, large language models (LLMs) have shown great potential in various fields. We therefore propose D-Bot, an LLM-based database diagnosis system that automatically acquires knowledge from diagnosis documents and generates reasonable, well-founded diagnosis reports (i.e., identifying root causes and solutions) within an acceptable time (e.g., under 10 minutes, compared with hours for a DBA). The techniques in D-Bot include (i) offline knowledge extraction from documents, (ii) automatic prompt generation (e.g., knowledge matching, tool retrieval), (iii) root cause analysis using a tree search algorithm, and (iv) a collaborative mechanism for complex anomalies with multiple root causes. We verify D-Bot on real benchmarks (including 539 anomalies across six typical applications), and the results show that D-Bot can effectively analyze the root causes of unseen anomalies and significantly outperforms traditional methods and vanilla models such as GPT-4.

+
+
+
+
+ + ♻ ☆ TraSE: Towards Tackling Authorial Style from a Cognitive Science + Perspective + + +
+ Stylistic analysis of text is a key task in research areas ranging from authorship attribution to forensic analysis and personality profiling. Existing approaches to stylistic analysis are plagued by issues such as topic influence, poor discriminability for large numbers of authors, and the requirement for large amounts of diverse data. In this paper, the sources of these issues are identified, along with the need for a cognitive perspective on authorial style to address them. A novel feature representation, called Trajectory-based Style Estimation (TraSE), is introduced to support this purpose. Authorship attribution experiments with over 27,000 authors and 1.4 million samples in a cross-domain scenario resulted in 90% attribution accuracy, suggesting that the feature representation is immune to such negative influences and is an excellent candidate for stylistic analysis. Finally, a qualitative analysis is performed on TraSE using physical human characteristics, such as age, to validate its claim of capturing cognitive traits.

+
+ comment: Experimental results in the paper are incorrectly reported due to an + unforeseen glitch in the software prototype. The paper and its findings are + withdrawn +
+
+
+
+
+ + ♻ ☆ HalOmi: A Manually Annotated Benchmark for Multilingual Hallucination + and Omission Detection in Machine Translation + + +
+ Hallucinations in machine translation are translations that contain +information completely unrelated to the input. Omissions are translations that +do not include some of the input information. While both cases tend to be +catastrophic errors undermining user trust, annotated data with these types of +pathologies is extremely scarce and is limited to a few high-resource +languages. In this work, we release an annotated dataset for the hallucination +and omission phenomena covering 18 translation directions with varying resource +levels and scripts. Our annotation covers different levels of partial and full +hallucinations as well as omissions both at the sentence and at the word level. +Additionally, we revisit previous methods for hallucination and omission +detection, show that conclusions made based on a single language pair largely +do not hold for a large-scale evaluation, and establish new solid baselines. + +
+
+
+
+
+ + ♻ ☆ Polyglot or Not? Measuring Multilingual Encyclopedic Knowledge in + Foundation Models EMNLP 2023 + + +
+ In this work, we assess the ability of foundation models to recall +encyclopedic knowledge across a wide range of linguistic contexts. To support +this, we: 1) produce a 20-language dataset that contains 303k factual +associations paired with counterfactuals, 2) evaluate 5 models in a +multilingual test, and 3) benchmark a diverse set of 24 models in an +English-only test. Meta's LLaMA achieves the highest scores in both +multilingual and English-only evaluations. Yet, an analysis of LLaMA's errors +reveals significant limitations in its ability to recall facts in languages +other than English, plus difficulties related to the location and gender of +fact subjects. Overall, our findings suggest that today's foundation models are +far from polyglots. + +
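One common way to score factual recall with fact/counterfactual pairs, in the spirit of the evaluation described above (the exact protocol is the paper's; this sketch is only an assumed approximation), is to compare the model's log-likelihood of the true completion against the counterfactual one. The model name and example below are placeholders.

# Sketch: prefer the completion with higher log-likelihood (assumed scoring rule).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")                     # stand-in model
model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

def completion_logprob(prompt: str, completion: str) -> float:
    ids = tok(prompt + completion, return_tensors="pt").input_ids
    prompt_len = tok(prompt, return_tensors="pt").input_ids.shape[1]
    with torch.no_grad():
        logits = model(ids).logits[:, :-1]
    targets = ids[:, 1:]
    logprobs = torch.log_softmax(logits, dim=-1).gather(-1, targets.unsqueeze(-1)).squeeze(-1)
    return logprobs[0, prompt_len - 1:].sum().item()            # only score the completion tokens

fact = completion_logprob("The capital of France is", " Paris")
counterfact = completion_logprob("The capital of France is", " Rome")
print("correct" if fact > counterfact else "incorrect")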
+
+ comment: EMNLP 2023 (Main) +
+
+
+
+
+ + ♻ ☆ Model-tuning Via Prompts Makes NLP Models Adversarially Robust EMNLP 2023 + + +
+ In recent years, NLP practitioners have converged on the following practice: (i) import an off-the-shelf pretrained (masked) language model; (ii) append a multilayer perceptron atop the CLS token's hidden representation (with randomly initialized weights); and (iii) fine-tune the entire model on a downstream task (MLP-FT). This procedure has produced massive gains on standard NLP benchmarks, but these models remain brittle, even to mild adversarial perturbations. In this work, we demonstrate surprising gains in adversarial robustness enjoyed by Model-tuning Via Prompts (MVP), an alternative method of adapting to downstream tasks. Rather than appending an MLP head to make output predictions, MVP appends a prompt template to the input and makes predictions via text infilling/completion. Across 5 NLP datasets, 4 adversarial attacks, and 3 different models, MVP improves performance against adversarial substitutions by an average of 8% over standard methods and even outperforms adversarial-training-based state-of-the-art defenses by 3.5%. By combining MVP with adversarial training, we achieve further improvements in adversarial robustness while maintaining performance on unperturbed examples. Finally, we conduct ablations to investigate the mechanism underlying these gains. Notably, we find that the main causes of vulnerability of MLP-FT can be attributed to the misalignment between pre-training and fine-tuning tasks, and to the randomly initialized MLP parameters.

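To make the contrast between an MLP head and prompt-based infilling concrete, the sketch below classifies sentiment by comparing the masked-LM scores of two verbalizer words. It is an assumed minimal illustration of the general idea, not the authors' MVP implementation; the template, verbalizers, and choice of roberta-base are placeholders.

# Sketch: classification by mask infilling instead of an MLP head (illustrative only).
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("roberta-base")
mlm = AutoModelForMaskedLM.from_pretrained("roberta-base").eval()

def predict_sentiment(text: str) -> str:
    # Append a prompt template with a mask token; the infilled word acts as the label.
    prompt = f"{text} It was {tok.mask_token}."
    inputs = tok(prompt, return_tensors="pt")
    mask_pos = (inputs.input_ids == tok.mask_token_id).nonzero()[0, 1]
    with torch.no_grad():
        logits = mlm(**inputs).logits[0, mask_pos]
    verbalizers = {"positive": " great", "negative": " terrible"}   # assumed label words
    scores = {label: logits[tok.encode(word, add_special_tokens=False)[0]].item()
              for label, word in verbalizers.items()}
    return max(scores, key=scores.get)

print(predict_sentiment("The movie had stunning visuals and a gripping plot."))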
+
+ comment: Accepted to the EMNLP 2023 Conference +
+
+
+
+
+ + ♻ ☆ DADA: Dialect Adaptation via Dynamic Aggregation of Linguistic Rules EMNLP 2023 + + +
+ Existing large language models (LLMs) that mainly focus on Standard American English (SAE) often perform significantly worse when applied to other English dialects. While existing mitigations tackle discrepancies for individual target dialects, they assume access to high-accuracy dialect identification systems. The boundaries between dialects are inherently flexible, making it difficult to categorize language into discrete predefined categories. In this paper, we propose DADA (Dialect Adaptation via Dynamic Aggregation), a modular approach to imbue SAE-trained models with multi-dialectal robustness by composing adapters that handle specific linguistic features. The compositional architecture of DADA allows for both targeted adaptation to specific dialect variants and simultaneous adaptation to various dialects. We show that DADA is effective for both single-task and instruction-finetuned language models, offering an extensible and interpretable framework for adapting existing LLMs to different English dialects.

+
+ comment: EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ From Pixels to UI Actions: Learning to Follow Instructions via Graphical + User Interfaces + + +
+ Much of the previous work towards digital agents for graphical user +interfaces (GUIs) has relied on text-based representations (derived from HTML +or other structured data sources), which are not always readily available. +These input representations have been often coupled with custom, task-specific +action spaces. This paper focuses on creating agents that interact with the +digital world using the same conceptual interface that humans commonly use -- +via pixel-based screenshots and a generic action space corresponding to +keyboard and mouse actions. Building upon recent progress in pixel-based +pretraining, we show, for the first time, that it is possible for such agents +to outperform human crowdworkers on the MiniWob++ benchmark of GUI-based +instruction following tasks. + +
+
+
+
+
+ + ♻ ☆ Towards Grounded Visual Spatial Reasoning in Multi-Modal Vision Language + Models + + +
+ With pre-training of vision-and-language models (VLMs) on large-scale +datasets of image-text pairs, several recent works showed that these +pre-trained models lack fine-grained understanding, such as the ability to +count and recognize verbs, attributes, or relationships. The focus of this work +is to study the ability of these models to understand spatial relations. +Previously, this has been tackled using image-text matching (e.g., Visual +Spatial Reasoning benchmark) or visual question answering (e.g., GQA or VQAv2), +both showing poor performance and a large gap compared to human performance. In +this work, we use explainability tools to understand the causes of poor +performance better and present an alternative fine-grained, compositional +approach for ranking spatial clauses. We combine the evidence from grounding +noun phrases corresponding to objects and their locations to compute the final +rank of the spatial clause. We demonstrate the approach on representative VLMs +(such as LXMERT, GPV, and MDETR) and compare and highlight their abilities to +reason about spatial relationships. + +
+
+
+
+
+ + ♻ ☆ Attention-Driven Multi-Modal Fusion: Enhancing Sign Language Recognition + and Translation + + +
+ In this paper, we devise a mechanism for adding multi-modal information to an existing pipeline for continuous sign language recognition and translation. In our procedure, we incorporate optical flow information alongside RGB images to enrich the features with movement-related information. This work studies the feasibility of such modality inclusion using a cross-modal encoder. The plugin we use is very lightweight and does not need a separate feature extractor for the new modality, operating in an end-to-end manner. We apply the changes to both sign language recognition and translation, improving the results in each case. We evaluate performance on the RWTH-PHOENIX-2014 dataset for sign language recognition and the RWTH-PHOENIX-2014T dataset for translation. On the recognition task, our approach reduces the WER by 0.9, and on the translation task, it increases most of the BLEU scores by about 0.6 on the test set.

+
+
+
+
+ + ♻ ☆ Unveiling the General Intelligence Factor in Language Models: A + Psychometric Approach + + +
+ This study uncovers the factor of general intelligence, or g, in language +models, extending the psychometric theory traditionally applied to humans and +certain animal species. Utilizing factor analysis on two extensive datasets - +Open LLM Leaderboard with 1,232 models and General Language Understanding +Evaluation (GLUE) Leaderboard with 88 models - we find compelling evidence for +a unidimensional, highly stable g factor that accounts for 85% of the variance +in model performance. The study also finds a moderate correlation of .49 +between model size and g. The discovery of g in language models offers a +unified metric for model evaluation and opens new avenues for more robust, +g-based model ability assessment. These findings lay the foundation for +understanding and future research on artificial general intelligence from a +psychometric perspective and have practical implications for model evaluation +and development. + +
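For intuition about how a single g factor can be extracted from a models-by-benchmarks score matrix, the snippet below runs a one-factor analysis on synthetic scores. This is generic factor analysis on made-up data, not the authors' procedure, leaderboards, or reported numbers.

# Sketch: one-factor solution over a (models x benchmarks) score matrix (synthetic data).
import numpy as np
from sklearn.decomposition import FactorAnalysis
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
g = rng.normal(size=(200, 1))                          # latent ability of 200 hypothetical models
loadings = rng.uniform(0.6, 0.9, size=(1, 6))          # 6 hypothetical benchmarks
scores = g @ loadings + 0.4 * rng.normal(size=(200, 6))

z = StandardScaler().fit_transform(scores)             # standardize each benchmark column
fa = FactorAnalysis(n_components=1).fit(z)
explained = (fa.components_.ravel() ** 2).mean()       # avg squared loading ~ share of variance explained
print("share of variance explained by the first factor:", round(explained, 3))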
+
+ comment: 10 pages (including appendix), 7 figures +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 126 + +
+
+
+ + ☆ Relightable Gaussian Codec Avatars + + +
+ The fidelity of relighting is bounded by both geometry and appearance +representations. For geometry, both mesh and volumetric approaches have +difficulty modeling intricate structures like 3D hair geometry. For appearance, +existing relighting models are limited in fidelity and often too slow to render +in real-time with high-resolution continuous environments. In this work, we +present Relightable Gaussian Codec Avatars, a method to build high-fidelity +relightable head avatars that can be animated to generate novel expressions. +Our geometry model based on 3D Gaussians can capture 3D-consistent +sub-millimeter details such as hair strands and pores on dynamic face +sequences. To support diverse materials of human heads such as the eyes, skin, +and hair in a unified manner, we present a novel relightable appearance model +based on learnable radiance transfer. Together with global illumination-aware +spherical harmonics for the diffuse components, we achieve real-time relighting +with spatially all-frequency reflections using spherical Gaussians. This +appearance model can be efficiently relit under both point light and continuous +illumination. We further improve the fidelity of eye reflections and enable +explicit gaze control by introducing relightable explicit eye models. Our +method outperforms existing approaches without compromising real-time +performance. We also demonstrate real-time relighting of avatars on a tethered +consumer VR headset, showcasing the efficiency and fidelity of our avatars. + +
+
+ comment: Website: https://shunsukesaito.github.io/rgca/ +
+
+
+
+
+ + ☆ Skeleton-in-Context: Unified Skeleton Sequence Modeling with In-Context + Learning + + +
+ In-context learning provides a new perspective for multi-task modeling for +vision and NLP. Under this setting, the model can perceive tasks from prompts +and accomplish them without any extra task-specific head predictions or model +fine-tuning. However, Skeleton sequence modeling via in-context learning +remains unexplored. Directly applying existing in-context models from other +areas onto skeleton sequences fails due to the inter-frame and cross-task pose +similarity that makes it outstandingly hard to perceive the task correctly from +a subtle context. To address this challenge, we propose Skeleton-in-Context +(SiC), an effective framework for in-context skeleton sequence modeling. Our +SiC is able to handle multiple skeleton-based tasks simultaneously after a +single training process and accomplish each task from context according to the +given prompt. It can further generalize to new, unseen tasks according to +customized prompts. To facilitate context perception, we additionally propose a +task-unified prompt, which adaptively learns tasks of different natures, such +as partial joint-level generation, sequence-level prediction, or 2D-to-3D +motion prediction. We conduct extensive experiments to evaluate the +effectiveness of our SiC on multiple tasks, including motion prediction, pose +estimation, joint completion, and future pose estimation. We also evaluate its +generalization capability on unseen tasks such as motion-in-between. These +experiments show that our model achieves state-of-the-art multi-task +performance and even outperforms single-task methods on certain tasks. + +
+
+ comment: Project page: https://github.com/fanglaosi/Skeleton-in-Context +
+
+
+
+
+ + ☆ Self-conditioned Image Generation via Generating Representations + + +
+ This paper presents $\textbf{R}$epresentation-$\textbf{C}$onditioned image +$\textbf{G}$eneration (RCG), a simple yet effective image generation framework +which sets a new benchmark in class-unconditional image generation. RCG does +not condition on any human annotations. Instead, it conditions on a +self-supervised representation distribution which is mapped from the image +distribution using a pre-trained encoder. During generation, RCG samples from +such representation distribution using a representation diffusion model (RDM), +and employs a pixel generator to craft image pixels conditioned on the sampled +representation. Such a design provides substantial guidance during the +generative process, resulting in high-quality image generation. Tested on +ImageNet 256$\times$256, RCG achieves a Frechet Inception Distance (FID) of +3.31 and an Inception Score (IS) of 253.4. These results not only significantly +improve the state-of-the-art of class-unconditional image generation but also +rival the current leading methods in class-conditional image generation, +bridging the long-standing performance gap between these two tasks. Code is +available at https://github.com/LTH14/rcg. + +
+
+
+
+
+ + ☆ OneLLM: One Framework to Align All Modalities with Language + + +
+ Multimodal large language models (MLLMs) have gained significant attention +due to their strong multimodal understanding capability. However, existing +works rely heavily on modality-specific encoders, which usually differ in +architecture and are limited to common modalities. In this paper, we present +OneLLM, an MLLM that aligns eight modalities to language using a unified +framework. We achieve this through a unified multimodal encoder and a +progressive multimodal alignment pipeline. In detail, we first train an image +projection module to connect a vision encoder with LLM. Then, we build a +universal projection module (UPM) by mixing multiple image projection modules +and dynamic routing. Finally, we progressively align more modalities to LLM +with the UPM. To fully leverage the potential of OneLLM in following +instructions, we also curated a comprehensive multimodal instruction dataset, +including 2M items from image, audio, video, point cloud, depth/normal map, IMU +and fMRI brain activity. OneLLM is evaluated on 25 diverse benchmarks, +encompassing tasks such as multimodal captioning, question answering and +reasoning, where it delivers excellent performance. Code, data, model and +online demo are available at https://github.com/csuhan/OneLLM + +
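The "universal projection module built by mixing multiple image projection modules with dynamic routing" can be pictured as a soft mixture of projection experts selected by a learned router. The sketch below is only an assumed toy structure inspired by that description; the dimensions, number of experts, and routing rule are placeholders, not the released OneLLM code.

# Sketch: a universal projection module as a router-weighted mixture of projection experts (assumed).
import torch
import torch.nn as nn

class UniversalProjection(nn.Module):
    def __init__(self, in_dim=768, out_dim=4096, num_experts=3):
        super().__init__()
        self.experts = nn.ModuleList(nn.Linear(in_dim, out_dim) for _ in range(num_experts))
        self.router = nn.Linear(in_dim, num_experts)

    def forward(self, tokens):                                        # tokens: (batch, seq, in_dim)
        weights = self.router(tokens.mean(dim=1)).softmax(-1)         # (batch, num_experts)
        projected = torch.stack([e(tokens) for e in self.experts], dim=1)  # (batch, E, seq, out_dim)
        return (weights[:, :, None, None] * projected).sum(dim=1)     # weighted mix of expert outputs

upm = UniversalProjection()
print(upm(torch.randn(2, 16, 768)).shape)                              # torch.Size([2, 16, 4096])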
+
+ comment: Code: https://github.com/csuhan/OneLLM +
+
+
+
+
+ + ☆ Intrinsic Harmonization for Illumination-Aware Compositing SIGGRAPH + + +
+ Despite significant advancements in network-based image harmonization +techniques, there still exists a domain disparity between typical training +pairs and real-world composites encountered during inference. Most existing +methods are trained to reverse global edits made on segmented image regions, +which fail to accurately capture the lighting inconsistencies between the +foreground and background found in composited images. In this work, we +introduce a self-supervised illumination harmonization approach formulated in +the intrinsic image domain. First, we estimate a simple global lighting model +from mid-level vision representations to generate a rough shading for the +foreground region. A network then refines this inferred shading to generate a +harmonious re-shading that aligns with the background scene. In order to match +the color appearance of the foreground and background, we utilize ideas from +prior harmonization approaches to perform parameterized image edits in the +albedo domain. To validate the effectiveness of our approach, we present +results from challenging real-world composites and conduct a user study to +objectively measure the enhanced realism achieved compared to state-of-the-art +harmonization methods. + +
+
+ comment: 10 pages, 8 figures. Accepted to SIGGRAPH Asia 2023 (Conference + Track). Project page: https://yaksoy.github.io/intrinsicCompositing/ +
+
+
+
+
+ + ☆ Memory Triggers: Unveiling Memorization in Text-To-Image Generative + Models through Word-Level Duplication + + +
+ Diffusion-based models, such as the Stable Diffusion model, have +revolutionized text-to-image synthesis with their ability to produce +high-quality, high-resolution images. These advancements have prompted +significant progress in image generation and editing tasks. However, these +models also raise concerns due to their tendency to memorize and potentially +replicate exact training samples, posing privacy risks and enabling adversarial +attacks. Duplication in training datasets is recognized as a major factor +contributing to memorization, and various forms of memorization have been +studied so far. This paper focuses on two distinct and underexplored types of +duplication that lead to replication during inference in diffusion-based +models, particularly in the Stable Diffusion model. We delve into these +lesser-studied duplication phenomena and their implications through two case +studies, aiming to contribute to the safer and more responsible use of +generative models in various applications. + +
+
+
+
+
+ + ☆ Hybrid Functional Maps for Crease-Aware Non-Isometric Shape Matching + + +
+ Non-isometric shape correspondence remains a fundamental challenge in +computer vision. Traditional methods using Laplace-Beltrami operator (LBO) +eigenmodes face limitations in characterizing high-frequency extrinsic shape +changes like bending and creases. We propose a novel approach of combining the +non-orthogonal extrinsic basis of eigenfunctions of the elastic thin-shell +hessian with the intrinsic ones of the LBO, creating a hybrid spectral space in +which we construct functional maps. To this end, we present a theoretical +framework to effectively integrate non-orthogonal basis functions into +descriptor- and learning-based functional map methods. Our approach can be +incorporated easily into existing functional map pipelines across varying +applications and is able to handle complex deformations beyond isometries. We +show extensive evaluations across various supervised and unsupervised settings +and demonstrate significant improvements. Notably, our approach achieves up to +15% better mean geodesic error for non-isometric correspondence settings and up +to 45% improvement in scenarios with topological noise. + +
+
+ comment: 17 pages, 9 figures +
+
+
+
+
+ + ☆ WarpDiffusion: Efficient Diffusion Model for High-Fidelity Virtual + Try-on + + +
+ Image-based Virtual Try-On (VITON) aims to transfer an in-shop garment image +onto a target person. While existing methods focus on warping the garment to +fit the body pose, they often overlook the synthesis quality around the +garment-skin boundary and realistic effects like wrinkles and shadows on the +warped garments. These limitations greatly reduce the realism of the generated +results and hinder the practical application of VITON techniques. Leveraging +the notable success of diffusion-based models in cross-modal image synthesis, +some recent diffusion-based methods have ventured to tackle this issue. +However, they tend to either consume a significant amount of training resources +or struggle to achieve realistic try-on effects and retain garment details. For +efficient and high-fidelity VITON, we propose WarpDiffusion, which bridges the +warping-based and diffusion-based paradigms via a novel informative and local +garment feature attention mechanism. Specifically, WarpDiffusion incorporates +local texture attention to reduce resource consumption and uses a novel +auto-mask module that effectively retains only the critical areas of the warped +garment while disregarding unrealistic or erroneous portions. Notably, +WarpDiffusion can be integrated as a plug-and-play component into existing +VITON methodologies, elevating their synthesis quality. Extensive experiments +on high-resolution VITON benchmarks and an in-the-wild test set demonstrate the +superiority of WarpDiffusion, surpassing state-of-the-art methods both +qualitatively and quantitatively. + +
+
+
+
+
+ + ☆ Reason2Drive: Towards Interpretable and Chain-based Reasoning for + Autonomous Driving + + +
+ Large vision-language models (VLMs) have garnered increasing interest in +autonomous driving areas, due to their advanced capabilities in complex +reasoning tasks essential for highly autonomous vehicle behavior. Despite their +potential, research in autonomous systems is hindered by the lack of datasets +with annotated reasoning chains that explain the decision-making processes in +driving. To bridge this gap, we present Reason2Drive, a benchmark dataset with +over 600K video-text pairs, aimed at facilitating the study of interpretable +reasoning in complex driving environments. We distinctly characterize the +autonomous driving process as a sequential combination of perception, +prediction, and reasoning steps, and the question-answer pairs are +automatically collected from a diverse range of open-source outdoor driving +datasets, including nuScenes, Waymo and ONCE. Moreover, we introduce a novel +aggregated evaluation metric to assess chain-based reasoning performance in +autonomous systems, addressing the semantic ambiguities of existing metrics +such as BLEU and CIDEr. Based on the proposed benchmark, we conduct experiments +to assess various existing VLMs, revealing insights into their reasoning +capabilities. Additionally, we develop an efficient approach to empower VLMs to +leverage object-level perceptual elements in both feature extraction and +prediction, further enhancing their reasoning accuracy. The code and dataset +will be released. + +
+
+
+
+
+ + ☆ Editable Stain Transformation Of Histological Images Using Unpaired GANs + + +
+ Double staining in histopathology, particularly for metaplastic breast +cancer, typically employs H&E and P63 dyes. However, P63's tissue damage and +high cost necessitate alternative methods. This study introduces xAI-CycleGAN, +an advanced architecture combining Mask CycleGAN with explainability features +and structure-preserving capabilities for transforming H&E stained breast +tissue images into P63-like images. The architecture allows for output editing, +enhancing resemblance to actual images and enabling further model refinement. +We showcase xAI-CycleGAN's efficacy in maintaining structural integrity and +generating high-quality images. Additionally, a histopathologist survey +indicates the generated images' realism is often comparable to actual images, +validating our model's high-quality output. + +
+
+ comment: 13 pages, 5 figures, AIRCAD ICIAP 2023 in Udine +
+
+
+
+
+ + ☆ MotionCtrl: A Unified and Flexible Motion Controller for Video + Generation + + +
+ Motions in a video primarily consist of camera motion, induced by camera +movement, and object motion, resulting from object movement. Accurate control +of both camera and object motion is essential for video generation. However, +existing works either mainly focus on one type of motion or do not clearly +distinguish between the two, limiting their control capabilities and diversity. +Therefore, this paper presents MotionCtrl, a unified and flexible motion +controller for video generation designed to effectively and independently +control camera and object motion. The architecture and training strategy of +MotionCtrl are carefully devised, taking into account the inherent properties +of camera motion, object motion, and imperfect training data. Compared to +previous methods, MotionCtrl offers three main advantages: 1) It effectively +and independently controls camera motion and object motion, enabling more +fine-grained motion control and facilitating flexible and diverse combinations +of both types of motion. 2) Its motion conditions are determined by camera +poses and trajectories, which are appearance-free and minimally impact the +appearance or shape of objects in generated videos. 3) It is a relatively +generalizable model that can adapt to a wide array of camera poses and +trajectories once trained. Extensive qualitative and quantitative experiments +have been conducted to demonstrate the superiority of MotionCtrl over existing +methods. + +
+
+ comment: Project Page: https://wzhouxiff.github.io/projects/MotionCtrl/ +
+
+
+
+
+ + ☆ Training Neural Networks on RAW and HDR Images for Restoration Tasks + + +
+ The vast majority of standard image and video content available online is +represented in display-encoded color spaces, in which pixel values are +conveniently scaled to a limited range (0-1) and the color distribution is +approximately perceptually uniform. In contrast, both camera RAW and high +dynamic range (HDR) images are often represented in linear color spaces, in +which color values are linearly related to colorimetric quantities of light. +While training on commonly available display-encoded images is a +well-established practice, there is no consensus on how neural networks should +be trained for tasks on RAW and HDR images in linear color spaces. In this +work, we test several approaches on three popular image restoration +applications: denoising, deblurring, and single-image super-resolution. We +examine whether HDR/RAW images need to be display-encoded using popular +transfer functions (PQ, PU21, mu-law), or whether it is better to train in +linear color spaces, but use loss functions that correct for perceptual +non-uniformity. Our results indicate that neural networks train significantly +better on HDR and RAW images represented in display-encoded color spaces, which +offer better perceptual uniformity than linear spaces. This small change to the +training strategy can bring a very substantial gain in performance, up to 10-15 +dB. + +
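For intuition about what "display-encoding linear values before training" means, the snippet below shows a generic mu-law companding curve of the kind the abstract mentions (PQ, PU21, and mu-law are the transfer functions the paper evaluates; the particular mu value here is an assumed example, not the paper's setting).

# Sketch: mu-law style encoding of linear HDR values into a perceptually more uniform range.
import numpy as np

def mu_law_encode(linear: np.ndarray, mu: float = 5000.0, peak: float = 1.0) -> np.ndarray:
    # Map linear radiance in [0, peak] to [0, 1]; larger mu compresses highlights more strongly.
    x = np.clip(linear / peak, 0.0, 1.0)
    return np.log1p(mu * x) / np.log1p(mu)

def mu_law_decode(encoded: np.ndarray, mu: float = 5000.0, peak: float = 1.0) -> np.ndarray:
    return peak * np.expm1(encoded * np.log1p(mu)) / mu

hdr = np.array([0.001, 0.01, 0.1, 1.0])                  # linear radiance values spanning 3 orders of magnitude
enc = mu_law_encode(hdr)
print(enc, np.allclose(mu_law_decode(enc), hdr))          # round-trips back to the linear values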
+
+
+
+
+ + ☆ MOCHa: Multi-Objective Reinforcement Mitigating Caption Hallucinations + + +
+ While recent years have seen rapid progress in image-conditioned text +generation, image captioning still suffers from the fundamental issue of +hallucinations, the generation of spurious details that cannot be inferred from +the given image. Dedicated methods for reducing hallucinations in image +captioning largely focus on closed-vocabulary object tokens, ignoring most +types of hallucinations that occur in practice. In this work, we propose MOCHa, +an approach that harnesses advancements in reinforcement learning (RL) to +address the sequence-level nature of hallucinations in an open-world setup. To +optimize for caption fidelity to the input image, we leverage ground-truth +reference captions as proxies to measure the logical consistency of generated +captions. However, optimizing for caption fidelity alone fails to preserve the +semantic adequacy of generations; therefore, we propose a multi-objective +reward function that jointly targets these qualities, without requiring any +strong supervision. We demonstrate that these goals can be simultaneously +optimized with our framework, enhancing performance for various captioning +models of different scales. Our qualitative and quantitative results +demonstrate MOCHa's superior performance across various established metrics. We +also demonstrate the benefit of our method in the open-vocabulary setting. To +this end, we contribute OpenCHAIR, a new benchmark for quantifying +open-vocabulary hallucinations in image captioning models, constructed using +generative foundation models. We will release our code, benchmark, and trained +models. + +
+
+ comment: Website Link: https://assafbk.github.io/mocha/ +
+
+
+
+
+ + ☆ Boosting Segment Anything Model Towards Open-Vocabulary Learning + + +
+ The recent Segment Anything Model (SAM) has emerged as a new paradigmatic +vision foundation model, showcasing potent zero-shot generalization and +flexible prompting. Despite SAM finding applications and adaptations in various +domains, its primary limitation lies in the inability to grasp object +semantics. In this paper, we present Sambor to seamlessly integrate SAM with +the open-vocabulary object detector in an end-to-end framework. While retaining +all the remarkable capabilities inherent to SAM, we enhance it with the +capacity to detect arbitrary objects based on human inputs like category names +or reference expressions. To accomplish this, we introduce a novel SideFormer +module that extracts SAM features to facilitate zero-shot object localization +and inject comprehensive semantic information for open-vocabulary recognition. +In addition, we devise an open-set region proposal network (Open-set RPN), +enabling the detector to acquire the open-set proposals generated by SAM. +Sambor demonstrates superior zero-shot performance across benchmarks, including +COCO and LVIS, proving highly competitive against previous SoTA methods. We +aspire for this work to serve as a meaningful endeavor in endowing SAM to +recognize diverse object categories and advancing open-vocabulary learning with +the support of vision foundation models. + +
+
+
+
+
+ + ☆ TokenCompose: Grounding Diffusion with Token-level Supervision + + +
+ We present TokenCompose, a Latent Diffusion Model for text-to-image +generation that achieves enhanced consistency between user-specified text +prompts and model-generated images. Despite its tremendous success, the +standard denoising process in the Latent Diffusion Model takes text prompts as +conditions only, absent explicit constraint for the consistency between the +text prompts and the image contents, leading to unsatisfactory results for +composing multiple object categories. TokenCompose aims to improve +multi-category instance composition by introducing the token-wise consistency +terms between the image content and object segmentation maps in the finetuning +stage. TokenCompose can be applied directly to the existing training pipeline +of text-conditioned diffusion models without extra human labeling information. +By finetuning Stable Diffusion, the model exhibits significant improvements in +multi-category instance composition and enhanced photorealism for its generated +images. + +
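One way to picture a "token-wise consistency term between image content and object segmentation maps" is a loss that pushes a token's cross-attention mass into that object's segmentation region. The form below is an assumed illustration of such a term, not the authors' exact loss or training code.

# Sketch: a token-level grounding loss between one token's cross-attention map and its object mask (assumed form).
import torch

def token_grounding_loss(attn_map, seg_mask, eps=1e-6):
    # attn_map: (H, W) cross-attention weights for one object token; seg_mask: (H, W) binary object mask.
    attn = attn_map / (attn_map.sum() + eps)              # normalize attention to a distribution over pixels
    inside = (attn * seg_mask).sum()                      # attention mass falling inside the object's region
    return 1.0 - inside                                   # minimized when all attention lies on the object

attn = torch.rand(64, 64)
mask = torch.zeros(64, 64)
mask[20:40, 20:40] = 1.0
print(token_grounding_loss(attn, mask))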
+
+ comment: Preprint. Project link: https://mlpc-ucsd.github.io/TokenCompose +
+
+
+
+
+ + ☆ DreamComposer: Controllable 3D Object Generation via Multi-View + Conditions + + +
+ Utilizing pre-trained 2D large-scale generative models, recent works are +capable of generating high-quality novel views from a single in-the-wild image. +However, due to the lack of information from multiple views, these works +encounter difficulties in generating controllable novel views. In this paper, +we present DreamComposer, a flexible and scalable framework that can enhance +existing view-aware diffusion models by injecting multi-view conditions. +Specifically, DreamComposer first uses a view-aware 3D lifting module to obtain +3D representations of an object from multiple views. Then, it renders the +latent features of the target view from 3D representations with the multi-view +feature fusion module. Finally the target view features extracted from +multi-view inputs are injected into a pre-trained diffusion model. Experiments +show that DreamComposer is compatible with state-of-the-art diffusion models +for zero-shot novel view synthesis, further enhancing them to generate +high-fidelity novel view images with multi-view conditions, ready for +controllable 3D object reconstruction and various other applications. + +
+
+ comment: Project Page: https://yhyang-myron.github.io/DreamComposer/ +
+
+
+
+
+ + ☆ Automated Multimodal Data Annotation via Calibration With Indoor + Positioning System + + +
+ Learned object detection methods based on fusion of LiDAR and camera data +require labeled training samples, but niche applications, such as warehouse +robotics or automated infrastructure, require semantic classes not available in +large existing datasets. Therefore, to facilitate the rapid creation of +multimodal object detection datasets and alleviate the burden of human +labeling, we propose a novel automated annotation pipeline. Our method uses an +indoor positioning system (IPS) to produce accurate detection labels for both +point clouds and images and eliminates manual annotation entirely. In an +experiment, the system annotates objects of interest 261.8 times faster than a +human baseline and speeds up end-to-end dataset creation by 61.5%. + +
+
+ comment: Contains eight pages and 10 figures. A version of this document was + accepted to IEEE IRC 2023 +
+
+
+
+
+ + ☆ DiffusionSat: A Generative Foundation Model for Satellite Imagery + + +
+ Diffusion models have achieved state-of-the-art results on many modalities +including images, speech, and video. However, existing models are not tailored +to support remote sensing data, which is widely used in important applications +including environmental monitoring and crop-yield prediction. Satellite images +are significantly different from natural images -- they can be multi-spectral, +irregularly sampled across time -- and existing diffusion models trained on +images from the Web do not support them. Furthermore, remote sensing data is +inherently spatio-temporal, requiring conditional generation tasks not +supported by traditional methods based on captions or images. In this paper, we +present DiffusionSat, to date the largest generative foundation model trained +on a collection of publicly available large, high-resolution remote sensing +datasets. As text-based captions are sparsely available for satellite images, +we incorporate the associated metadata such as geolocation as conditioning +information. Our method produces realistic samples and can be used to solve +multiple generative tasks including temporal generation, superresolution given +multi-spectral inputs and in-painting. Our method outperforms previous +state-of-the-art methods for satellite image generation and is the first +large-scale $\textit{generative}$ foundation model for satellite imagery. + +
+
+
+
+
+ + ☆ MMM: Generative Masked Motion Model + + +
+ Recent advances in text-to-motion generation using diffusion and +autoregressive models have shown promising results. However, these models often +suffer from a trade-off between real-time performance, high fidelity, and +motion editability. To address this gap, we introduce MMM, a novel yet simple +motion generation paradigm based on Masked Motion Model. MMM consists of two +key components: (1) a motion tokenizer that transforms 3D human motion into a +sequence of discrete tokens in latent space, and (2) a conditional masked +motion transformer that learns to predict randomly masked motion tokens, +conditioned on the pre-computed text tokens. By attending to motion and text +tokens in all directions, MMM explicitly captures inherent dependency among +motion tokens and semantic mapping between motion and text tokens. During +inference, this allows parallel and iterative decoding of multiple motion +tokens that are highly consistent with fine-grained text descriptions, +therefore simultaneously achieving high-fidelity and high-speed motion +generation. In addition, MMM has innate motion editability. By simply placing +mask tokens in the place that needs editing, MMM automatically fills the gaps +while guaranteeing smooth transitions between editing and non-editing parts. +Extensive experiments on the HumanML3D and KIT-ML datasets demonstrate that MMM +surpasses current leading methods in generating high-quality motion (evidenced +by superior FID scores of 0.08 and 0.429), while offering advanced editing +features such as body-part modification, motion in-betweening, and the +synthesis of long motion sequences. In addition, MMM is two orders of magnitude +faster on a single mid-range GPU than editable motion diffusion models. Our +project page is available at \url{https://exitudio.github.io/MMM-page}. + +
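The "parallel and iterative decoding of multiple motion tokens" can be illustrated with a generic confidence-based unmasking loop over a masked token sequence. The sketch below is an assumed, MaskGIT-style toy version with a random stub in place of the conditional masked motion transformer; it is not the authors' released code, and the schedule and vocabulary size are placeholders.

# Sketch: confidence-based parallel iterative decoding over masked tokens (assumed scheme).
import torch

def iterative_decode(predict_logits, seq_len, mask_id, steps=8):
    tokens = torch.full((1, seq_len), mask_id, dtype=torch.long)
    for step in range(steps):
        still_masked = tokens == mask_id
        if not still_masked.any():
            break
        probs = predict_logits(tokens).softmax(-1)        # (1, seq_len, vocab)
        conf, pred = probs.max(-1)                        # per-position confidence and argmax token
        conf = conf.masked_fill(~still_masked, -1.0)      # never re-select already decoded positions
        # Unmask a growing fraction of the remaining masked positions at each step.
        k = max(1, round(still_masked.sum().item() * (step + 1) / steps))
        idx = conf.topk(k, dim=-1).indices
        tokens[0, idx[0]] = pred[0, idx[0]]
    return tokens

# `predict_logits` stands in for a text-conditioned masked motion transformer; a random stub here:
stub = lambda t: torch.randn(1, t.shape[1], 512)
print(iterative_decode(stub, seq_len=16, mask_id=511))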
+
+
+
+
+ + ☆ A Task is Worth One Word: Learning with Task Prompts for High-Quality + Versatile Image Inpainting + + +
+ Achieving high-quality versatile image inpainting, where user-specified +regions are filled with plausible content according to user intent, presents a +significant challenge. Existing methods face difficulties in simultaneously +addressing context-aware image inpainting and text-guided object inpainting due +to the distinct optimal training strategies required. To overcome this +challenge, we introduce PowerPaint, the first high-quality and versatile +inpainting model that excels in both tasks. First, we introduce learnable task +prompts along with tailored fine-tuning strategies to guide the model's focus +on different inpainting targets explicitly. This enables PowerPaint to +accomplish various inpainting tasks by utilizing different task prompts, +resulting in state-of-the-art performance. Second, we demonstrate the +versatility of the task prompt in PowerPaint by showcasing its effectiveness as +a negative prompt for object removal. Additionally, we leverage prompt +interpolation techniques to enable controllable shape-guided object inpainting. +Finally, we extensively evaluate PowerPaint on various inpainting benchmarks to +demonstrate its superior performance for versatile image inpainting. We release +our codes and models on our project page: https://powerpaint.github.io/. + +
+
+
+
+
+ + ☆ Language-Informed Visual Concept Learning + + +
+ Our understanding of the visual world is centered around various concept +axes, characterizing different aspects of visual entities. While different +concept axes can be easily specified by language, e.g. color, the exact visual +nuances along each axis often exceed the limitations of linguistic +articulations, e.g. a particular style of painting. In this work, our goal is +to learn a language-informed visual concept representation, by simply +distilling large pre-trained vision-language models. Specifically, we train a +set of concept encoders to encode the information pertinent to a set of +language-informed concept axes, with an objective of reproducing the input +image through a pre-trained Text-to-Image (T2I) model. To encourage better +disentanglement of different concept encoders, we anchor the concept embeddings +to a set of text embeddings obtained from a pre-trained Visual Question +Answering (VQA) model. At inference time, the model extracts concept embeddings +along various axes from new test images, which can be remixed to generate +images with novel compositions of visual concepts. With a lightweight test-time +finetuning procedure, it can also generalize to novel concepts unseen at +training. + +
+
+ comment: The first two authors contributed equally and are alphabetically + ordered. Project page: + https://ai.stanford.edu/~yzzhang/projects/concept-axes/ +
+
+
+
+
+ + ☆ Foundation Model Assisted Weakly Supervised Semantic Segmentation + + +
+ This work aims to leverage pre-trained foundation models, such as contrastive language-image pre-training (CLIP) and the segment anything model (SAM), to address weakly supervised semantic segmentation (WSSS) using image-level labels. To this end, we propose a coarse-to-fine framework based on CLIP and SAM for generating high-quality segmentation seeds. Specifically, we construct an image classification task and a seed segmentation task, which are jointly performed by CLIP with frozen weights and two sets of learnable task-specific prompts. A SAM-based seeding (SAMS) module is designed and applied to each task to produce either coarse or fine seed maps. Moreover, we design a multi-label contrastive loss supervised by image-level labels and a CAM activation loss supervised by the generated coarse seed map. These losses are used to learn the prompts, which are the only parts of our framework that need to be learned. Once the prompts are learned, we feed each image, along with the learned segmentation-specific prompts, into CLIP and the SAMS module to produce high-quality segmentation seeds. These seeds serve as pseudo labels to train an off-the-shelf segmentation network, as in other two-stage WSSS methods. Experiments show that our method achieves state-of-the-art performance on PASCAL VOC 2012 and competitive results on MS COCO 2014.

+
+
+
+
+ + ☆ Context Diffusion: In-Context Aware Image Generation + + +
+ We propose Context Diffusion, a diffusion-based framework that enables image generation models to learn from visual examples presented in context. Recent work tackles such in-context learning for image generation, where a query image is provided alongside context examples and text prompts. However, the quality and fidelity of the generated images deteriorate when the prompt is not present, demonstrating that these models are unable to truly learn from the visual context. To address this, we propose a novel framework that separates the encoding of the visual context from the preservation of the query image's structure. As a result, the model can learn from the visual context and text prompts together, but also from either one alone. Furthermore, our model handles few-shot settings, effectively addressing diverse in-context learning scenarios. Our experiments and user study demonstrate that Context Diffusion excels in both in-domain and out-of-domain tasks, yielding an overall improvement in image quality and fidelity compared to counterpart models.

+
+
+
+
+ + ☆ DocBinFormer: A Two-Level Transformer Network for Effective Document + Image Binarization + + +
+ In real life, various degradation scenarios can damage document images, making them harder to recognize and analyze; binarization is therefore a fundamental and crucial step for achieving optimal performance in any document analysis task. We propose DocBinFormer (Document Binarization Transformer), a novel two-level vision transformer (TL-ViT) architecture for effective document image binarization. The presented architecture employs a two-level transformer encoder to effectively capture both global and local feature representations from the input images. These complementary bi-level features are exploited for efficient document image binarization, yielding improved results for both system-generated and handwritten document images. In the absence of convolutional layers, the transformer encoder operates directly on pixel patches and sub-patches along with their positional information, while the decoder generates a clean (binarized) output image from the latent representation of the patches. Instead of using a single vision transformer block to extract information from the image patches, the proposed architecture uses two transformer blocks for greater coverage of the extracted feature space at global and local scales. The encoded feature representation is used by the decoder block to generate the corresponding binarized output. Extensive experiments on a variety of DIBCO and H-DIBCO benchmarks show that the proposed model outperforms state-of-the-art techniques on four metrics. The source code will be made available at https://github.com/RisabBiswas/DocBinFormer.

+
+
+
+
+ + ☆ Enhancing Kinship Verification through Multiscale Retinex and Combined + Deep-Shallow features + + +
+ The challenge of kinship verification from facial images represents a +cutting-edge and formidable frontier in the realms of pattern recognition and +computer vision. This area of study holds a myriad of potential applications, +spanning from image annotation and forensic analysis to social media research. +Our research stands out by integrating a preprocessing method named Multiscale +Retinex (MSR), which elevates image quality and amplifies contrast, ultimately +bolstering the end results. Strategically, our methodology capitalizes on the +harmonious blend of deep and shallow texture descriptors, merging them +proficiently at the score level through the Logistic Regression (LR) method. To +elucidate, we employ the Local Phase Quantization (LPQ) descriptor to extract +shallow texture characteristics. For deep feature extraction, we turn to the +prowess of the VGG16 model, which is pre-trained on a convolutional neural +network (CNN). The robustness and efficacy of our method have been put to the +test through meticulous experiments on three rigorous kinship datasets, namely: +Cornell Kin Face, UB Kin Face, and TS Kin Face. + +
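The score-level fusion step described above amounts to feeding the match scores from the two descriptors into a logistic regression classifier. The snippet below is an assumed minimal illustration with synthetic scores; the actual LPQ and VGG16 feature extraction and the kinship datasets are not reproduced.

# Sketch: fuse a shallow-texture score and a deep-feature score with logistic regression (synthetic data).
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(1)
n = 400
labels = rng.integers(0, 2, size=n)                      # 1 = kin pair, 0 = non-kin pair
lpq_score = labels * 0.6 + rng.normal(0, 0.3, size=n)    # stand-in LPQ similarity scores
vgg_score = labels * 0.8 + rng.normal(0, 0.3, size=n)    # stand-in VGG16 similarity scores

X = np.column_stack([lpq_score, vgg_score])
fusion = LogisticRegression().fit(X[:300], labels[:300])  # learn the fusion weights on a training split
print("held-out accuracy:", fusion.score(X[300:], labels[300:]))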
+
+
+
+
+ + ☆ When an Image is Worth 1,024 x 1,024 Words: A Case Study in + Computational Pathology + + +
+ This technical report presents LongViT, a vision Transformer that can process +gigapixel images in an end-to-end manner. Specifically, we split the gigapixel +image into a sequence of millions of patches and project them linearly into +embeddings. LongNet is then employed to model the extremely long sequence, +generating representations that capture both short-range and long-range +dependencies. The linear computation complexity of LongNet, along with its +distributed algorithm, enables us to overcome the constraints of both +computation and memory. We apply LongViT in the field of computational +pathology, aiming for cancer diagnosis and prognosis within gigapixel +whole-slide images. Experimental results demonstrate that LongViT effectively +encodes gigapixel images and outperforms previous state-of-the-art methods on +cancer subtyping and survival prediction. Code and models will be available at +https://aka.ms/LongViT. + +
+
+
+
+
+ + ☆ Personalized Face Inpainting with Diffusion Models by Parallel Visual + Attention + + +
+ Face inpainting is important in various applications, such as photo +restoration, image editing, and virtual reality. Despite the significant +advances in face generative models, ensuring that a person's unique facial +identity is maintained during the inpainting process is still an elusive goal. +Current state-of-the-art techniques, exemplified by MyStyle, necessitate +resource-intensive fine-tuning and a substantial number of images for each new +identity. Furthermore, existing methods often fall short in accommodating +user-specified semantic attributes, such as beard or expression. To improve +inpainting results, and reduce the computational complexity during inference, +this paper proposes the use of Parallel Visual Attention (PVA) in conjunction +with diffusion models. Specifically, we insert parallel attention matrices to +each cross-attention module in the denoising network, which attends to features +extracted from reference images by an identity encoder. We train the added +attention modules and identity encoder on CelebAHQ-IDI, a dataset proposed for +identity-preserving face inpainting. Experiments demonstrate that PVA attains +unparalleled identity resemblance in both face inpainting and face inpainting +with language guidance tasks, in comparison to various benchmarks, including +MyStyle, Paint by Example, and Custom Diffusion. Our findings reveal that PVA +ensures good identity preservation while offering effective +language-controllability. Additionally, in contrast to Custom Diffusion, PVA +requires just 40 fine-tuning steps for each new identity, which translates to a +significant speed increase of over 20 times. + +
+
+
+
+
+ + ☆ Texture-Semantic Collaboration Network for ORSI Salient Object Detection + + +
+ Salient object detection (SOD) in optical remote sensing images (ORSIs) has +become increasingly popular recently. Due to the characteristics of ORSIs, +ORSI-SOD is full of challenges, such as multiple objects, small objects, low +illuminations, and irregular shapes. To address these challenges, we propose a +concise yet effective Texture-Semantic Collaboration Network (TSCNet) to +explore the collaboration of texture cues and semantic cues for ORSI-SOD. +Specifically, TSCNet is based on the generic encoder-decoder structure. In +addition to the encoder and decoder, TSCNet includes a vital Texture-Semantic +Collaboration Module (TSCM), which performs valuable feature modulation and +interaction on basic features extracted from the encoder. The main idea of our +TSCM is to make full use of the texture features at the lowest level and the +semantic features at the highest level to achieve the expression enhancement of +salient regions on features. In the TSCM, we first enhance the position of +potential salient regions using semantic features. Then, we render and restore +the object details using the texture features. Meanwhile, we also perceive +regions of various scales, and construct interactions between different +regions. Thanks to the perfect combination of TSCM and generic structure, our +TSCNet can take care of both the position and details of salient objects, +effectively handling various scenes. Extensive experiments on three datasets +demonstrate that our TSCNet achieves competitive performance compared to 14 +state-of-the-art methods. The code and results of our method are available at +https://github.com/MathLee/TSCNet. + +
+
+ comment: 5 pages, 3 figures, Accepted by IEEE Transactions on Circuits and + Systems II: Express Briefs 2023 +
+
+
+
+
+ + ☆ GPT-4 Enhanced Multimodal Grounding for Autonomous Driving: Leveraging + Cross-Modal Attention with Large Language Models + + +
+ In the field of autonomous vehicles (AVs), accurately discerning commander intent and executing linguistic commands within a visual context presents a significant challenge. This paper introduces a sophisticated encoder-decoder framework developed to address visual grounding in AVs. Our Context-Aware Visual Grounding (CAVG) model is an advanced system that integrates its core encoders (Text, Image, Context, and Cross-Modal) with a multimodal decoder. This integration enables the CAVG model to adeptly capture contextual semantics and to learn human emotional features, augmented by state-of-the-art Large Language Models (LLMs) including GPT-4. The architecture of CAVG is reinforced by multi-head cross-modal attention mechanisms and a Region-Specific Dynamic (RSD) layer for attention modulation. This architectural design enables the model to efficiently process and interpret a range of cross-modal inputs, yielding a comprehensive understanding of the correlation between verbal commands and corresponding visual scenes. Empirical evaluations on the Talk2Car dataset, a real-world benchmark, demonstrate that CAVG establishes new standards in prediction accuracy and operational efficiency. Notably, the model exhibits exceptional performance even with limited training data, ranging from 50% to 75% of the full dataset, which highlights its effectiveness and potential for deployment in practical AV applications. Moreover, CAVG has shown remarkable robustness and adaptability in challenging scenarios, including long-text command interpretation, low-light conditions, ambiguous command contexts, inclement weather, and densely populated urban environments. The code for the proposed model is available on our GitHub.

+
+
+
+
+ + ☆ FoodFusion: A Latent Diffusion Model for Realistic Food Image Generation + + +
+ Current state-of-the-art image generation models such as Latent Diffusion +Models (LDMs) have demonstrated the capacity to produce visually striking +food-related images. However, these generated images often exhibit an artistic +or surreal quality that diverges from the authenticity of real-world food +representations. This inadequacy renders them impractical for applications +requiring realistic food imagery, such as training models for image-based +dietary assessment. To address these limitations, we introduce FoodFusion, a +Latent Diffusion model engineered specifically for the faithful synthesis of +realistic food images from textual descriptions. The development of the +FoodFusion model involves harnessing an extensive array of open-source food +datasets, resulting in over 300,000 curated image-caption pairs. Additionally, +we propose and employ two distinct data cleaning methodologies to ensure that +the resulting image-text pairs maintain both realism and accuracy. The +FoodFusion model, thus trained, demonstrates a remarkable ability to generate +food images that exhibit a significant improvement in terms of both realism and +diversity over the publicly available image generation models. We openly share +the dataset and fine-tuned models to support advancements in this critical +field of food image synthesis at https://bit.ly/genai4good. + +
+
+
+
+
+ + ☆ Low-shot Object Learning with Mutual Exclusivity Bias NeurIPS 2023 + + +
+ This paper introduces Low-shot Object Learning with Mutual Exclusivity Bias +(LSME), the first computational framing of mutual exclusivity bias, a +phenomenon commonly observed in infants during word learning. We provide a +novel dataset, comprehensive baselines, and a state-of-the-art method to enable +the ML community to tackle this challenging learning task. The goal of LSME is +to analyze an RGB image of a scene containing multiple objects and correctly +associate a previously-unknown object instance with a provided category label. +This association is then used to perform low-shot learning to test category +generalization. We provide a data generation pipeline for the LSME problem and +conduct a thorough analysis of the factors that contribute to its difficulty. +Additionally, we evaluate the performance of multiple baselines, including +state-of-the-art foundation models. Finally, we present a baseline approach +that outperforms state-of-the-art models in terms of low-shot accuracy. + +
+
+ comment: Accepted at NeurIPS 2023, Datasets and Benchmarks Track. Project + website https://ngailapdi.github.io/projects/lsme/ +
+
+
+
+
+ + ☆ Personalized Pose Forecasting + + +
+ Human pose forecasting is the task of predicting articulated human motion +given past human motion. There exists a number of popular benchmarks that +evaluate an array of different models performing human pose forecasting. These +benchmarks do not reflect that a human interacting system, such as a delivery +robot, observes and plans for the motion of the same individual over an +extended period of time. Every individual has unique and distinct movement +patterns. This is however not reflected in existing benchmarks that evaluate a +model's ability to predict an average human's motion rather than a particular +individual's. We reformulate the human motion forecasting problem and present a +model-agnostic personalization method. Motion forecasting personalization can +be performed efficiently online by utilizing a low-parametric time-series +analysis model that personalizes neural network pose predictions. + +
+
+
+
+
+ + ☆ On the Diversity and Realism of Distilled Dataset: An Efficient Dataset + Distillation Paradigm + + +
+ Contemporary machine learning requires training large neural networks on +massive datasets and thus faces the challenge of high computational demands. +Dataset distillation, as a recent emerging strategy, aims to compress +real-world datasets for efficient training. However, this line of research +currently struggles with large-scale and high-resolution datasets, hindering its +practicality and feasibility. To this end, we re-examine the existing dataset +distillation methods and identify three properties required for large-scale +real-world applications, namely, realism, diversity, and efficiency. As a +remedy, we propose RDED, a novel computationally-efficient yet effective data +distillation paradigm, to enable both diversity and realism of the distilled +data. Extensive empirical results over various neural architectures and +datasets demonstrate the advancement of RDED: we can distill the full +ImageNet-1K to a small dataset comprising 10 images per class within 7 minutes, +achieving a notable 42% top-1 accuracy with ResNet-18 on a single RTX-4090 GPU +(while the SOTA only achieves 21% but requires 6 hours). + 
+
+ comment: 17 pages, 20 figures +
+
+
+
+
+ + ☆ Defense Against Adversarial Attacks using Convolutional Auto-Encoders + + +
+ Deep learning models, while achieving state-of-the-art performance on many +tasks, are susceptible to adversarial attacks that exploit inherent +vulnerabilities in their architectures. Adversarial attacks manipulate the +input data with imperceptible perturbations, causing the model to misclassify +the data or produce erroneous outputs. This work focuses on enhancing the +robustness of targeted classifier models against adversarial attacks. To +achieve this, a convolutional autoencoder-based approach is employed that +effectively counters adversarial perturbations introduced to the input images. +By generating images closely resembling the input images, the proposed +methodology aims to restore the model's accuracy. + 
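+ As a rough, hedged illustration of the idea (not the paper's exact architecture), a small convolutional autoencoder can be trained to reconstruct clean-looking images and placed in front of a frozen classifier at test time; the PyTorch sketch below assumes 32x32 RGB inputs and illustrative layer sizes.

```python
import torch
import torch.nn as nn

class DenoisingConvAE(nn.Module):
    """Minimal convolutional autoencoder used as an input-purification step.
    Illustrative sketch only; layer sizes are assumptions, not the paper's design."""
    def __init__(self):
        super().__init__()
        self.encoder = nn.Sequential(
            nn.Conv2d(3, 32, 3, stride=2, padding=1), nn.ReLU(),   # 32x32 -> 16x16
            nn.Conv2d(32, 64, 3, stride=2, padding=1), nn.ReLU(),  # 16x16 -> 8x8
        )
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(64, 32, 4, stride=2, padding=1), nn.ReLU(),     # 8x8 -> 16x16
            nn.ConvTranspose2d(32, 3, 4, stride=2, padding=1), nn.Sigmoid(),   # 16x16 -> 32x32
        )

    def forward(self, x):
        return self.decoder(self.encoder(x))

def purified_predict(autoencoder, classifier, x_adv):
    """Reconstruct the (possibly adversarial) input, then classify the reconstruction."""
    with torch.no_grad():
        x_clean = autoencoder(x_adv)
        return classifier(x_clean).argmax(dim=1)
```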
+
+ comment: 9 pages, 6 figures, 3 tables +
+
+
+
+
+ + ☆ Active Wildfires Detection and Dynamic Escape Routes Planning for Humans + through Information Fusion between Drones and Satellites + + +
+ UAVs are playing an increasingly important role in the field of wilderness +rescue by virtue of their flexibility. This paper proposes fusing UAV +vision technology with satellite image analysis for active wildfire +detection, road network extraction in wildfire areas, and real-time dynamic +escape route planning for people in distress. Firstly, the fire source is located +and the smoke and flames are segmented based on Sentinel-2 +satellite imagery. Secondly, road segmentation and road condition +assessment are performed with D-linkNet and NDVI values on UAV imagery of +the central fire area. Finally, real-time dynamic optimal route planning for humans +is performed by the weighted A* algorithm on the road network, combined with +the dynamic fire spread model. Taking the Chongqing wildfire on August 24, +2022, as a case study, the results demonstrate that the dynamic escape route +planning algorithm can provide an optimal real-time navigation path for humans +in the presence of fire through the information fusion of UAVs and satellites. + 
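+ To make the routing step concrete, the following is a minimal weighted A* sketch on a 2D grid whose per-cell costs stand in for fire risk from the spread model; the grid representation, the weight w, and the Manhattan heuristic are illustrative assumptions rather than the paper's exact formulation.

```python
import heapq
import itertools

def weighted_a_star(grid_cost, start, goal, w=1.5):
    """Weighted A* on a 2D grid: f(n) = g(n) + w * h(n), with w >= 1 trading
    path optimality for search speed. grid_cost[r][c] is the traversal cost of
    a cell (e.g. higher near the fire front); None marks impassable cells."""
    rows, cols = len(grid_cost), len(grid_cost[0])
    h = lambda p: abs(p[0] - goal[0]) + abs(p[1] - goal[1])  # Manhattan heuristic
    tie = itertools.count()                                  # breaks ties in the heap
    open_heap = [(w * h(start), next(tie), 0.0, start, None)]
    parents, g_best = {}, {start: 0.0}
    while open_heap:
        _, _, g, node, parent = heapq.heappop(open_heap)
        if node in parents:                                  # already expanded
            continue
        parents[node] = parent
        if node == goal:                                     # reconstruct the route
            path = [node]
            while parents[path[-1]] is not None:
                path.append(parents[path[-1]])
            return path[::-1]
        r, c = node
        for nr, nc in ((r + 1, c), (r - 1, c), (r, c + 1), (r, c - 1)):
            if 0 <= nr < rows and 0 <= nc < cols and grid_cost[nr][nc] is not None:
                ng = g + grid_cost[nr][nc]
                if ng < g_best.get((nr, nc), float("inf")):
                    g_best[(nr, nc)] = ng
                    heapq.heappush(open_heap,
                                   (ng + w * h((nr, nc)), next(tie), ng, (nr, nc), node))
    return None  # no route found
```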
+
+ comment: 6 pages, 10 figures, conference +
+
+
+
+
+ + ☆ FRDiff: Feature Reuse for Exquisite Zero-shot Acceleration of Diffusion + Models + + +
+ The substantial computational costs of diffusion models, particularly due to +the repeated denoising steps crucial for high-quality image generation, present +a major obstacle to their widespread adoption. While several studies have +attempted to address this issue by reducing the number of score function +evaluations using advanced ODE solvers without fine-tuning, the decreased +number of denoising iterations misses the opportunity to update fine details, +resulting in noticeable quality degradation. In our work, we introduce an +advanced acceleration technique that leverages the temporal redundancy inherent +in diffusion models. Reusing feature maps with high temporal similarity opens +up a new opportunity to save computation without sacrificing output quality. To +realize the practical benefits of this intuition, we conduct an extensive +analysis and propose a novel method, FRDiff. FRDiff is designed to harness the +advantages of both reduced NFE and feature reuse, achieving a Pareto frontier +that balances fidelity and latency trade-offs in various generative tasks. + +
+
+ comment: under review +
+
+
+
+
+ + ☆ Kandinsky 3.0 Technical Report + + +
+ We present Kandinsky 3.0, a large-scale text-to-image generation model based +on latent diffusion, continuing the series of text-to-image Kandinsky models +and reflecting our progress towards higher quality and realism of image +generation. Compared to previous versions of Kandinsky 2.x, Kandinsky 3.0 +leverages a two times larger U-Net backbone and a ten times larger text encoder, +and removes diffusion mapping. We describe the architecture of the model, the +data collection procedure, the training technique, and the production system of +user interaction. We focus on the key components that, as identified through +a large number of experiments, had the most significant impact on +improving the quality of our model compared to the others. In our side-by-side +comparisons, Kandinsky 3.0 shows better text understanding and works better on +specific domains. Project page: https://ai-forever.github.io/Kandinsky-3 + 
+
+ comment: Project page: https://ai-forever.github.io/Kandinsky-3 +
+
+
+
+
+ + ☆ Gravitational cell detection and tracking in fluorescence microscopy + data + + +
+ Automatic detection and tracking of cells in microscopy images are major +applications of computer vision technologies in both biomedical research and +clinical practice. Though machine learning methods are increasingly common in +these fields, classical algorithms still offer significant advantages for both +tasks, including better explainability, faster computation, lower hardware +requirements and more consistent performance. In this paper, we present a novel +approach based on gravitational force fields that can compete with, and +potentially outperform modern machine learning models when applied to +fluorescence microscopy images. This method includes detection, segmentation, +and tracking elements, with the results demonstrated on a Cell Tracking +Challenge dataset. + +
+
+ comment: 5 pages, 2 figures, 1 formula, 1 table, submitted to the 21st + International Symposium on Biomedical Imaging (ISBI 2024) +
+
+
+
+
+ + ☆ Improving the Generalization of Segmentation Foundation Model under + Distribution Shift via Weakly Supervised Adaptation + + +
+ The success of large language models has inspired the computer vision +community to explore image segmentation foundation models that can +generalize zero/few-shot through prompt engineering. Segment Anything (SAM), +among others, is the state-of-the-art image segmentation foundation model +demonstrating strong zero/few-shot generalization. Despite the success, recent +studies reveal the weakness of SAM under strong distribution shift. In +particular, SAM performs awkwardly on corrupted natural images, camouflaged +images, medical images, etc. Motivated by these observations, we aim to develop a +self-training based strategy to adapt SAM to the target distribution. Given the +unique challenges of a large source dataset, high computation cost, and incorrect +pseudo labels, we propose a weakly supervised self-training architecture with +anchor regularization and low-rank finetuning to improve the robustness and +computation efficiency of adaptation. We validate the effectiveness on 5 types +of downstream segmentation tasks including natural clean/corrupted images, +medical images, camouflaged images and robotic images. Our proposed method is +task-agnostic in nature and outperforms pre-trained SAM and state-of-the-art +domain adaptation methods on almost all downstream tasks with the same testing +prompt inputs. + 
+
+ comment: 20 pages, 12 figures +
+
+
+
+
+ + ☆ PneumoLLM: Harnessing the Power of Large Language Model for + Pneumoconiosis Diagnosis + + +
+ The conventional pretraining-and-finetuning paradigm, while effective for +common diseases with ample data, faces challenges in diagnosing data-scarce +occupational diseases like pneumoconiosis. Recently, large language models +(LLMs) have exhibited unprecedented abilities when conducting multiple tasks in +dialogue, bringing opportunities to diagnosis. A common strategy might involve +using adapter layers for vision-language alignment and diagnosis in a dialogic +manner. Yet, this approach often requires optimization of extensive learnable +parameters in the text branch and the dialogue head, potentially diminishing +the LLMs' efficacy, especially with limited training data. In our work, we +innovate by eliminating the text branch and substituting the dialogue head with +a classification head. This approach presents a more effective method for +harnessing LLMs in diagnosis with fewer learnable parameters. Furthermore, to +balance the retention of detailed image information with progression towards +accurate diagnosis, we introduce the contextual multi-token engine. This engine +is specialized in adaptively generating diagnostic tokens. Additionally, we +propose the information emitter module, which unidirectionally emits +information from image tokens to diagnosis tokens. Comprehensive experiments +validate the superiority of our methods and the effectiveness of the proposed +modules. Our code can be found at +https://github.com/CodeMonsterPHD/PneumoLLM/tree/main. + 
+
+ comment: submitted to Medical Image Analysis +
+
+
+
+
+ + ☆ From Detection to Action Recognition: An Edge-Based Pipeline for Robot + Human Perception + + +
+ Mobile service robots are proving to be increasingly effective in a range of +applications, such as healthcare, monitoring Activities of Daily Living (ADL), +and facilitating Ambient Assisted Living (AAL). These robots heavily rely on +Human Action Recognition (HAR) to interpret human actions and intentions. +However, for HAR to function effectively on service robots, it requires prior +knowledge of human presence (human detection) and identification of individuals +to monitor (human tracking). In this work, we propose an end-to-end pipeline +that encompasses the entire process, starting from human detection and +tracking, leading to action recognition. The pipeline is designed to operate in +near real-time while ensuring all stages of processing are performed on the +edge, reducing the need for centralised computation. To identify the most +suitable models for our mobile robot, we conducted a series of experiments +comparing state-of-the-art solutions based on both their detection performance +and efficiency. To evaluate the effectiveness of our proposed pipeline, we +proposed a dataset comprising daily household activities. By presenting our +findings and analysing the results, we demonstrate the efficacy of our approach +in enabling mobile robots to understand and respond to human behaviour in +real-world scenarios relying mainly on the data from their RGB cameras. + +
+
+ comment: 7 pages, 10 figures, 2 tables +
+
+
+
+
+ + ☆ HiFi4G: High-Fidelity Human Performance Rendering via Compact Gaussian + Splatting + + +
+ We have recently seen tremendous progress in photo-real human modeling and +rendering. Yet, efficiently rendering realistic human performance and +integrating it into the rasterization pipeline remains challenging. In this +paper, we present HiFi4G, an explicit and compact Gaussian-based approach for +high-fidelity human performance rendering from dense footage. Our core +intuition is to marry the 3D Gaussian representation with non-rigid tracking, +achieving a compact and compression-friendly representation. We first propose a +dual-graph mechanism to obtain motion priors, with a coarse deformation graph +for effective initialization and a fine-grained Gaussian graph to enforce +subsequent constraints. Then, we utilize a 4D Gaussian optimization scheme with +adaptive spatial-temporal regularizers to effectively balance the non-rigid +prior and Gaussian updating. We also present a companion compression scheme +with residual compensation for immersive experiences on various platforms. It +achieves a substantial compression rate of approximately 25 times, with less +than 2MB of storage per frame. Extensive experiments demonstrate the +effectiveness of our approach, which significantly outperforms existing +approaches in terms of optimization speed, rendering quality, and storage +overhead. + +
+
+
+
+
+ + ☆ F3-Pruning: A Training-Free and Generalized Pruning Strategy towards + Faster and Finer Text-to-Video Synthesis + + +
+ Recently, Text-to-Video (T2V) synthesis has undergone a breakthrough by +training transformers or diffusion models on large-scale datasets. +Nevertheless, inferring such large models incurs huge costs. Previous inference +acceleration works either require costly retraining or are model-specific. To +address this issue, instead of retraining we explore the inference process of +two mainstream T2V models using transformers and diffusion models. The +exploration reveals the redundancy in temporal attention modules of both +models, which are commonly utilized to establish temporal relations among +frames. Consequently, we propose a training-free and generalized pruning +strategy called F3-Pruning to prune redundant temporal attention +weights. Specifically, when aggregate temporal attention values are ranked below +a certain ratio, corresponding weights will be pruned. Extensive experiments on +three datasets using a classic transformer-based model CogVideo and a typical +diffusion-based model Tune-A-Video verify the effectiveness of F3-Pruning in +inference acceleration, quality assurance and broad applicability. + 
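+ A schematic reading of the pruning rule described above (aggregate temporal attention values, rank them, and prune whatever falls below a keep ratio) is sketched below; aggregating per attention head and the tensor shapes are assumptions made for illustration, not the released implementation.

```python
import torch

def f3_style_prune(attn_weights, keep_ratio=0.75):
    """attn_weights: tensor of shape (heads, frames, frames) holding temporal
    attention maps. Heads whose aggregate attention ranks in the bottom
    (1 - keep_ratio) fraction are masked out. Training-free pruning sketch;
    the real method prunes attention weights, here simplified to whole heads."""
    head_scores = attn_weights.sum(dim=(1, 2))            # aggregate value per head
    k = max(1, int(keep_ratio * head_scores.numel()))
    threshold = torch.topk(head_scores, k).values.min()   # score of the k-th best head
    mask = (head_scores >= threshold).float().view(-1, 1, 1)
    return attn_weights * mask                             # pruned attention

# usage sketch: pruned = f3_style_prune(torch.rand(8, 16, 16), keep_ratio=0.75)
```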
+
+
+
+
+ + ☆ Data is Overrated: Perceptual Metrics Can Lead Learning in the Absence + of Training Data NeurIPS 2023 + + +
+ Perceptual metrics are traditionally used to evaluate the quality of natural +signals, such as images and audio. They are designed to mimic the perceptual +behaviour of human observers and usually reflect structures found in natural +signals. This motivates their use as loss functions for training generative +models such that models will learn to capture the structure held in the metric. +We take this idea to the extreme in the audio domain by training a compressive +autoencoder to reconstruct uniform noise, in lieu of natural data. We show that +training with perceptual losses improves the reconstruction of spectrograms and +re-synthesized audio at test time over models trained with a standard Euclidean +loss. This demonstrates better generalisation to unseen natural signals when +using perceptual metrics. + +
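+ As a hedged sketch of training with a perceptual-style loss instead of a plain Euclidean one, the snippet below uses a multi-resolution log-spectrogram distance as a simple stand-in; the paper's actual perceptual metrics may differ.

```python
import torch

def multi_resolution_stft_loss(pred, target, fft_sizes=(512, 1024, 2048)):
    """Simple stand-in for a perceptual audio loss: compare log-magnitude
    spectrograms at several resolutions instead of raw waveform MSE.
    pred/target: (batch, samples). Illustrative only."""
    loss = 0.0
    for n_fft in fft_sizes:
        window = torch.hann_window(n_fft, device=pred.device)
        spec = lambda x: torch.stft(x, n_fft, hop_length=n_fft // 4,
                                    window=window, return_complex=True).abs()
        loss = loss + (torch.log1p(spec(pred)) - torch.log1p(spec(target))).abs().mean()
    return loss / len(fft_sizes)
```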
+
+ comment: Machine Learning for Audio Workshop, NeurIPS 2023 +
+
+
+
+
+ + ☆ Quantum-Inspired Neural Network Model of Optical Illusions + + +
+ Ambiguous optical illusions have been a paradigmatic object of fascination, +research and inspiration in arts, psychology and video games. However, accurate +computational models of perception of ambiguous figures have been elusive. In +this paper, we design and train a deep neural network model to simulate +human perception of the Necker cube, an ambiguous drawing with several +alternating possible interpretations. Defining the weights of the neural +network connections using a quantum generator of truly random numbers, in +agreement with the emerging concepts of quantum artificial intelligence and +quantum cognition, we reveal that the actual perceptual state of the Necker cube +is a qubit-like superposition of the two fundamental perceptual states +predicted by classical theories. Our results will find applications in video +games and virtual reality systems employed for training of astronauts and +operators of unmanned aerial vehicles. They will also be useful for researchers +working in the fields of machine learning and vision, psychology of perception +and quantum-mechanical models of the human mind and decision-making. + 
+
+
+
+
+ + ☆ Data-driven Crop Growth Simulation on Time-varying Generated Images + using Multi-conditional Generative Adversarial Networks + + +
+ Image-based crop growth modeling can substantially contribute to precision +agriculture by revealing spatial crop development over time, which allows an +early and location-specific estimation of relevant future plant traits, such as +leaf area or biomass. A prerequisite for realistic and sharp crop image +generation is the integration of multiple growth-influencing conditions in a +model, such as an image of an initial growth stage, the associated growth time, +and further information about the field treatment. We present a two-stage +framework consisting first of an image prediction model and second of a growth +estimation model, which both are independently trained. The image prediction +model is a conditional Wasserstein generative adversarial network (CWGAN). In +the generator of this model, conditional batch normalization (CBN) is used to +integrate different conditions along with the input image. This allows the +model to generate time-varying artificial images dependent on multiple +influencing factors of different kinds. These images are used by the second +part of the framework for plant phenotyping by deriving plant-specific traits +and comparing them with those of non-artificial (real) reference images. For +various crop datasets, the framework allows realistic, sharp image predictions +with a slight loss of quality from short-term to long-term predictions. +Simulations of varying growth-influencing conditions performed with the trained +framework provide valuable insights into how such factors relate to crop +appearances, which is particularly useful in complex, less explored crop +mixture systems. Further results show that adding process-based simulated +biomass as a condition increases the accuracy of the derived phenotypic traits +from the predicted images. This demonstrates the potential of our framework to +serve as an interface between an image- and process-based crop growth model. + +
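+ The conditional batch normalization (CBN) used in the generator can be sketched as a BatchNorm layer whose scale and shift are predicted from the conditioning vector (growth time, treatment, and other factors); the layer below is an illustrative PyTorch sketch with assumed dimensions, not the paper's exact implementation.

```python
import torch
import torch.nn as nn

class ConditionalBatchNorm2d(nn.Module):
    """BatchNorm2d whose affine parameters are predicted from a condition vector,
    letting a single generator produce images that depend on growth time,
    treatment, etc. Illustrative sketch of the CBN idea."""
    def __init__(self, num_features, cond_dim):
        super().__init__()
        self.bn = nn.BatchNorm2d(num_features, affine=False)
        self.gain = nn.Linear(cond_dim, num_features)
        self.bias = nn.Linear(cond_dim, num_features)

    def forward(self, x, cond):
        out = self.bn(x)
        g = self.gain(cond).unsqueeze(-1).unsqueeze(-1)   # (B, C, 1, 1)
        b = self.bias(cond).unsqueeze(-1).unsqueeze(-1)
        return (1 + g) * out + b
```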
+
+ comment: 26 pages, 16 figures, code available at + https://github.com/luked12/crop-growth-cgan +
+
+
+
+
+ + ☆ High-Quality Facial Geometry and Appearance Capture at Home + + +
+ Facial geometry and appearance capture have demonstrated tremendous success +in 3D scanning real humans in studios. Recent works propose to democratize this +technique while keeping the results high quality. However, they are still +inconvenient for daily usage. In addition, they focus on an easier problem of +only capturing facial skin. This paper proposes a novel method for high-quality +face capture, featuring an easy-to-use system and the capability to model the +complete face with skin, mouth interior, hair, and eyes. We reconstruct facial +geometry and appearance from a single co-located smartphone flashlight sequence +captured in a dim room where the flashlight is the dominant light source (e.g. +rooms with curtains or at night). To model the complete face, we propose a +novel hybrid representation to effectively model both eyes and other facial +regions, along with novel techniques to learn it from images. We apply a +combined lighting model to compactly represent real illuminations and exploit a +morphable face albedo model as a reflectance prior to disentangle diffuse and +specular. Experiments show that our method can capture high-quality 3D +relightable scans. + +
+
+ comment: Project page: https://yxuhan.github.io/CoRA/index.html ; Github repo: + https://github.com/yxuhan/CoRA +
+
+
+
+
+ + ☆ UFineBench: Towards Text-based Person Retrieval with Ultra-fine + Granularity + + +
+ Existing text-based person retrieval datasets often have relatively +coarse-grained text annotations. This hinders the model from comprehending the +fine-grained semantics of query texts in real scenarios. To address this +problem, we contribute a new benchmark named \textbf{UFineBench} for text-based +person retrieval with ultra-fine granularity. + Firstly, we construct a new \textbf{dataset} named UFine6926. We collect a +large number of person images and manually annotate each image with two +detailed textual descriptions, averaging 80.8 words each. The average word +count is three to four times that of the previous datasets. In addition to +standard in-domain evaluation, we also propose a special \textbf{evaluation +paradigm} more representative of real scenarios. It contains a new evaluation +set with cross domains, cross textual granularity and cross textual styles, +named UFine3C, and a new evaluation metric for accurately measuring retrieval +ability, named mean Similarity Distribution (mSD). Moreover, we propose CFAM, a +more efficient \textbf{algorithm} especially designed for text-based person +retrieval with ultra fine-grained texts. It achieves fine granularity mining by +adopting a shared cross-modal granularity decoder and a hard negative match +mechanism. + With standard in-domain evaluation, CFAM establishes competitive performance +across various datasets, especially on our ultra fine-grained UFine6926. +Furthermore, by evaluating on UFine3C, we demonstrate that training on our +UFine6926 significantly improves generalization to real scenarios compared with +other coarse-grained datasets. The dataset and code will be made publicly +available at \url{https://github.com/Zplusdragon/UFineBench}. + 
+
+
+
+
+ + ☆ Data-Centric Digital Agriculture: A Perspective + + +
+ In response to the increasing global demand for food, feed, fiber, and fuel, +digital agriculture is rapidly evolving to meet these demands while reducing +environmental impact. This evolution involves incorporating data science, +machine learning, sensor technologies, robotics, and new management strategies +to establish a more sustainable agricultural framework. So far, machine +learning research in digital agriculture has predominantly focused on +model-centric approaches, focusing on model design and evaluation. These +efforts aim to optimize model accuracy and efficiency, often treating data as a +static benchmark. Despite the availability of agricultural data and +methodological advancements, a saturation point has been reached, with many +established machine learning methods achieving comparable levels of accuracy +and facing similar limitations. To fully realize the potential of digital +agriculture, it is crucial to have a comprehensive understanding of the role of +data in the field and to adopt data-centric machine learning. This involves +developing strategies to acquire and curate valuable data and implementing +effective learning and evaluation strategies that utilize the intrinsic value +of data. This approach has the potential to create accurate, generalizable, and +adaptable machine learning methods that effectively and sustainably address +agricultural tasks such as yield prediction, weed detection, and early disease +identification + +
+
+
+
+
+ + ☆ Gaussian-Flow: 4D Reconstruction with Dynamic 3D Gaussian Particle + + +
+ We introduce Gaussian-Flow, a novel point-based approach for fast dynamic +scene reconstruction and real-time rendering from both multi-view and monocular +videos. In contrast to the prevalent NeRF-based approaches hampered by slow +training and rendering speeds, our approach harnesses recent advancements in +point-based 3D Gaussian Splatting (3DGS). Specifically, a novel Dual-Domain +Deformation Model (DDDM) is proposed to explicitly model attribute deformations +of each Gaussian point, where the time-dependent residual of each attribute is +captured by a polynomial fitting in the time domain, and a Fourier series +fitting in the frequency domain. The proposed DDDM is capable of modeling +complex scene deformations across long video footage, eliminating the need for +training separate 3DGS for each frame or introducing an additional implicit +neural field to model 3D dynamics. Moreover, the explicit deformation modeling +for discretized Gaussian points ensures ultra-fast training and rendering of a +4D scene, which is comparable to the original 3DGS designed for static 3D +reconstruction. Our proposed approach showcases a substantial efficiency +improvement, achieving a $5\times$ faster training speed compared to the +per-frame 3DGS modeling. In addition, quantitative results demonstrate that the +proposed Gaussian-Flow significantly outperforms previous leading methods in +novel view rendering quality. Project page: +https://nju-3dv.github.io/projects/Gaussian-Flow + +
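+ The Dual-Domain Deformation Model can be pictured as a per-Gaussian function of time combining a polynomial and a truncated Fourier series; the module below is a schematic reading of that description, with the polynomial and Fourier orders chosen arbitrarily for illustration rather than taken from the paper.

```python
import math
import torch
import torch.nn as nn

class DualDomainResidual(nn.Module):
    """Time-dependent residual for one Gaussian attribute (e.g. position),
    modelled as polynomial(t) + Fourier series(t). Orders are illustrative."""
    def __init__(self, attr_dim=3, poly_order=3, fourier_order=4):
        super().__init__()
        self.poly_coef = nn.Parameter(torch.zeros(poly_order + 1, attr_dim))
        self.sin_coef = nn.Parameter(torch.zeros(fourier_order, attr_dim))
        self.cos_coef = nn.Parameter(torch.zeros(fourier_order, attr_dim))
        self.register_buffer("freqs", torch.arange(1, fourier_order + 1).float())

    def forward(self, t):
        """t: scalar time in [0, 1]; returns the residual added to the static attribute."""
        t = torch.as_tensor(t, dtype=self.poly_coef.dtype)
        powers = torch.stack([t ** i for i in range(self.poly_coef.shape[0])])  # (P+1,)
        poly = (powers.unsqueeze(-1) * self.poly_coef).sum(dim=0)
        phase = 2 * math.pi * self.freqs * t                                    # (F,)
        fourier = (torch.sin(phase).unsqueeze(-1) * self.sin_coef
                   + torch.cos(phase).unsqueeze(-1) * self.cos_coef).sum(dim=0)
        return poly + fourier
```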
+
+
+
+
+ + ☆ ShareCMP: Polarization-Aware RGB-P Semantic Segmentation + + +
+ Multimodal semantic segmentation is developing rapidly, but the modality of +RGB-Polarization remains underexplored. To delve into this problem, we +construct a UPLight RGB-P segmentation benchmark with 12 typical underwater +semantic classes which provides data support for Autonomous Underwater Vehicles +(AUVs) to perform special perception tasks. In this work, we design the +ShareCMP, an RGB-P semantic segmentation framework with a shared dual-branch +architecture, which reduces the number of parameters by about 26-33% compared +to previous dual-branch models. It encompasses a Polarization Generate +Attention (PGA) module designed to generate polarization modal images with +richer polarization properties for the encoder. In addition, we introduce the +Class Polarization-Aware Loss (CPALoss) to improve the learning and +understanding of the encoder for polarization modal information and to optimize +the PGA module. With extensive experiments on a total of three RGB-P +benchmarks, our ShareCMP achieves state-of-the-art performance in mIoU with +fewer parameters on the UPLight (92.45%), ZJU (92.7%), and MCubeS (50.99%) +datasets. The code is available at https://github.com/LEFTeyex/ShareCMP. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ Artist-Friendly Relightable and Animatable Neural Heads + + +
+ An increasingly common approach for creating photo-realistic digital avatars +is through the use of volumetric neural fields. The original neural radiance +field (NeRF) allowed for impressive novel view synthesis of static heads when +trained on a set of multi-view images, and follow up methods showed that these +neural representations can be extended to dynamic avatars. Recently, new +variants also surpassed the usual drawback of baked-in illumination in neural +representations, showing that static neural avatars can be relit in any +environment. In this work we simultaneously tackle both the motion and +illumination problem, proposing a new method for relightable and animatable +neural heads. Our method builds on a proven dynamic avatar approach based on a +mixture of volumetric primitives, combined with a recently-proposed lightweight +hardware setup for relightable neural fields, and includes a novel architecture +that allows relighting dynamic neural avatars performing unseen expressions in +any environment, even with nearfield illumination and viewpoints. + +
+
+
+
+
+ + ☆ DeepPyramid+: Medical Image Segmentation using Pyramid View Fusion and + Deformable Pyramid Reception + + +
+ Semantic Segmentation plays a pivotal role in many applications related to +medical image and video analysis. However, designing a neural network +architecture for medical image and surgical video segmentation is challenging +due to the diverse features of relevant classes, including heterogeneity, +deformability, transparency, blunt boundaries, and various distortions. We +propose a network architecture, DeepPyramid+, which addresses diverse +challenges encountered in medical image and surgical video segmentation. The +proposed DeepPyramid+ incorporates two major modules, namely "Pyramid View +Fusion" (PVF) and "Deformable Pyramid Reception," (DPR), to address the +outlined challenges. PVF replicates a deduction process within the neural +network, aligning with the human visual system, thereby enhancing the +representation of relative information at each pixel position. Complementarily, +DPR introduces shape- and scale-adaptive feature extraction techniques using +dilated deformable convolutions, enhancing accuracy and robustness in handling +heterogeneous classes and deformable shapes. Extensive experiments conducted on +diverse datasets, including endometriosis videos, MRI images, OCT scans, and +cataract and laparoscopy videos, demonstrate the effectiveness of DeepPyramid+ +in handling various challenges such as shape and scale variation, reflection, +and blur degradation. DeepPyramid+ demonstrates significant improvements in +segmentation performance, achieving up to a 3.65% increase in Dice coefficient +for intra-domain segmentation and up to a 17% increase in Dice coefficient for +cross-domain segmentation. DeepPyramid+ consistently outperforms +state-of-the-art networks across diverse modalities considering different +backbone networks, showcasing its versatility. + +
+
+ comment: 13 pages, 3 figures +
+
+
+
+
+ + ☆ Open-sourced Data Ecosystem in Autonomous Driving: the Present and + Future + + +
+ With the continuous maturation and application of autonomous driving +technology, a systematic examination of open-source autonomous driving datasets +becomes instrumental in fostering the robust evolution of the industry +ecosystem. Current autonomous driving datasets can broadly be categorized into +two generations. The first-generation autonomous driving datasets are +characterized by relatively simpler sensor modalities, smaller data scale, and +are limited to perception-level tasks. KITTI, introduced in 2012, serves as a +prominent representative of this initial wave. In contrast, the +second-generation datasets exhibit heightened complexity in sensor modalities, +greater data scale and diversity, and an expansion of tasks from perception to +encompass prediction and control. Leading examples of the second generation +include nuScenes and Waymo, introduced around 2019. This comprehensive review, +conducted in collaboration with esteemed colleagues from both academia and +industry, systematically assesses over seventy open-source autonomous driving +datasets from domestic and international sources. It offers insights into +various aspects, such as the principles underlying the creation of high-quality +datasets, the pivotal role of data engine systems, and the utilization of +generative foundation models to facilitate scalable data generation. +Furthermore, this review undertakes an exhaustive analysis and discourse +regarding the characteristics and data scales that future third-generation +autonomous driving datasets should possess. It also delves into the scientific +and technical challenges that warrant resolution. These endeavors are pivotal +in advancing autonomous innovation and fostering technological enhancement in +critical domains. For further details, please refer to +https://github.com/OpenDriveLab/DriveAGI. + 
+
+ comment: This article is a simplified English translation of corresponding + Chinese article. Please refer to Chinese version for the complete content +
+
+
+
+
+ + ☆ SVQ: Sparse Vector Quantization for Spatiotemporal Forecasting + + +
+ Spatiotemporal forecasting tasks, such as weather forecasting and traffic +prediction, offer significant societal benefits. These tasks can be effectively +approached as image forecasting problems using computer vision models. Vector +quantization (VQ) is a well-known method for discrete representation that +improves the latent space, leading to enhanced generalization and transfer +learning capabilities. One of the main challenges in using VQ for +spatiotemporal forecasting is how to balance between keeping enough details and +removing noises from the original patterns for better generalization. We +address this challenge by developing sparse vector quantization, or {\bf SVQ} +for short, that leverages sparse regression to make better trade-off between +the two objectives. The main innovation of this work is to approximate sparse +regression by a two-layer MLP and a randomly fixed or learnable matrix, +dramatically improving its computational efficiency. Through experiments +conducted on diverse datasets in multiple fields including weather forecasting, +traffic flow prediction, and video forecasting, we unequivocally demonstrate +that our proposed method consistently enhances the performance of base models +and achieves state-of-the-art results across all benchmarks. + +
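+ The core trick described above, approximating sparse regression with a two-layer MLP followed by a fixed (or learnable) matrix, can be sketched roughly as follows; the dimensions, the ReLU-induced sparsity, and the residual connection are assumptions for illustration only, not the official SVQ implementation.

```python
import torch
import torch.nn as nn

class SparseVectorQuantizer(nn.Module):
    """Schematic SVQ-style layer: a two-layer MLP predicts approximately sparse
    codes, which are mixed through a randomly fixed (or learnable) codebook
    matrix. Illustrative reading of the abstract."""
    def __init__(self, dim, hidden=256, num_codes=1024, learnable_codebook=False):
        super().__init__()
        self.mlp = nn.Sequential(nn.Linear(dim, hidden), nn.ReLU(),
                                 nn.Linear(hidden, num_codes))
        codebook = torch.randn(num_codes, dim) / dim ** 0.5
        self.codebook = nn.Parameter(codebook, requires_grad=learnable_codebook)

    def forward(self, x):
        codes = torch.relu(self.mlp(x))       # non-negativity encourages sparsity (assumption)
        return x + codes @ self.codebook      # residual quantized representation (assumption)
```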
+
+
+
+
+ + ☆ Predicting Postoperative Intraocular Lens Dislocation in Cataract + Surgery via Deep Learning + + +
+ A critical yet unpredictable complication following cataract surgery is +intraocular lens dislocation. Postoperative stability is imperative, as even a +tiny decentration of multifocal lenses or inadequate alignment of the torus in +toric lenses due to postoperative rotation can lead to a significant drop in +visual acuity. Investigating possible intraoperative indicators that can +predict post-surgical instabilities of intraocular lenses can help prevent this +complication. In this paper, we develop and evaluate the first fully-automatic +framework for the computation of lens unfolding delay, rotation, and +instability during surgery. Adopting a combination of three types of CNNs, +namely recurrent, region-based, and pixel-based, the proposed framework is +employed to assess the possibility of predicting post-operative lens +dislocation during cataract surgery. This is achieved via performing a +large-scale study on the statistical differences between the behavior of +different brands of intraocular lenses and aligning the results with expert +surgeons' hypotheses and observations about the lenses. We exploit a +large-scale dataset of cataract surgery videos featuring four intraocular lens +brands. Experimental results confirm the reliability of the proposed framework +in evaluating the lens' statistics during the surgery. The Pearson correlation +and t-test results reveal significant correlations between lens unfolding delay +and lens rotation and significant differences between the intra-operative +rotations stability of four groups of lenses. These results suggest that the +proposed framework can help surgeons select the lenses based on the patient's +eye conditions and predict post-surgical lens dislocation. + +
+
+ comment: 12 pages, 5 figures +
+
+
+
+
+ + ☆ Action Scene Graphs for Long-Form Understanding of Egocentric Videos + + +
+ We present Egocentric Action Scene Graphs (EASGs), a new representation for +long-form understanding of egocentric videos. EASGs extend standard +manually-annotated representations of egocentric videos, such as verb-noun +action labels, by providing a temporally evolving graph-based description of +the actions performed by the camera wearer, including interacted objects, their +relationships, and how actions unfold in time. Through a novel annotation +procedure, we extend the Ego4D dataset by adding manually labeled Egocentric +Action Scene Graphs offering a rich set of annotations designed for long-form +egocentric video understanding. We hence define the EASG generation task and +provide a baseline approach, establishing preliminary benchmarks. Experiments +on two downstream tasks, egocentric action anticipation and egocentric activity +summarization, highlight the effectiveness of EASGs for long-form egocentric +video understanding. We will release the dataset and the code to replicate +experiments and annotations. + 
+
+
+
+
+ + ☆ Riemannian Complex Matrix Convolution Network for PolSAR Image + Classification + + +
+ Recently, deep learning methods have achieved superior performance for +Polarimetric Synthetic Aperture Radar (PolSAR) image classification. Existing +deep learning methods learn PolSAR data by converting the covariance matrix +into a feature vector or complex-valued vector as the input. However, none of these +methods can learn the structure of the complex matrix directly, and they destroy the +channel correlation. To learn the geometric structure of the complex matrix, we propose +a Riemannian complex matrix convolution network for PolSAR image classification +in Riemannian space for the first time, which directly utilizes the complex +matrix as the network input and defines Riemannian operations to learn the +complex matrix's features. The proposed Riemannian complex matrix convolution +network considers the PolSAR complex matrix as endowed with a Riemannian manifold structure, and +defines a series of new Riemannian convolution, ReLU and LogEig operations in +Riemannian space, which breaks through the Euclidean constraint of conventional +networks. Then, a CNN module is appended to enhance contextual Riemannian +features. Besides, a fast kernel learning method is developed for the proposed +method to learn class-specific features and reduce the computation time +effectively. Experiments are conducted on three sets of real PolSAR data with +different bands and sensors. Experimental results demonstrate that the proposed +method obtains superior performance compared to state-of-the-art methods. + 
+
+
+
+
+ + ☆ Evaluating the point cloud of individual trees generated from images + based on Neural Radiance fields (NeRF) method + + +
+ Three-dimensional (3D) reconstruction of trees has always been a key task in +precision forestry management and research. Due to the complex branch +morphological structure of trees themselves and the occlusions from tree stems, +branches and foliage, it is difficult to recreate a complete three-dimensional +tree model from a two-dimensional image by conventional photogrammetric +methods. In this study, based on tree images collected by various cameras in +different ways, the Neural Radiance Fields (NeRF) method was used for +individual tree reconstruction and the exported point cloud models are compared +with point clouds derived from photogrammetric reconstruction and laser scanning +methods. The results show that the NeRF method performs well in individual tree +3D reconstruction, as it has a higher successful reconstruction rate, better +reconstruction in the canopy area, and requires fewer images as input. +Compared with the photogrammetric reconstruction method, NeRF has significant +advantages in reconstruction efficiency and is adaptable to complex scenes, but +the generated point cloud tends to be noisy and of low resolution. The accuracy of +tree structural parameters (tree height and diameter at breast height) +extracted from the photogrammetric point cloud is still higher than that +derived from the NeRF point cloud. The results of this study illustrate the +great potential of the NeRF method for individual tree reconstruction, and it +provides new ideas and research directions for 3D reconstruction and +visualization of complex forest scenes. + 
+
+ comment: 25 pages; 6 figures +
+
+
+
+
+ + ☆ Bottom-Up Instance Segmentation of Catheters for Chest X-Rays + + +
+ Chest X-ray (CXR) is frequently employed in emergency departments and +intensive care units to verify the proper placement of central lines and tubes +and to rule out related complications. The automation of the X-ray reading +process can be a valuable support tool for non-specialist technicians and +minimize reporting delays due to non-availability of experts. While existing +solutions for automated catheter segmentation and malposition detection show +promising results, the disentanglement of individual catheters remains an open +challenge, especially in complex cases where multiple devices appear +superimposed in the X-ray projection. Moreover, conventional top-down instance +segmentation methods are ineffective on such thin and long devices, that often +extend through the entire image. In this paper, we propose a deep learning +approach based on associative embeddings for catheter instance segmentation, +able to overcome those limitations and effectively handle device intersections. + +
+
+
+
+
+ + ☆ RING-NeRF: A Versatile Architecture based on Residual Implicit Neural + Grids + + +
+ Since their introduction, Neural Fields have become very popular for 3D +reconstruction and new view synthesis. Recent research has focused on +accelerating the process, as well as improving robustness to variations in +observation distance and to a limited number of supervised viewpoints. However, +these approaches often lead to dedicated solutions that cannot be easily +combined. To tackle this issue, we introduce a new simple but efficient +architecture named RING-NeRF, based on Residual Implicit Neural Grids, that +provides control over the level of detail of the mapping function between the +scene and the latent spaces. Associated with a distance-aware forward mapping +mechanism and a continuous coarse-to-fine reconstruction process, our versatile +architecture demonstrates both fast training and state-of-the-art performance +in terms of: (1) anti-aliased rendering, (2) reconstruction quality from few +supervised viewpoints, and (3) robustness in the absence of appropriate +scene-specific initialization for SDF-based NeRFs. We also demonstrate that our +architecture can dynamically add grids to increase the details of the +reconstruction, opening the way to adaptive reconstruction. + 
+
+
+
+
+ + ☆ PointMoment:Mixed-Moment-based Self-Supervised Representation Learning + for 3D Point Clouds + + +
+ Large and rich data is a prerequisite for effective training of deep neural +networks. However, the irregularity of point cloud data makes manual annotation +time-consuming and laborious. Self-supervised representation learning, which +leverages the intrinsic structure of large-scale unlabelled data to learn +meaningful feature representations, has attracted increasing attention in the +field of point cloud research. However, self-supervised representation learning +often suffers from model collapse, resulting in reduced information and +diversity of the learned representation, and consequently degrading the +performance of downstream tasks. To address this problem, we propose +PointMoment, a novel framework for point cloud self-supervised representation +learning that utilizes a high-order mixed moment loss function rather than the +conventional contrastive loss function. Moreover, our framework does not +require any special techniques such as asymmetric network architectures, +gradient stopping, etc. Specifically, we calculate the high-order mixed moment +of the feature variables and force them to decompose into products of their +individual moments, thereby making multiple variables more independent and +minimizing the feature redundancy. We also incorporate a contrastive learning +approach to maximize the feature invariance under different data augmentations +of the same point cloud. Experimental results show that our approach +outperforms previous unsupervised learning methods on the downstream tasks of 3D +point cloud classification and segmentation. + 
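+ A hedged sketch of a high-order mixed-moment penalty of the kind described above is given below: for sampled index tuples it pushes the mixed moment towards the product of the individual moments, encouraging independence between feature variables; the moment order and the sampling scheme are illustrative assumptions, not the paper's exact loss.

```python
import torch

def mixed_moment_loss(z, order=3, num_groups=256):
    """z: (batch, dim) embeddings. For randomly sampled index tuples, penalise the
    gap between the mixed moment E[z_i * z_j * z_k] and the product of the
    individual moments E[z_i] * E[z_j] * E[z_k]. Illustrative sketch only."""
    b, d = z.shape
    # distinct indices within each tuple, so the decomposition holds for independent variables
    idx = torch.stack([torch.randperm(d, device=z.device)[:order]
                       for _ in range(num_groups)])          # (num_groups, order)
    picked = z[:, idx]                                        # (batch, num_groups, order)
    mixed = picked.prod(dim=-1).mean(dim=0)                   # E[z_i * z_j * z_k]
    individual = picked.mean(dim=0).prod(dim=-1)              # E[z_i] * E[z_j] * E[z_k]
    return ((mixed - individual) ** 2).mean()
```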
+
+ comment: 15 pages,3 figures +
+
+
+
+
+ + ☆ GraNet: A Multi-Level Graph Network for 6-DoF Grasp Pose Generation in + Cluttered Scenes IROS 2023 + + +
+ 6-DoF object-agnostic grasping in unstructured environments is a critical yet +challenging task in robotics. Most current works use non-optimized approaches +to sample grasp locations and learn spatial features without concerning the +grasping task. This paper proposes GraNet, a graph-based grasp pose generation +framework that translates a point cloud scene into multi-level graphs and +propagates features through graph neural networks. By building graphs at the +scene level, object level, and grasp point level, GraNet enhances feature +embedding at multiple scales while progressively converging to the ideal +grasping locations by learning. Our pipeline can thus characterize the spatial +distribution of grasps in cluttered scenes, leading to a higher rate of +effective grasping. Furthermore, we enhance the representation ability of +scalable graph networks by a structure-aware attention mechanism to exploit +local relations in graphs. Our method achieves state-of-the-art performance on +the large-scale GraspNet-1Billion benchmark, especially in grasping unseen +objects (+11.62 AP). The real robot experiment shows a high success rate in +grasping scattered objects, verifying the effectiveness of the proposed +approach in unstructured environments. + +
+
+ comment: IROS 2023 +
+
+
+
+
+ + ☆ Online Vectorized HD Map Construction using Geometry + + +
+ The construction of online vectorized High-Definition (HD) maps is critical +for downstream prediction and planning. Recent efforts have built strong +baselines for this task; however, shapes and relations of instances in urban +road systems, such as parallelism, perpendicularity, or rectangular shapes, +are still under-explored. In our work, we propose GeMap ($\textbf{Ge}$ometry +$\textbf{Map}$), which learns Euclidean shapes and relations of map +instances end-to-end, beyond basic perception. Specifically, we design a geometric loss +based on angle and distance clues, which is robust to rigid transformations. We +also decouple self-attention to independently handle Euclidean shapes and +relations. Our method achieves new state-of-the-art performance on the NuScenes +and Argoverse 2 datasets. Remarkably, it reaches a 71.8% mAP on the large-scale +Argoverse 2 dataset, outperforming MapTR V2 by +4.4% and surpassing the 70% mAP +threshold for the first time. Code is available at +https://github.com/cnzzx/GeMap + 
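+ A geometric loss built on angle and distance clues can be sketched as below: it compares edge lengths and angles between consecutive edges of a predicted polyline against the ground truth, both quantities being invariant to rigid transformations; the shapes and equal weighting are assumptions, not the authors' exact loss.

```python
import torch
import torch.nn.functional as F

def rigid_invariant_geometry_loss(pred, gt, eps=1e-6):
    """pred, gt: (N, 2) ordered polyline vertices of one map instance (N >= 3).
    Penalises differences in edge lengths and in angles between consecutive
    edges; both quantities are unchanged by rotation and translation."""
    pred_edges, gt_edges = pred[1:] - pred[:-1], gt[1:] - gt[:-1]
    len_loss = F.l1_loss(pred_edges.norm(dim=-1), gt_edges.norm(dim=-1))

    def edge_angles(edges):
        a, b = edges[:-1], edges[1:]
        cos = (a * b).sum(-1) / (a.norm(dim=-1) * b.norm(dim=-1) + eps)
        return cos.clamp(-1.0, 1.0)

    angle_loss = F.l1_loss(edge_angles(pred_edges), edge_angles(gt_edges))
    return len_loss + angle_loss
```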
+
+ comment: Project website https://invictus717.github.io/GeMap/ +
+
+
+
+
+ + ☆ PointJEM: Self-supervised Point Cloud Understanding for Reducing Feature + Redundancy via Joint Entropy Maximization + + +
+ Most deep learning-based point cloud processing methods are supervised and +require a large amount of labeled data. However, manual labeling of point cloud +data is laborious and time-consuming. Self-supervised representation learning +can address the aforementioned issue by learning robust and generalized +representations from unlabeled datasets. Nevertheless, the embedded features +obtained by representation learning usually contain redundant information, and +most current methods reduce feature redundancy by linear correlation +constraints. In this paper, we propose PointJEM, a self-supervised +representation learning method applied to the point cloud field. PointJEM +comprises an embedding scheme and a loss function based on joint entropy. The +embedding scheme divides the embedding vector into different parts, each of which +can learn a distinctive feature. To reduce redundant information in the +features, PointJEM maximizes the joint entropy between the different parts, +thereby rendering the learned feature variables pairwise independent. To +validate the effectiveness of our method, we conducted experiments on multiple +datasets. The results demonstrate that our method can significantly reduce +feature redundancy beyond linear correlation. Furthermore, PointJEM achieves +competitive performance in downstream tasks such as classification and +segmentation. + 
+
+
+
+
+ + ☆ Building Category Graphs Representation with Spatial and Temporal + Attention for Visual Navigation + + +
+ Given an object of interest, visual navigation aims to reach the object's +location based on a sequence of partial observations. To this end, an agent +needs to 1) learn a piece of certain knowledge about the relations of object +categories in the world during training and 2) look for the target object based +on the pre-learned object category relations and its moving trajectory in the +current unseen environment. In this paper, we propose a Category Relation Graph +(CRG) to learn the knowledge of object category layout relations and a +Temporal-Spatial-Region (TSR) attention architecture to perceive the long-term +spatial-temporal dependencies of objects helping the navigation. We learn prior +knowledge of object layout, establishing a category relationship graph to +deduce the positions of specific objects. Subsequently, we introduced TSR to +capture the relationships of objects in temporal, spatial, and regions within +the observation trajectories. Specifically, we propose a Temporal attention +module (T) to model the temporal structure of the observation sequence, which +implicitly encodes the historical moving or trajectory information. Then, a +Spatial attention module (S) is used to uncover the spatial context of the +current observation objects based on the category relation graph and past +observations. Last, a Region attention module (R) shifts the attention to the +target-relevant region. Based on the visual representation extracted by our +method, the agent can better perceive the environment and easily learn superior +navigation policy. Experiments on AI2-THOR demonstrate our CRG-TSR method +significantly outperforms existing methods regarding both effectiveness and +efficiency. The code has been included in the supplementary material and will +be publicly available. + +
+
+ comment: 18 pages; 7 figures +
+
+
+
+
+ + ☆ GCFA:Geodesic Curve Feature Augmentation via Shape Space Theory + + +
+ Deep learning has yielded remarkable outcomes in various domains. However, +the challenge of requiring large-scale labeled samples still persists in deep +learning. Thus, data augmentation has been introduced as a critical strategy to +train deep learning models. However, data augmentation suffers from information +loss and poor performance in small sample environments. To overcome these +drawbacks, we propose a feature augmentation method based on shape space +theory, i.e., Geodesic curve feature augmentation, called GCFA for brevity. +First, we extract features from the image with a neural network model. Then, +the multiple image features are projected into a pre-shape space as features. +In the pre-shape space, a Geodesic curve is built to fit the features. Finally, +the features generated along the Geodesic curve are used to train various +machine learning models. The GCFA module can be seamlessly integrated with most +machine learning methods. The proposed method is simple, effective, and +insensitive to small sample sizes. Several examples demonstrate that +the GCFA method can greatly improve the performance of the data preprocessing +model in a small sample environment. + 
+
+
+
+
+ + ☆ Background Clustering Pre-training for Few-shot Segmentation ICIP 2023 + + +
+ Recent few-shot segmentation (FSS) methods introduce an extra pre-training +stage before meta-training to obtain a stronger backbone, which has become a +standard step in few-shot learning. Despite the effectiveness, current +pre-training scheme suffers from the merged background problem: only base +classes are labelled as foregrounds, making it hard to distinguish between +novel classes and actual background. In this paper, we propose a new +pre-training scheme for FSS via decoupling the novel classes from background, +called Background Clustering Pre-Training (BCPT). Specifically, we adopt online +clustering to the pixel embeddings of merged background to explore the +underlying semantic structures, bridging the gap between pre-training and +adaptation to novel classes. Given the clustering results, we further propose +the background mining loss and leverage base classes to guide the clustering +process, improving the quality and stability of clustering results. Experiments +on PASCAL-5i and COCO-20i show that BCPT yields advanced performance. Code will +be available. + +
+
+ comment: 6 pages, 2 figures, ICIP 2023 +
+
+
+
+
+ + ☆ Complementary Benefits of Contrastive Learning and Self-Training Under + Distribution Shift NeurIPS 2023 + + +
+ Self-training and contrastive learning have emerged as leading techniques for +incorporating unlabeled data, both under distribution shift (unsupervised +domain adaptation) and when it is absent (semi-supervised learning). However, +despite the popularity and compatibility of these techniques, their efficacy in +combination remains unexplored. In this paper, we undertake a systematic +empirical investigation of this combination, finding that (i) in domain +adaptation settings, self-training and contrastive learning offer significant +complementary gains; and (ii) in semi-supervised learning settings, +surprisingly, the benefits are not synergistic. Across eight distribution shift +datasets (e.g., BREEDs, WILDS), we demonstrate that the combined method obtains +3--8% higher accuracy than either approach independently. We then theoretically +analyze these techniques in a simplified model of distribution shift, +demonstrating scenarios under which the features produced by contrastive +learning can yield a good initialization for self-training to further amplify +gains and achieve optimal performance, even when either method alone would +fail. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ☆ DiffPMAE: Diffusion Masked Autoencoders for Point Cloud Reconstruction + + +
+ Point cloud streaming is becoming increasingly popular, evolving into the norm +for interactive service delivery and the future Metaverse. However, the +substantial volume of data associated with point clouds presents numerous +challenges, particularly in terms of high bandwidth consumption and large +storage capacity. Despite various solutions proposed thus far, with a focus on +point cloud compression, upsampling, and completion, these +reconstruction-related methods continue to fall short in delivering high +fidelity point cloud output. As a solution, in DiffPMAE, we propose an +effective point cloud reconstruction architecture. Inspired by self-supervised +learning concepts, we combine Masked Auto-Encoding and a Diffusion Model +mechanism to remotely reconstruct point cloud data. By the nature of this +reconstruction process, DiffPMAE can be extended to many related downstream +tasks including point cloud compression, upsampling and completion. Leveraging +ShapeNet-55 and ModelNet datasets with over 60000 objects, we validate that +DiffPMAE exceeds many state-of-the-art methods in terms of +auto-encoding and the downstream tasks considered. + 
+
+
+
+
+ + ☆ Cooperative Probabilistic Trajectory Forecasting under Occlusion + + +
+ Perception and planning under occlusion is essential for safety-critical +tasks. Occlusion-aware planning often requires communicating the information of +the occluded object to the ego agent for safe navigation. However, +communicating rich sensor information under adverse conditions during +communication loss and limited bandwidth may not be always feasible. Further, +in GPS denied environments and indoor navigation, localizing and sharing of +occluded objects can be challenging. To overcome this, relative pose estimation +between connected agents sharing a common field of view can be a +computationally effective way of communicating information about surrounding +objects. In this paper, we design an end-to-end network that cooperatively +estimates the current states of occluded pedestrian in the reference frame of +ego agent and then predicts the trajectory with safety guarantees. +Experimentally, we show that the uncertainty-aware trajectory prediction of +occluded pedestrian by the ego agent is almost similar to the ground truth +trajectory assuming no occlusion. The current research holds promise for +uncertainty-aware navigation among multiple connected agents under occlusion. + +
+
+ comment: 10 pages, 13 figures, 1 table +
+
+
+
+
+ + ☆ Class Incremental Learning for Adversarial Robustness + + +
+ Adversarial training integrates adversarial examples during model training to +enhance robustness. However, its application in fixed dataset settings differs +from real-world dynamics, where data accumulates incrementally. In this study, +we investigate Adversarially Robust Class Incremental Learning (ARCIL), a +method that combines adversarial robustness with incremental learning. We +observe that combining incremental learning with naive adversarial training +easily leads to a loss of robustness. We discover that this is attributed to +the disappearance of the flatness of the loss function, a characteristic of +adversarial training. To address this issue, we propose the Flatness Preserving +Distillation (FPD) loss that leverages the output difference between +adversarial and clean examples. Additionally, we introduce the Logit Adjustment +Distillation (LAD) loss, which adapts the model's knowledge to perform well on +new tasks. Experimental results demonstrate the superiority of our method over +approaches that apply adversarial training to existing incremental learning +methods, which provides a strong baseline for incremental learning on +adversarial robustness in the future. Our method achieves AutoAttack accuracy +that is 5.99\%p, 5.27\%p, and 3.90\%p higher on average than the baseline on +split CIFAR-10, CIFAR-100, and Tiny ImageNet, respectively. The code will be +made available. + +
+
+
+
+
+ + ☆ STEP CATFormer: Spatial-Temporal Effective Body-Part Cross Attention + Transformer for Skeleton-based Action Recognition BMVC 2023 + + +
+ Graph convolutional networks (GCNs) have been widely used and achieved remarkable results in skeleton-based action recognition. We regard the key to skeleton-based action recognition as the skeleton evolving across frames, so we focus on how graph convolutional networks learn different topologies and effectively aggregate joint features at global and local temporal scales. In this work, we propose three Channel-wise Topology Graph Convolution modules based on Channel-wise Topology Refinement Graph Convolution (CTR-GCN). Combining CTR-GCN with two joint cross-attention modules can capture the upper-lower body part and hand-foot relationship skeleton features. After that, to capture how human skeleton features change across frames, we design Temporal Attention Transformers to extract skeleton features effectively. The Temporal Attention Transformers can learn the temporal features of human skeleton sequences. Finally, we fuse the temporal feature outputs with an MLP for classification. We develop a powerful graph convolutional network named Spatial Temporal Effective Body-part Cross Attention Transformer, which achieves notably high performance on the NTU RGB+D and NTU RGB+D 120 datasets. Our code and models are available at https://github.com/maclong01/STEP-CATFormer
+
+ comment: Accepted to BMVC 2023: Computer Vision for Games and Games for + Computer Vision (CVG). 9 pages +
+
+
+
+
+ + ☆ Indirect Gradient Matching for Adversarial Robust Distillation + + +
+ Adversarial training significantly improves adversarial robustness, but +superior performance is primarily attained with large models. This substantial +performance gap for smaller models has spurred active research into adversarial +distillation (AD) to mitigate the difference. Existing AD methods leverage the +teacher's logits as a guide. In contrast to these approaches, we aim to +transfer another piece of knowledge from the teacher, the input gradient. In +this paper, we propose a distillation module termed Indirect Gradient +Distillation Module (IGDM) that indirectly matches the student's input gradient +with that of the teacher. We hypothesize that students can better acquire the +teacher's knowledge by matching the input gradient. Leveraging the observation +that adversarial training renders the model locally linear on the input space, +we employ Taylor approximation to effectively align gradients without directly +calculating them. Experimental results show that IGDM seamlessly integrates +with existing AD methods, significantly enhancing the performance of all AD +methods. Particularly, utilizing IGDM on the CIFAR-100 dataset improves the +AutoAttack accuracy from 28.06% to 30.32% with the ResNet-18 model and from +26.18% to 29.52% with the MobileNetV2 model when integrated into the SOTA +method without additional data augmentation. The code will be made available. + +
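The core idea admits a compact sketch: by a first-order Taylor expansion, f(x + delta) - f(x) is approximately the input gradient contracted with delta, so matching the student's and teacher's output differences under the same perturbation loosely aligns their input gradients without ever materializing them. The PyTorch snippet below is only an illustration of that idea, not the paper's exact IGDM loss.

    # Hedged sketch of indirect gradient matching via output differences.
    import torch
    import torch.nn.functional as F

    def indirect_gradient_match_loss(student, teacher, x, x_adv):
        # student's output change under the perturbation (keeps gradients)
        delta_s = student(x_adv) - student(x)
        with torch.no_grad():
            # teacher's output change serves as the matching target
            delta_t = teacher(x_adv) - teacher(x)
        return F.mse_loss(delta_s, delta_t)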
+
+
+
+
+ + ☆ SO-NeRF: Active View Planning for NeRF using Surrogate Objectives + + +
+ Despite the great success of Neural Radiance Fields (NeRF), its +data-gathering process remains vague with only a general rule of thumb of +sampling as densely as possible. The lack of understanding of what actually +constitutes good views for NeRF makes it difficult to actively plan a sequence +of views that yield the maximal reconstruction quality. We propose Surrogate +Objectives for Active Radiance Fields (SOAR), which is a set of interpretable +functions that evaluates the goodness of views using geometric and photometric +visual cues - surface coverage, geometric complexity, textural complexity, and +ray diversity. Moreover, by learning to infer the SOAR scores from a deep +network, SOARNet, we are able to effectively select views in mere seconds +instead of hours, without the need for prior visits to all the candidate views +or training any radiance field during such planning. Our experiments show +SOARNet outperforms the baselines with $\sim$80x speed-up while achieving +better or comparable reconstruction qualities. We finally show that SOAR is +model-agnostic, thus it generalizes across fully neural-implicit to fully +explicit approaches. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ Deep Multimodal Fusion for Surgical Feedback Classification + + +
+ Quantification of real-time informal feedback delivered by an experienced surgeon to a trainee during surgery is important for skill improvements in surgical training. Such feedback in the live operating room is inherently multimodal, consisting of verbal conversations (e.g., questions and answers) as well as non-verbal elements (e.g., through visual cues like pointing to anatomic elements). In this work, we leverage a clinically-validated five-category classification of surgical feedback: "Anatomic", "Technical", "Procedural", "Praise" and "Visual Aid". We then develop a multi-label machine learning model to classify these five categories of surgical feedback from inputs of text, audio, and video modalities. The ultimate goal of our work is to help automate the annotation of real-time contextual surgical feedback at scale. Our automated classification of surgical feedback achieves AUCs ranging from 71.5 to 77.6, with the fusion improving performance by 3.1%. We also show that high-quality manual transcriptions of feedback audio from experts improve AUCs to between 76.5 and 96.2, which demonstrates a clear path toward future improvements. Empirically, we find that the Staged training strategy, which first pre-trains each modality separately and then trains them jointly, is more effective than training all modalities together from the start. We also present intuitive findings on the importance of modalities for different feedback categories. This work offers an important first look at the feasibility of automated classification of real-world live surgical feedback based on text, audio, and video modalities.
+
+
+
+
+ + ☆ Human Body Model based ID using Shape and Pose Parameters + + +
+ We present a Human Body model based IDentification (HMID) system that is jointly trained for shape, pose and biometric identification. HMID is based on the Human Mesh Recovery (HMR) network and we propose additional losses to improve and stabilize shape estimation and biometric identification while maintaining the pose and shape output. We show that when our HMID network is trained using additional shape and pose losses, it shows a significant improvement in biometric identification performance when compared to an identical model that does not use such losses. The HMID model uses raw images instead of silhouettes and is able to perform robust recognition on images collected at range and altitude, as many anthropometric properties are reasonably invariant to clothing, view and range. We show results on the USF dataset as well as the BRIAR dataset, which includes probes with both clothing and view changes. Our approach (using body model losses) shows a significant improvement in Rank20 accuracy and True Accuracy Rate on the BRIAR evaluation dataset.
+
+ comment: to be published in IEEE International Joint Conference on Biometrics, + Ljubljana, Slovenia 2023 +
+
+
+
+
+ + ☆ Rethinking Object Saliency Ranking: A Novel Whole-flow Processing + Paradigm + + +
+ Existing salient object detection methods are capable of predicting binary maps that highlight visually salient regions. However, these methods are limited in their ability to differentiate the relative importance of multiple objects and the relationships among them, which can lead to errors and reduced accuracy in downstream tasks that depend on the relative importance of multiple objects. To overcome this, this paper proposes a new paradigm for saliency ranking, which aims to completely focus on ranking salient objects by their "importance order". While previous works have shown promising performance, they still face ill-posed problems. First, existing methods for generating saliency ranking ground-truth (GT) orders are unreasonable, since determining the correct ranking order is not well-defined, resulting in false alarms. Second, training a ranking model remains challenging because most saliency ranking methods follow the multi-task paradigm, leading to conflicts and trade-offs among different tasks. Third, existing regression-based saliency ranking methods are overly complex due to their reliance on instance mask-based saliency ranking orders; they require a significant amount of data to perform accurately and can be challenging to implement effectively. To solve these problems, this paper conducts an in-depth analysis of the causes and proposes a whole-flow processing paradigm for the saliency ranking task from the perspectives of "GT data generation", "network structure design" and "training protocol". The proposed approach outperforms existing state-of-the-art methods on the widely-used SALICON set, as demonstrated by extensive experiments with fair and reasonable comparisons. The saliency ranking task is still in its infancy, and our proposed unified framework can serve as a fundamental strategy to guide future work.
+
+ comment: 16 pages, 14 figures, accepted by IEEE Transactions on Image + Processing +
+
+
+
+
+ + ☆ Predicting Scores of Various Aesthetic Attribute Sets by Learning from + Overall Score Labels + + +
+ Many mobile phones now embed deep-learning models for photography evaluation or guidance. These models cannot provide detailed results like human pose scores or scene color scores because of the scarcity of corresponding aesthetic attribute data. However, the annotation of image aesthetic attribute scores requires experienced artists and professional photographers, which hinders the collection of large-scale fully-annotated datasets. In this paper, we propose to replace image attribute labels with feature extractors. First, a novel aesthetic attribute evaluation framework based on attribute features is proposed to predict attribute scores and overall scores. We call it the F2S (attribute features to attribute scores) model. We use networks from different tasks to provide attribute features to our F2S models. Then, we define an aesthetic attribute contribution to describe the role of aesthetic attributes throughout an image and use it with the attribute scores and the overall scores to train our F2S model. Sufficient experiments on publicly available datasets demonstrate that our F2S model achieves comparable performance with those trained on datasets with fully-annotated aesthetic attribute score labels. Our method makes it feasible to learn meaningful attribute scores for various aesthetic attribute sets in different types of images with only overall aesthetic scores.
+
+
+
+
+ + ☆ Cache Me if You Can: Accelerating Diffusion Models through Block Caching + + +
+ Diffusion models have recently revolutionized the field of image synthesis due to their ability to generate photorealistic images. However, one of the major drawbacks of diffusion models is that the image generation process is costly. A large image-to-image network has to be applied many times to iteratively refine an image from random noise. While many recent works propose techniques to reduce the number of required steps, they generally treat the underlying denoising network as a black box. In this work, we investigate the behavior of the layers within the network and find that 1) the layers' output changes smoothly over time, 2) the layers show distinct patterns of change, and 3) the change from step to step is often very small. We hypothesize that many layer computations in the denoising network are redundant. Leveraging this, we introduce block caching, in which we reuse outputs from layer blocks of previous steps to speed up inference. Furthermore, we propose a technique to automatically determine caching schedules based on each block's changes over timesteps. In our experiments, we show through FID, human evaluation and qualitative analysis that Block Caching allows images to be generated with higher visual quality at the same computational cost. We demonstrate this for different state-of-the-art models (LDM and EMU) and solvers (DDIM and DPM).
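A minimal sketch of the caching idea follows, assuming the denoiser is decomposed into sequential blocks with a callable interface block(h, t). The real method derives a per-block caching schedule from how much each block's output changes across timesteps; that is simplified here to a fixed refresh interval, so this is an illustration rather than the authors' implementation.

    # Toy block caching inside a denoising loop: a block's output is reused
    # from a previous step whenever the (simplified) schedule says so.
    import torch

    def denoise_with_block_caching(blocks, x, timesteps, refresh_every=2):
        """blocks: list of callables block(h, t) -> h.
        A real schedule would be derived per block from its change over t."""
        cache = [None] * len(blocks)
        for i, t in enumerate(timesteps):
            h = x
            for b, block in enumerate(blocks):
                if cache[b] is None or i % refresh_every == 0:
                    cache[b] = block(h, t)     # recompute and store
                h = cache[b]                    # otherwise reuse cached output
            x = h                               # output of this denoising step
        return x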
+
+
+
+
+ + ☆ Satellite Imagery and AI: A New Era in Ocean Conservation, from Research + to Deployment and Impact NeurIPS + + +
+ Illegal, unreported, and unregulated (IUU) fishing poses a global threat to ocean habitats. Publicly available satellite data offered by NASA and the European Space Agency (ESA) provide an opportunity to actively monitor this activity. Effectively leveraging satellite data for maritime conservation requires highly reliable machine learning models operating globally with minimal latency. This paper introduces three specialized computer vision models designed for synthetic aperture radar (Sentinel-1), optical imagery (Sentinel-2), and nighttime lights (Suomi-NPP/NOAA-20). It also presents best practices for developing and delivering real-time computer vision services for conservation. These models have been deployed in Skylight, a real-time maritime monitoring platform, which is provided at no cost to users worldwide.
+
+ comment: 8 pages, 3 figures, submitted to NeurIPS Computational Sustainability + 2023 +
+
+
+
+
+ + ☆ Feature 3DGS: Supercharging 3D Gaussian Splatting to Enable Distilled + Feature Fields + + +
+ 3D scene representations have gained immense popularity in recent years. +Methods that use Neural Radiance fields are versatile for traditional tasks +such as novel view synthesis. In recent times, some work has emerged that aims +to extend the functionality of NeRF beyond view synthesis, for semantically +aware tasks such as editing and segmentation using 3D feature field +distillation from 2D foundation models. However, these methods have two major +limitations: (a) they are limited by the rendering speed of NeRF pipelines, and +(b) implicitly represented feature fields suffer from continuity artifacts +reducing feature quality. Recently, 3D Gaussian Splatting has shown +state-of-the-art performance on real-time radiance field rendering. In this +work, we go one step further: in addition to radiance field rendering, we +enable 3D Gaussian splatting on arbitrary-dimension semantic features via 2D +foundation model distillation. This translation is not straightforward: naively +incorporating feature fields in the 3DGS framework leads to warp-level +divergence. We propose architectural and training changes to efficiently avert +this problem. Our proposed method is general, and our experiments showcase +novel view semantic segmentation, language-guided editing and segment anything +through learning feature fields from state-of-the-art 2D foundation models such +as SAM and CLIP-LSeg. Across experiments, our distillation method is able to +provide comparable or better results, while being significantly faster to both +train and render. Additionally, to the best of our knowledge, we are the first +method to enable point and bounding-box prompting for radiance field +manipulation, by leveraging the SAM model. Project website at: +https://feature-3dgs.github.io/ + +
+
+
+
+
+ + ♻ ☆ WoVoGen: World Volume-aware Diffusion for Controllable Multi-camera + Driving Scene Generation + + +
+ Generating multi-camera street-view videos is critical for augmenting autonomous driving datasets, addressing the urgent demand for extensive and varied data. Due to the limitations in diversity and challenges in handling lighting conditions, traditional rendering-based methods are increasingly being supplanted by diffusion-based methods. However, a significant challenge in diffusion-based methods is ensuring that the generated sensor data preserve both intra-world consistency and inter-sensor coherence. To address these challenges, we incorporate an additional explicit world volume and propose the World Volume-aware Multi-camera Driving Scene Generator (WoVoGen). This system is specifically designed to leverage a 4D world volume as a foundational element for video generation. Our model operates in two distinct phases: (i) envisioning the future 4D temporal world volume based on vehicle control sequences, and (ii) generating multi-camera videos, informed by this envisioned 4D temporal world volume and sensor interconnectivity. The incorporation of the 4D world volume empowers WoVoGen not only to generate high-quality street-view videos in response to vehicle control inputs but also to facilitate scene editing tasks.
+
+
+
+
+ + ♻ ☆ Source-Free Domain Adaptation for RGB-D Semantic Segmentation with + Vision Transformers WACV 2024 + + +
+ With the increasing availability of depth sensors, multimodal frameworks that +combine color information with depth data are gaining interest. However, ground +truth data for semantic segmentation is burdensome to provide, thus making +domain adaptation a significant research area. Yet most domain adaptation +methods are not able to effectively handle multimodal data. Specifically, we +address the challenging source-free domain adaptation setting where the +adaptation is performed without reusing source data. We propose MISFIT: +MultImodal Source-Free Information fusion Transformer, a depth-aware framework +which injects depth data into a segmentation module based on vision +transformers at multiple stages, namely at the input, feature and output +levels. Color and depth style transfer helps early-stage domain alignment while +re-wiring self-attention between modalities creates mixed features, allowing +the extraction of better semantic content. Furthermore, a depth-based entropy +minimization strategy is also proposed to adaptively weight regions at +different distances. Our framework, which is also the first approach using +RGB-D vision transformers for source-free semantic segmentation, shows +noticeable performance improvements with respect to standard strategies. + +
+
+ comment: WACV 2024, 2nd Workshop on Pretraining (WACVW) +
+
+
+
+
+ + ♻ ☆ Incorporating Crowdsourced Annotator Distributions into Ensemble + Modeling to Improve Classification Trustworthiness for Ancient Greek Papyri + + +
+ Performing classification on noisy, crowdsourced image datasets can prove challenging even for the best neural networks. Two issues which complicate the problem on such datasets are class imbalance and ground-truth uncertainty in labeling. The AL-ALL and AL-PUB datasets - consisting of tightly cropped, individual characters from images of ancient Greek papyri - are strongly affected by both issues. The application of ensemble modeling to such datasets can help identify images where the ground-truth is questionable and quantify the trustworthiness of those samples. As such, we apply stacked generalization consisting of nearly identical ResNets with different loss functions: one utilizing sparse cross-entropy (CXE) and the other Kullback-Leibler Divergence (KLD). Both networks use labels drawn from a crowd-sourced consensus. This consensus is derived from a Normalized Distribution of Annotations (NDA) based on all annotations for a given character in the dataset. For the second network, the KLD is calculated with respect to the NDA. For our ensemble model, we apply a k-nearest neighbors model to the outputs of the CXE and KLD networks. Individually, the ResNet models have approximately 93% accuracy, while the ensemble model achieves an accuracy of > 95%, increasing the classification trustworthiness. We also perform an analysis of the Shannon entropy of the various models' output distributions to measure classification uncertainty. Our results suggest that entropy is useful for predicting model misclassifications.
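The stacking step can be sketched as follows, assuming the two ResNets are already trained and their class-probability outputs are available as arrays; variable names are illustrative and not taken from the paper's code.

    # k-NN meta-model fit on the concatenated outputs of the two base networks.
    import numpy as np
    from sklearn.neighbors import KNeighborsClassifier

    def fit_stacked_knn(probs_cxe, probs_kld, labels, k=5):
        """probs_*: (N, num_classes) softmax outputs of the CXE- and
        KLD-trained networks on the same samples."""
        meta_features = np.concatenate([probs_cxe, probs_kld], axis=1)
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(meta_features, labels)
        return knn

    # usage (hypothetical arrays):
    # ensemble = fit_stacked_knn(p_cxe_train, p_kld_train, y_train)
    # y_pred = ensemble.predict(np.concatenate([p_cxe_test, p_kld_test], axis=1))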
+
+
+
+
+ + ♻ ☆ Consensus, dissensus and synergy between clinicians and specialist + foundation models in radiology report generation + + +
+ Radiology reports are an instrumental part of modern medicine, informing key clinical decisions such as diagnosis and treatment. The worldwide shortage of radiologists, however, restricts access to expert care and imposes heavy workloads, contributing to avoidable errors and delays in report delivery. While recent progress in automated report generation with vision-language models offers clear potential for ameliorating the situation, the path to real-world adoption has been stymied by the challenge of evaluating the clinical quality of AI-generated reports. In this study, we build a state-of-the-art report generation system for chest radiographs, \textit{Flamingo-CXR}, by fine-tuning a well-known vision-language foundation model on radiology data. To evaluate the quality of the AI-generated reports, a group of 16 certified radiologists provide detailed evaluations of AI-generated and human-written reports for chest X-rays from an intensive care setting in the United States and an inpatient setting in India. At least one radiologist (out of two per case) preferred the AI report to the ground truth report in over 60$\%$ of cases for both datasets. Amongst the subset of AI-generated reports that contain errors, the most frequently cited reasons were related to the location and finding, whereas for human-written reports, most mistakes were related to severity and finding. This disparity suggested potential complementarity between our AI system and human experts, prompting us to develop an assistive scenario in which \textit{Flamingo-CXR} generates a first-draft report, which is subsequently revised by a clinician. This is the first demonstration of clinician-AI collaboration for report writing, and the resultant reports are assessed to be equivalent or preferred by at least one radiologist to reports written by experts alone in 80$\%$ of in-patient cases and 60$\%$ of intensive care cases.
+
+
+
+
+ + ♻ ☆ Point-PEFT: Parameter-Efficient Fine-Tuning for 3D Pre-trained Models + + +
+ The popularity of pre-trained large models has revolutionized downstream tasks across diverse fields, such as language, vision, and multi-modality. To minimize the adaptation cost for downstream tasks, many Parameter-Efficient Fine-Tuning (PEFT) techniques have been proposed for language and 2D image pre-trained models. However, the specialized PEFT method for 3D pre-trained models is still under-explored. To this end, we introduce Point-PEFT, a novel framework for adapting point cloud pre-trained models with minimal learnable parameters. Specifically, for a pre-trained 3D model, we freeze most of its parameters, and only tune the newly added PEFT modules on downstream tasks, which consist of a Point-prior Prompt and a Geometry-aware Adapter. The Point-prior Prompt adopts a set of learnable prompt tokens, for which we propose to construct a memory bank with domain-specific knowledge, and utilize a parameter-free attention to enhance the prompt tokens. The Geometry-aware Adapter aims to aggregate point cloud features within spatial neighborhoods to capture fine-grained geometric information through local interactions. Extensive experiments indicate that our Point-PEFT can achieve better performance than the full fine-tuning on various downstream tasks, while using only 5% of the trainable parameters, demonstrating the efficiency and effectiveness of our approach. Code will be released at https://github.com/Even-JK/PEFT-3D.
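The parameter-efficient recipe of freezing the pre-trained backbone and optimizing only newly added modules can be sketched as below; the module names `prompt` and `adapter` are placeholders, and the real Point-PEFT modules are structured differently.

    # Hedged sketch: freeze the pre-trained 3D backbone and build an optimizer
    # over the newly added lightweight modules only.
    import torch

    def make_peft_optimizer(backbone, prompt, adapter, lr=1e-3):
        for p in backbone.parameters():
            p.requires_grad_(False)            # frozen pre-trained weights
        trainable = list(prompt.parameters()) + list(adapter.parameters())
        ratio = sum(p.numel() for p in trainable) / max(
            1, sum(p.numel() for p in backbone.parameters()))
        print(f"trainable / backbone parameter ratio: {ratio:.2%}")
        return torch.optim.AdamW(trainable, lr=lr)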
+
+ comment: 10 pages. The specialized PEFT framework for 3D pre-trained models, + which achieves competitive performance to full fine-tuning, and significantly + reduces the computational resources. Project page: + https://github.com/Even-JK/PEFT-3D +
+
+
+
+
+ + ♻ ☆ Deeply Coupled Cross-Modal Prompt Learning ACL 2023 + + +
+ Recent advancements in multimodal foundation models (e.g., CLIP) have excelled in zero-shot generalization. Prompt tuning involved in the knowledge transfer from foundation models to downstream tasks has gained significant attention recently. Existing prompt-tuning methods in cross-modal learning, however, either solely focus on the language branch, or learn vision-language interaction in a shallow mechanism. In this context, we propose a Deeply coupled Cross-modal Prompt learning (DCP) method based on CLIP. DCP flexibly accommodates the interplay between vision and language with a Cross-Modal Prompt Attention (CMPA) mechanism, which enables the mutual exchange of respective representations through a well-connected multi-head attention module progressively and strongly. We then conduct comprehensive few-shot learning experiments on 11 image classification datasets and analyze the robustness to domain shift as well. Thorough experimental analysis evidently demonstrates the superb few-shot generalization and compelling domain adaptation capacity of a well-executed DCP. The code can be found at https://github.com/GingL/CMPA.
+
+ comment: Accepted by ACL 2023 findings +
+
+
+
+
+ + ♻ ☆ Multiple Instance Learning for Digital Pathology: A Review on the + State-of-the-Art, Limitations & Future Potential + + +
+ Digital whole slide images contain an enormous amount of information, providing a strong motivation for the development of automated image analysis tools. Deep neural networks in particular show high potential with respect to various tasks in the field of digital pathology. However, a limitation is given by the fact that typical deep learning algorithms require (manual) annotations in addition to the large amounts of image data to enable effective training. Multiple instance learning offers a powerful tool for training deep neural networks in scenarios without fully annotated data. These methods are particularly effective in this domain because labels for a complete whole slide image are often captured routinely, whereas labels for patches, regions or pixels are not. This potential has already resulted in a considerable number of publications, with the majority published in the last three years. Besides the availability of data and a high motivation from the medical perspective, the availability of powerful graphics processing units acts as an accelerator in this field. In this paper, we provide an overview of widely and effectively used concepts in deep multiple instance learning approaches and recent advances, and critically discuss remaining challenges and future potential.
+
+
+
+
+ + ♻ ☆ PaintNet: Unstructured Multi-Path Learning from 3D Point Clouds for + Robotic Spray Painting IROS 2023 + + +
+ Popular industrial robotic problems such as spray painting and welding require (i) conditioning on free-shape 3D objects and (ii) planning of multiple trajectories to solve the task. Yet, existing solutions make strong assumptions on the form of input surfaces and the nature of output paths, resulting in limited approaches unable to cope with real-data variability. By leveraging recent advances in 3D deep learning, we introduce a novel framework capable of dealing with arbitrary 3D surfaces, and handling a variable number of unordered output paths (i.e. unstructured). Our approach predicts local path segments, which can be later concatenated to reconstruct long-horizon paths. We extensively validate the proposed method in the context of robotic spray painting by releasing PaintNet, the first public dataset of expert demonstrations on free-shape 3D objects collected in a real industrial scenario. A thorough experimental analysis demonstrates the capabilities of our model to promptly predict smooth output paths that cover up to 95% of previously unseen object surfaces, even without explicitly optimizing for paint coverage.
+
+ comment: Presented as conference paper at IEEE/RSJ IROS 2023, Detroit, USA. + Project website at https://gabrieletiboni.github.io/paintnet +
+
+
+
+
+ + ♻ ☆ Overcoming Generic Knowledge Loss with Selective Parameter Update + + +
+ Foundation models encompass an extensive knowledge base and offer remarkable transferability. However, this knowledge becomes outdated or insufficient over time. The challenge lies in continuously updating foundation models to accommodate novel information while retaining their original capabilities. Leveraging the fact that foundation models have initial knowledge on various tasks and domains, we propose a novel approach that, instead of updating all parameters equally, localizes the updates to a sparse set of parameters relevant to the task being learned. We strike a balance between efficiency and new task performance, while maintaining the transferability and generalizability of foundation models. We extensively evaluate our method on foundational vision-language models with a diverse spectrum of continual learning tasks. Our method improves the accuracy of newly learned tasks by up to 7% while preserving the pretraining knowledge with a negligible decrease of 0.9% in accuracy on a representative control set.
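One way to picture "localizing updates to a sparse set of parameters" is to score parameters on the new task and mask gradient updates everywhere else. The selection criterion below (top gradient magnitudes) is an assumption made for illustration and is not necessarily the criterion used in the paper.

    # Illustrative sparse-update masking: keep only the highest-scoring
    # fraction of each parameter tensor trainable on the new task.
    import torch

    def build_sparse_update_masks(model, loss, sparsity=0.99):
        """Scores parameters by gradient magnitude on one new-task batch
        (assumed criterion) and returns 0/1 masks keeping the top (1-sparsity)."""
        loss.backward()
        masks = {}
        for name, p in model.named_parameters():
            if p.grad is None:
                continue
            g = p.grad.abs().flatten()
            k = max(1, int((1 - sparsity) * g.numel()))
            thresh = torch.topk(g, k).values.min()
            masks[name] = (p.grad.abs() >= thresh).float()
        model.zero_grad()
        return masks

    def masked_step(model, optimizer, masks):
        # zero out gradients outside the selected sparse set, then update
        for name, p in model.named_parameters():
            if p.grad is not None and name in masks:
                p.grad.mul_(masks[name])
        optimizer.step()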
+
+
+
+
+ + ♻ ☆ Self-Supervised Open-Ended Classification with Small Visual Language + Models + + +
+ We present Self-Context Adaptation (SeCAt), a self-supervised approach that +unlocks few-shot abilities for open-ended classification with small visual +language models. Our approach imitates image captions in a self-supervised way +based on clustering a large pool of images followed by assigning +semantically-unrelated names to clusters. By doing so, we construct a training +signal consisting of interleaved sequences of image and pseudocaption pairs and +a query image, which we denote as the 'self-context' sequence. Based on this +signal the model is trained to produce the right pseudo-caption. We demonstrate +the performance and flexibility of SeCAt on several multimodal few-shot +datasets, spanning various granularities. By using models with approximately 1B +parameters we outperform the few-shot abilities of much larger models, such as +Frozen and FROMAGe. SeCAt opens new possibilities for research and applications +in open-ended few-shot learning that otherwise requires access to large or +proprietary models. + +
+
+
+
+
+ + ♻ ☆ Visual Data-Type Understanding does not emerge from Scaling + Vision-Language Models + + +
+ Recent advances in the development of vision-language models (VLMs) are +yielding remarkable success in recognizing visual semantic content, including +impressive instances of compositional image understanding. Here, we introduce +the novel task of Visual Data-Type Identification, a basic perceptual skill +with implications for data curation (e.g., noisy data-removal from large +datasets, domain-specific retrieval) and autonomous vision (e.g., +distinguishing changing weather conditions from camera lens staining). We +develop two datasets consisting of animal images altered across a diverse set +of 27 visual data-types, spanning four broad categories. An extensive zero-shot +evaluation of 39 VLMs, ranging from 100M to 80B parameters, shows a nuanced +performance landscape. While VLMs are reasonably good at identifying certain +stylistic \textit{data-types}, such as cartoons and sketches, they struggle +with simpler data-types arising from basic manipulations like image rotations +or additive noise. Our findings reveal that (i) model scaling alone yields +marginal gains for contrastively-trained models like CLIP, and (ii) there is a +pronounced drop in performance for the largest auto-regressively trained VLMs +like OpenFlamingo. This finding points to a blind spot in current frontier +VLMs: they excel in recognizing semantic content but fail to acquire an +understanding of visual data-types through scaling. By analyzing the +pre-training distributions of these models and incorporating data-type +information into the captions during fine-tuning, we achieve a significant +enhancement in performance. By exploring this previously uncharted task, we aim +to set the stage for further advancing VLMs to equip them with visual data-type +understanding. Code and datasets are released at +https://github.com/bethgelab/DataTypeIdentification. + +
+
+
+
+
+ + ♻ ☆ FaceStudio: Put Your Face Everywhere in Seconds + + +
+ This study investigates identity-preserving image synthesis, an intriguing +task in image generation that seeks to maintain a subject's identity while +adding a personalized, stylistic touch. Traditional methods, such as Textual +Inversion and DreamBooth, have made strides in custom image creation, but they +come with significant drawbacks. These include the need for extensive resources +and time for fine-tuning, as well as the requirement for multiple reference +images. To overcome these challenges, our research introduces a novel approach +to identity-preserving synthesis, with a particular focus on human images. Our +model leverages a direct feed-forward mechanism, circumventing the need for +intensive fine-tuning, thereby facilitating quick and efficient image +generation. Central to our innovation is a hybrid guidance framework, which +combines stylized images, facial images, and textual prompts to guide the image +generation process. This unique combination enables our model to produce a +variety of applications, such as artistic portraits and identity-blended +images. Our experimental results, including both qualitative and quantitative +evaluations, demonstrate the superiority of our method over existing baseline +models and previous works, particularly in its remarkable efficiency and +ability to preserve the subject's identity with high fidelity. + +
+
+ comment: Project homepage: https://icoz69.github.io/facestudio/ +
+
+
+
+
+ + ♻ ☆ InfMLLM: A Unified Framework for Visual-Language Tasks + + +
+ Large language models (LLMs) have proven their remarkable versatility in handling a comprehensive range of language-centric applications. To expand LLMs' capabilities to a broader spectrum of modal inputs, multimodal large language models (MLLMs) have attracted growing interest. This work delves into enabling LLMs to tackle more vision-language-related tasks, particularly image captioning, visual question answering (VQA), and visual grounding. To this end, we implemented a three-stage training scheme: starting with lightweight alignment pretraining, then moderate-weight multitask hybrid training, and finally, LLM fine-tuning to improve instruction following capability. Throughout the training process, the requirements on GPU memory gradually increase. To effectively manage the number of visual embeddings passed to the LLM while preserving their positional information, we introduce a straightforward visual adapter module dubbed pool-adapter. Our experiments demonstrate that preserving the positional information of visual embeddings through the pool-adapter is particularly beneficial for tasks like visual grounding. We name our proposed approach InfMLLM and have evaluated it extensively on various benchmark datasets. Our results demonstrate that InfMLLM achieves either state-of-the-art (SOTA) performance or performance comparable to recent MLLMs. The code and model will be made open-source at: \url{https://github.com/mightyzau/InfMLLM}.
+
+ comment: 8 +
+
+
+
+
+ + ♻ ☆ SAIF: Sparse Adversarial and Imperceptible Attack Framework + + +
+ Adversarial attacks hamper the decision-making ability of neural networks by +perturbing the input signal. The addition of calculated small distortion to +images, for instance, can deceive a well-trained image classification network. +In this work, we propose a novel attack technique called Sparse Adversarial and +Interpretable Attack Framework (SAIF). Specifically, we design imperceptible +attacks that contain low-magnitude perturbations at a small number of pixels +and leverage these sparse attacks to reveal the vulnerability of classifiers. +We use the Frank-Wolfe (conditional gradient) algorithm to simultaneously +optimize the attack perturbations for bounded magnitude and sparsity with +$O(1/\sqrt{T})$ convergence. Empirical results show that SAIF computes highly +imperceptible and interpretable adversarial examples, and outperforms +state-of-the-art sparse attack methods on the ImageNet dataset. + +
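A Frank-Wolfe attack step can be sketched for a toy constraint set that couples an L1 budget (sparsity) with an L-infinity bound (small magnitude); SAIF's actual constraint set, step-size schedule, and grouping are not reproduced here, so treat this purely as an illustration of the conditional-gradient update.

    # Schematic Frank-Wolfe (conditional gradient) attack under joint
    # L1 / L-infinity budgets. Names and defaults are illustrative.
    import torch

    def frank_wolfe_attack(model, loss_fn, x, y, eps=8/255, l1_budget=50.0, steps=20):
        delta = torch.zeros_like(x, requires_grad=True)
        for t in range(steps):
            loss = loss_fn(model(x + delta), y)
            grad, = torch.autograd.grad(loss, delta)
            # Approximate linear maximization oracle: spend the L1 budget on
            # the largest-gradient entries, each saturated to +/- eps.
            flat = grad.abs().flatten()
            k = max(1, int(l1_budget / eps))            # entries we can saturate
            idx = flat.topk(k).indices
            s = torch.zeros_like(flat)
            s[idx] = eps * grad.flatten()[idx].sign()
            s = s.view_as(delta)
            gamma = 2.0 / (t + 2.0)                     # standard FW step size
            delta = ((1 - gamma) * delta + gamma * s).detach().requires_grad_(True)
        return (x + delta).detach()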
+
+
+
+
+ + ♻ ☆ Publicly available datasets of breast histopathology H&E whole-slide + images: A scoping review + + +
+ Advancements in digital pathology and computing resources have made a +significant impact in the field of computational pathology for breast cancer +diagnosis and treatment. However, access to high-quality labeled +histopathological images of breast cancer is a big challenge that limits the +development of accurate and robust deep learning models. In this scoping +review, we identified the publicly available datasets of breast H&E stained +whole-slide images (WSI) that can be used to develop deep learning algorithms. +We systematically searched nine scientific literature databases and nine +research data repositories and found 17 publicly available datasets containing +10385 H&E WSIs of breast cancer. Moreover, we reported image metadata and +characteristics for each dataset to assist researchers in selecting proper +datasets for specific tasks in breast cancer computational pathology. In +addition, we compiled two lists of breast H&E patches and private datasets as +supplementary resources for researchers. Notably, only 28% of the included +articles utilized multiple datasets, and only 14% used an external validation +set, suggesting that the performance of other developed models may be +susceptible to overestimation. The TCGA-BRCA was used in 52% of the selected +studies. This dataset has a considerable selection bias that can impact the +robustness and generalizability of the trained algorithms. There is also a lack +of consistent metadata reporting of breast WSI datasets that can be an issue in +developing accurate deep learning models, indicating the necessity of +establishing explicit guidelines for documenting breast WSI dataset +characteristics and metadata. + +
+
+ comment: 27 pages (including references), 8 figures, 3 tables, 5 supporting + information materials +
+
+
+
+
+ + ♻ ☆ Technical Report on Subspace Pyramid Fusion Network for Semantic + Segmentation + + +
+ The following is a technical report to test the validity of the proposed +Subspace Pyramid Fusion Module (SPFM) to capture multi-scale feature +representations, which is more useful for semantic segmentation. In this +investigation, we have proposed the Efficient Shuffle Attention Module(ESAM) to +reconstruct the skip-connections paths by fusing multi-level global context +features. Experimental results on two well-known semantic segmentation +datasets, including Camvid and Cityscapes, show the effectiveness of our +proposed method. + +
+
+
+
+
+ + ♻ ☆ Few-shot Hybrid Domain Adaptation of Image Generators + + +
+ Can a pre-trained generator be adapted to the hybrid of multiple target +domains and generate images with integrated attributes of them? In this work, +we introduce a new task -- Few-shot Hybrid Domain Adaptation (HDA). Given a +source generator and several target domains, HDA aims to acquire an adapted +generator that preserves the integrated attributes of all target domains, +without overriding the source domain's characteristics. Compared with Domain +Adaptation (DA), HDA offers greater flexibility and versatility to adapt +generators to more composite and expansive domains. Simultaneously, HDA also +presents more challenges than DA as we have access only to images from +individual target domains and lack authentic images from the hybrid domain. To +address this issue, we introduce a discriminator-free framework that directly +encodes different domains' images into well-separable subspaces. To achieve +HDA, we propose a novel directional subspace loss comprised of a distance loss +and a direction loss. Concretely, the distance loss blends the attributes of +all target domains by reducing the distances from generated images to all +target subspaces. The direction loss preserves the characteristics from the +source domain by guiding the adaptation along the perpendicular to subspaces. +Experiments show that our method can obtain numerous domain-specific attributes +in a single adapted generator, which surpasses the baseline methods in semantic +similarity, image fidelity, and cross-domain consistency. + +
+
+
+
+
+ + ♻ ☆ Evaluating Point Cloud from Moving Camera Videos: A No-Reference Metric + + +
+ Point cloud is one of the most widely used digital representation formats for +three-dimensional (3D) contents, the visual quality of which may suffer from +noise and geometric shift distortions during the production procedure as well +as compression and downsampling distortions during the transmission process. To +tackle the challenge of point cloud quality assessment (PCQA), many PCQA +methods have been proposed to evaluate the visual quality levels of point +clouds by assessing the rendered static 2D projections. Although such +projection-based PCQA methods achieve competitive performance with the +assistance of mature image quality assessment (IQA) methods, they neglect that +the 3D model is also perceived in a dynamic viewing manner, where the viewpoint +is continually changed according to the feedback of the rendering device. +Therefore, in this paper, we evaluate the point clouds from moving camera +videos and explore the way of dealing with PCQA tasks via using video quality +assessment (VQA) methods. First, we generate the captured videos by rotating +the camera around the point clouds through several circular pathways. Then we +extract both spatial and temporal quality-aware features from the selected key +frames and the video clips through using trainable 2D-CNN and pre-trained +3D-CNN models respectively. Finally, the visual quality of point clouds is +represented by the video quality values. The experimental results reveal that +the proposed method is effective for predicting the visual quality levels of +the point clouds and even competitive with full-reference (FR) PCQA methods. +The ablation studies further verify the rationality of the proposed framework +and confirm the contributions made by the quality-aware features extracted via +the dynamic viewing manner. The code is available at +https://github.com/zzc-1998/VQA_PC. + +
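The "moving camera" protocol can be pictured with a small helper that samples viewpoints on a circular path around the point cloud's centroid; rendering each viewpoint into a video frame and feeding the frames to the VQA model is omitted, and this helper is not taken from the paper's code.

    # Illustrative circular camera path around an object centroid.
    import numpy as np

    def circular_camera_path(center, radius, height, n_frames=120):
        """Returns (eye, forward) pairs for n_frames poses on one circle."""
        poses = []
        for k in range(n_frames):
            angle = 2 * np.pi * k / n_frames
            eye = center + np.array([radius * np.cos(angle),
                                     radius * np.sin(angle),
                                     height])
            forward = (center - eye) / np.linalg.norm(center - eye)  # look-at
            poses.append((eye, forward))
        return poses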
+
+
+
+
+ + ♻ ☆ Towards Learning a Generalist Model for Embodied Navigation + + +
+ Building a generalist agent that can interact with the world is an intriguing target of AI research, thus spurring the research for embodied navigation, where an agent is required to navigate according to instructions or respond to queries. Despite the major progress attained, previous works primarily focus on task-specific agents and lack generalizability to unseen scenarios. Recently, LLMs have presented remarkable capabilities across various fields, and provided a promising opportunity for embodied navigation. Drawing on this, we propose the first generalist model for embodied navigation, NaviLLM. It adapts LLMs to embodied navigation by introducing schema-based instruction. The schema-based instruction flexibly casts various tasks into generation problems, thereby unifying a wide range of tasks. This approach allows us to integrate diverse data sources from various datasets into the training, equipping NaviLLM with a wide range of capabilities required by embodied navigation. We conduct extensive experiments to evaluate the performance and generalizability of our model. The experimental results demonstrate that our unified model achieves state-of-the-art performance on CVDN, SOON, and ScanQA. Specifically, it surpasses the previous state-of-the-art method by a significant margin of 29% in goal progress on CVDN. Moreover, our model also demonstrates strong generalizability and presents impressive results on unseen tasks, e.g., embodied question answering and 3D captioning.
+
+ comment: 13 pages, 3 figures. Official code: + https://github.com/zd11024/NaviLLM +
+
+
+
+
+ + ♻ ☆ Universal Segmentation at Arbitrary Granularity with Language + Instruction + + +
+ This paper aims to achieve universal segmentation at arbitrary semantic levels. Despite significant progress in recent years, specialist segmentation approaches are limited to specific tasks and data distributions. Retraining a new model for adaptation to new scenarios or settings takes expensive computation and time cost, which raises the demand for a versatile and universal segmentation model that can cater to various granularities. Although some attempts have been made at unifying different segmentation tasks or generalizing to various scenarios, limitations in the definition of paradigms and input-output spaces make it difficult for them to achieve an accurate understanding of content at arbitrary granularity. To this end, we present UniLSeg, a universal segmentation model that can perform segmentation at any semantic level with the guidance of language instructions. For training UniLSeg, we reorganize a group of tasks from original diverse distributions into a unified data format, where images with texts describing segmentation targets serve as input and corresponding masks are output. Combined with an automatic annotation engine for utilizing numerous unlabeled data, UniLSeg achieves excellent performance on various tasks and settings, surpassing both specialist and unified segmentation models.
+
+
+
+
+ + ♻ ☆ Optimizing rgb-d semantic segmentation through multi-modal interaction + and pooling attention + + +
+ Semantic segmentation of RGB-D images involves understanding the appearance +and spatial relationships of objects within a scene, which requires careful +consideration of various factors. However, in indoor environments, the simple +input of RGB and depth images often results in a relatively limited acquisition +of semantic and spatial information, leading to suboptimal segmentation +outcomes. To address this, we propose the Multi-modal Interaction and Pooling +Attention Network (MIPANet), a novel approach designed to harness the +interactive synergy between RGB and depth modalities, optimizing the +utilization of complementary information. Specifically, we incorporate a +Multi-modal Interaction Fusion Module (MIM) into the deepest layers of the +network. This module is engineered to facilitate the fusion of RGB and depth +information, allowing for mutual enhancement and correction. Additionally, we +introduce a Pooling Attention Module (PAM) at various stages of the encoder. +This module serves to amplify the features extracted by the network and +integrates the module's output into the decoder in a targeted manner, +significantly improving semantic segmentation performance. Our experimental +results demonstrate that MIPANet outperforms existing methods on two indoor +scene datasets, NYUDv2 and SUN-RGBD, underscoring its effectiveness in +enhancing RGB-D semantic segmentation. + +
+
+
+
+
+ + ♻ ☆ KappaFace: Adaptive Additive Angular Margin Loss for Deep Face + Recognition + + +
+ Feature learning is a widely used method employed for large-scale face +recognition. Recently, large-margin softmax loss methods have demonstrated +significant enhancements on deep face recognition. These methods propose fixed +positive margins in order to enforce intra-class compactness and inter-class +diversity. However, the majority of the proposed methods do not consider the +class imbalance issue, which is a major challenge in practice for developing +deep face recognition models. We hypothesize that it significantly affects the +generalization ability of the deep face models. Inspired by this observation, +we introduce a novel adaptive strategy, called KappaFace, to modulate the +relative importance based on class difficultness and imbalance. With the +support of the von Mises-Fisher distribution, our proposed KappaFace loss can +intensify the margin's magnitude for hard learning or low concentration classes +while relaxing it for counter classes. Experiments conducted on popular facial +benchmarks demonstrate that our proposed method achieves superior performance +to the state-of-the-art. + +
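To make the mechanism concrete, below is a generic additive angular margin loss in which the margin of the true class is looked up from a per-class vector; in KappaFace that vector would be modulated by a von Mises-Fisher concentration estimate and class statistics, which this hedged sketch does not reproduce.

    # Generic ArcFace-style loss with a per-class (adaptive) margin vector.
    import torch
    import torch.nn.functional as F

    def adaptive_angular_margin_loss(embeddings, weights, labels, class_margin,
                                     scale=64.0):
        """embeddings: (N, d); weights: (C, d) class prototypes;
        class_margin: (C,) per-class margins, assumed already modulated
        for class difficulty / imbalance."""
        emb = F.normalize(embeddings, dim=1)
        w = F.normalize(weights, dim=1)
        cos = emb @ w.t()                                     # (N, C) cosines
        theta = torch.acos(cos.clamp(-1 + 1e-7, 1 - 1e-7))
        m = class_margin[labels]                              # true-class margins
        target = torch.cos(theta.gather(1, labels[:, None]).squeeze(1) + m)
        logits = cos.clone()
        logits[torch.arange(len(labels)), labels] = target    # add margin
        return F.cross_entropy(scale * logits, labels)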
+
+
+
+
+ + ♻ ☆ FastPillars: A Deployment-friendly Pillar-based 3D Detector + + +
+ The deployment of 3D detectors poses one of the major challenges in real-world self-driving scenarios. Existing BEV-based (i.e., Bird's-Eye View) detectors favor sparse convolutions (known as SPConv) to speed up training and inference, which puts up a hard barrier to deployment, especially for on-device applications. In this paper, to tackle the challenge of efficient 3D object detection from an industry perspective, we devise a deployment-friendly pillar-based 3D detector, termed FastPillars. First, we introduce a novel lightweight Max-and-Attention Pillar Encoding (MAPE) module specially designed to enhance small 3D objects. Second, we propose a simple yet effective principle for designing a backbone in pillar-based 3D detection. We construct FastPillars based on these designs, achieving high performance and low latency without SPConv. Extensive experiments on two large-scale datasets demonstrate the effectiveness and efficiency of FastPillars for on-device 3D detection regarding both performance and speed. Specifically, FastPillars delivers state-of-the-art accuracy on the Waymo Open Dataset with a 1.8X speed-up and a 3.8 mAPH/L2 improvement over CenterPoint (SPConv-based). Our code is publicly available at: https://github.com/StiphyJay/FastPillars.
+
+
+
+
+ + ♻ ☆ DreamPropeller: Supercharge Text-to-3D Generation with Parallel Sampling + + +
+ Recent methods such as Score Distillation Sampling (SDS) and Variational Score Distillation (VSD) using 2D diffusion models for text-to-3D generation have demonstrated impressive generation quality. However, the long generation time of such algorithms significantly degrades the user experience. To tackle this problem, we propose DreamPropeller, a drop-in acceleration algorithm that can be wrapped around any existing text-to-3D generation pipeline based on score distillation. Our framework generalizes Picard iterations, a classical algorithm for parallel sampling of an ODE path, and can account for non-ODE paths such as momentum-based gradient updates and changes in dimensions during the optimization process, as in many cases of 3D generation. We show that our algorithm trades parallel compute for wallclock time and empirically achieves up to 4.7x speedup with a negligible drop in generation quality for all tested frameworks.
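The Picard-iteration idea can be illustrated with a toy fixed-point rollout: instead of computing x_{k+1} = phi(x_k) sequentially, the whole trajectory is re-evaluated from the previous sweep and iterated until it stops changing, so the phi calls within one sweep can in principle run in parallel. DreamPropeller's handling of momentum states and changing dimensions is omitted from this sketch.

    # Toy Picard rollout of a sequential update operator phi.
    import torch

    def picard_rollout(phi, x0, horizon=32, sweeps=50, tol=1e-5):
        traj = [x0.clone() for _ in range(horizon + 1)]
        for _ in range(sweeps):
            # every step is evaluated from the previous sweep's trajectory,
            # so the phi calls below are independent of one another
            new_traj = [x0] + [phi(traj[k]) for k in range(horizon)]
            drift = max((new_traj[k] - traj[k]).abs().max().item()
                        for k in range(horizon + 1))
            traj = new_traj
            if drift < tol:          # converged to the sequential trajectory
                break
        return traj[-1]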
+
+ comment: Github repo: https://github.com/alexzhou907/DreamPropeller; Project + page: https://alexzhou907.github.io/dreampropeller_page/ +
+
+
+
+
+ + ♻ ☆ MME: A Comprehensive Evaluation Benchmark for Multimodal Large Language + Models + + +
+ Multimodal Large Language Model (MLLM) relies on the powerful LLM to perform +multimodal tasks, showing amazing emergent abilities in recent studies, such as +writing poems based on an image. However, it is difficult for these case +studies to fully reflect the performance of MLLM, lacking a comprehensive +evaluation. In this paper, we fill in this blank, presenting the first +comprehensive MLLM Evaluation benchmark MME. It measures both perception and +cognition abilities on a total of 14 subtasks. In order to avoid data leakage +that may arise from direct use of public datasets for evaluation, the +annotations of instruction-answer pairs are all manually designed. The concise +instruction design allows us to fairly compare MLLMs, instead of struggling in +prompt engineering. Besides, with such an instruction, we can also easily carry +out quantitative statistics. A total of 30 advanced MLLMs are comprehensively +evaluated on our MME, which not only suggests that existing MLLMs still have a +large room for improvement, but also reveals the potential directions for the +subsequent model optimization. + +
+
+ comment: Project page: + https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models +
+
+
+
+
+ + ♻ ☆ Epistemic Graph: A Plug-And-Play Module For Hybrid Representation + Learning + + +
+ In recent years, deep models have achieved remarkable success in various vision tasks. However, their performance heavily relies on large training datasets. In contrast, humans exhibit hybrid learning, seamlessly integrating structured knowledge for cross-domain recognition or relying on a smaller amount of data samples for few-shot learning. Motivated by this human-like epistemic process, we aim to extend hybrid learning to computer vision tasks by integrating structured knowledge with data samples for more effective representation learning. Nevertheless, this extension faces significant challenges due to the substantial gap between structured knowledge and deep features learned from data samples, encompassing both dimensions and knowledge granularity. In this paper, a novel Epistemic Graph Layer (EGLayer) is introduced to enable hybrid learning, enhancing the exchange of information between deep features and a structured knowledge graph. Our EGLayer is composed of three major parts, including a local graph module, a query aggregation model, and a novel correlation alignment loss function to emulate human epistemic ability. Serving as a plug-and-play module that can replace the standard linear classifier, EGLayer significantly improves the performance of deep models. Extensive experiments demonstrate that EGLayer can greatly enhance representation learning for the tasks of cross-domain recognition and few-shot learning, and the visualization of knowledge graphs can aid in model interpretation.
+
+ comment: 15 pages +
+
+
+
+
+ + ♻ ☆ Towards Transferable Multi-modal Perception Representation Learning for + Autonomy: NeRF-Supervised Masked AutoEncoder + + +
+ This work proposes a unified self-supervised pre-training framework for +transferable multi-modal perception representation learning via masked +multi-modal reconstruction in Neural Radiance Field (NeRF), namely +NeRF-Supervised Masked AutoEncoder (NS-MAE). Specifically, conditioned on +certain view directions and locations, multi-modal embeddings extracted from +corrupted multi-modal input signals, i.e., Lidar point clouds and images, are +rendered into projected multi-modal feature maps via neural rendering. Then, +original multi-modal signals serve as reconstruction targets for the rendered +multi-modal feature maps to enable self-supervised representation learning. +Extensive experiments show that the representation learned via NS-MAE shows +promising transferability for diverse multi-modal and single-modal (camera-only +and Lidar-only) perception models on diverse 3D perception downstream tasks (3D +object detection and BEV map segmentation) with diverse amounts of fine-tuning +labeled data. Moreover, we empirically find that NS-MAE enjoys the synergy of +both the mechanism of masked autoencoder and neural radiance field. We hope +this study can inspire exploration of more general multi-modal representation +learning for autonomous agents. + +
+
+
+
+
+ + ♻ ☆ The SVHN Dataset Is Deceptive for Probabilistic Generative Models Due to + a Distribution Mismatch NeurIPS 2023 + + +
+ The Street View House Numbers (SVHN) dataset is a popular benchmark dataset in deep learning. Originally designed for digit classification tasks, the SVHN dataset has been widely used as a benchmark for various other tasks including generative modeling. However, with this work, we aim to warn the community about an issue of the SVHN dataset as a benchmark for generative modeling tasks: we discover that the official training and test sets of the SVHN dataset are not drawn from the same distribution. We empirically show that this distribution mismatch has little impact on the classification task (which may explain why this issue has not been detected before), but it severely affects the evaluation of probabilistic generative models, such as Variational Autoencoders and diffusion models. As a workaround, we propose to mix and re-split the official training and test set when SVHN is used for tasks other than classification. We publish a new split and the indices we used to create it at https://jzenn.github.io/svhn-remix/. + +
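+ A minimal sketch of the workaround described above: merge the official SVHN train and test splits and re-split them with a fixed seed so that both halves come from the same distribution. The 70/30 ratio and the seed are assumptions for illustration; the authors' actual split and indices are published at the project page.
+
+ import numpy as np
+ from torchvision.datasets import SVHN
+
+ train = SVHN(root="data", split="train", download=True)
+ test = SVHN(root="data", split="test", download=True)
+
+ # Pool both official splits, then draw a fresh split from the pooled data.
+ images = np.concatenate([train.data, test.data], axis=0)
+ labels = np.concatenate([train.labels, test.labels], axis=0)
+ perm = np.random.default_rng(seed=0).permutation(len(labels))   # assumed seed
+ n_train = int(0.7 * len(labels))                                 # assumed ratio
+ remix_train = (images[perm[:n_train]], labels[perm[:n_train]])
+ remix_test = (images[perm[n_train:]], labels[perm[n_train:]])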
+
+ comment: Accepted at NeurIPS 2023 Workshop on Distribution Shifts; 4 pages + + appendix; proposed data set at https://jzenn.github.io/svhn-remix/ +
+
+
+
+
+ + ♻ ☆ The Surprising Effectiveness of Diffusion Models for Optical Flow and + Monocular Depth Estimation NeurIPS 2023 + + +
+ Denoising diffusion probabilistic models have transformed image generation +with their impressive fidelity and diversity. We show that they also excel in +estimating optical flow and monocular depth, surprisingly, without +task-specific architectures and loss functions that are predominant for these +tasks. Compared to the point estimates of conventional regression-based +methods, diffusion models also enable Monte Carlo inference, e.g., capturing +uncertainty and ambiguity in flow and depth. With self-supervised pre-training, +the combined use of synthetic and real data for supervised training, and +technical innovations (infilling and step-unrolled denoising diffusion +training) to handle noisy-incomplete training data, and a simple form of +coarse-to-fine refinement, one can train state-of-the-art diffusion models for +depth and optical flow estimation. Extensive experiments focus on quantitative +performance against benchmarks, ablations, and the model's ability to capture +uncertainty and multimodality, and impute missing values. Our model, DDVM +(Denoising Diffusion Vision Model), obtains a state-of-the-art relative depth +error of 0.074 on the indoor NYU benchmark and an Fl-all outlier rate of 3.26\% +on the KITTI optical flow benchmark, about 25\% better than the best published +method. For an overview see https://diffusion-vision.github.io. + +
+
+ comment: NeurIPS 2023 (Oral) +
+
+
+
+
+ + ♻ ☆ Context-PIPs: Persistent Independent Particles Demands Spatial Context + Features + + +
+ We tackle the problem of Persistent Independent Particles (PIPs), also called Tracking Any Point (TAP), which aims at estimating persistent long-term trajectories of query points in videos. Previous methods attempted to estimate these trajectories independently to incorporate longer image sequences, thereby ignoring the potential benefits of incorporating spatial context features. We argue that independent video point tracking also demands spatial context features. To this end, we propose a novel framework, Context-PIPs, which effectively improves point trajectory accuracy by aggregating spatial context features in videos. Context-PIPs contains two main modules: 1) a SOurce Feature Enhancement (SOFE) module, and 2) a TArget Feature Aggregation (TAFA) module. Context-PIPs improves PIPs across the board, reducing the Average Trajectory Error of Occluded Points (ATE-Occ) by 11.4% on CroHD and increasing the Average Percentage of Correct Keypoints (A-PCK) by 11.8% on TAP-Vid-Kinetics. Demos are available at https://wkbian.github.io/Projects/Context-PIPs/. + +
+
+ comment: Project Page: https://wkbian.github.io/Projects/Context-PIPs/ +
+
+
+
+
+ + ♻ ☆ Early Autism Diagnosis based on Path Signature and Siamese Unsupervised + Feature Compressor + + +
+ Autism Spectrum Disorder (ASD) has been emerging as a growing public health threat. Early diagnosis of ASD is crucial for timely, effective intervention and treatment. However, conventional diagnosis methods based on communications and behavioral patterns are unreliable for children younger than 2 years of age. Given evidence of neurodevelopmental abnormalities in ASD infants, we resort to a novel deep learning-based method to extract key features from the inherently scarce, class-imbalanced, and heterogeneous structural MR images for early autism diagnosis. Specifically, we propose a Siamese verification framework to extend the scarce data, and an unsupervised compressor to alleviate data imbalance by extracting key features. We also propose weight constraints to cope with sample heterogeneity by giving different samples different voting weights during validation, and we use Path Signature to unravel meaningful developmental features from the two-time-point data longitudinally. We further extract machine-learning-focused brain regions for autism diagnosis. Extensive experiments show that our method performs well under practical scenarios, transcending existing machine learning methods and providing anatomical insights for early autism diagnosis. + +
+
+
+
+
+ + ♻ ☆ Global and Local Semantic Completion Learning for Vision-Language + Pre-training + + +
+ Cross-modal alignment plays a crucial role in vision-language pre-training +(VLP) models, enabling them to capture meaningful associations across different +modalities. For this purpose, numerous masked modeling tasks have been proposed +for VLP to further promote cross-modal interactions. The core idea of previous +masked modeling tasks is to focus on reconstructing the masked tokens based on +visible context for learning local-local alignment. However, most of them pay +little attention to the global semantic features generated for the masked data, +resulting in a limited cross-modal alignment ability of global representations +to local features of the other modality. Therefore, in this paper, we propose a +novel Global and Local Semantic Completion Learning (GLSCL) task to facilitate +global-local alignment and local-local alignment simultaneously. Specifically, +the GLSCL task complements the missing semantics of masked data and recovers +global and local features by cross-modal interactions. Our GLSCL consists of +masked global semantic completion (MGSC) and masked local token completion +(MLTC). MGSC promotes learning more representative global features, which have +a great impact on the performance of downstream tasks, while MLTC reconstructs +modal-fusion local tokens, further enhancing accurate comprehension of +multimodal data. To evaluate the proposed approaches on cross-modal alignment, +we develop a validation benchmark called ALIGN-BENCH. Moreover, we present a +flexible vision encoder, enabling our model to simultaneously perform +image-text and video-text multimodal tasks. Experimental results show that our +proposed method obtains state-of-the-art performance on various vision-language +benchmarks, such as visual question answering, image-text retrieval, and +video-text retrieval. + +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2211.13437 +
+
+
+
+
+ + ♻ ☆ BenchLMM: Benchmarking Cross-style Visual Capability of Large Multimodal + Models + + +
+ Large Multimodal Models (LMMs) such as GPT-4V and LLaVA have shown remarkable capabilities in visual reasoning with common image styles. However, their robustness against diverse style shifts, crucial for practical applications, remains largely unexplored. In this paper, we propose a new benchmark, BenchLMM, to assess the robustness of LMMs against three different styles: artistic image style, imaging sensor style, and application style, where each style has five sub-styles. Utilizing BenchLMM, we comprehensively evaluate state-of-the-art LMMs and reveal: 1) LMMs generally suffer performance degradation when working with other styles; 2) an LMM performing better than another model in a common style does not guarantee its superior performance in other styles; 3) LMMs' reasoning capability can be enhanced by prompting LMMs to predict the style first, based on which we propose a versatile and training-free method for improving LMMs; 4) an intelligent LMM is expected to interpret the causes of its errors when facing stylistic variations. We hope that our benchmark and analysis can shed new light on developing more intelligent and versatile LMMs. + +
+
+ comment: Code is available at https://github.com/AIFEG/BenchLMM +
+
+
+
+
+ + ♻ ☆ The Art of Camouflage: Few-shot Learning for Animal Detection and + Segmentation + + +
+ Camouflaged object detection and segmentation is a new and challenging research topic in computer vision. A serious issue is the lack of data on camouflaged objects, such as camouflaged animals in natural scenes. In this paper, we address the problem of few-shot learning for camouflaged object detection and segmentation. To this end, we first collect a new dataset, CAMO-FS, for the benchmark. We then propose a novel method to efficiently detect and segment the camouflaged objects in the images. In particular, we introduce the instance triplet loss and the instance memory storage. Extensive experiments demonstrate that our proposed method achieves state-of-the-art performance on the newly collected dataset. + +
+
+ comment: Under review at a journal
+
+
+
+
+ + ♻ ☆ PanoGRF: Generalizable Spherical Radiance Fields for Wide-baseline + Panoramas NeurIPS2023 + + +
+ Achieving an immersive experience enabling users to explore virtual +environments with six degrees of freedom (6DoF) is essential for various +applications such as virtual reality (VR). Wide-baseline panoramas are commonly +used in these applications to reduce network bandwidth and storage +requirements. However, synthesizing novel views from these panoramas remains a +key challenge. Although existing neural radiance field methods can produce +photorealistic views under narrow-baseline and dense image captures, they tend +to overfit the training views when dealing with \emph{wide-baseline} panoramas +due to the difficulty in learning accurate geometry from sparse $360^{\circ}$ +views. To address this problem, we propose PanoGRF, Generalizable Spherical +Radiance Fields for Wide-baseline Panoramas, which construct spherical radiance +fields incorporating $360^{\circ}$ scene priors. Unlike generalizable radiance +fields trained on perspective images, PanoGRF avoids the information loss from +panorama-to-perspective conversion and directly aggregates geometry and +appearance features of 3D sample points from each panoramic view based on +spherical projection. Moreover, as some regions of the panorama are only +visible from one view while invisible from others under wide baseline settings, +PanoGRF incorporates $360^{\circ}$ monocular depth priors into spherical depth +estimation to improve the geometry features. Experimental results on multiple +panoramic datasets demonstrate that PanoGRF significantly outperforms +state-of-the-art generalizable view synthesis methods for wide-baseline +panoramas (e.g., OmniSyn) and perspective images (e.g., IBRNet, NeuRay). + +
+
+ comment: accepted to NeurIPS2023; Project Page: + https://thucz.github.io/PanoGRF/ +
+
+
+
+
+ + ♻ ☆ Let's Think Outside the Box: Exploring Leap-of-Thought in Large Language + Models with Creative Humor Generation + + +
+ Chain-of-Thought (CoT) guides large language models (LLMs) to reason +step-by-step, and can motivate their logical reasoning ability. While effective +for logical tasks, CoT is not conducive to creative problem-solving which often +requires out-of-box thoughts and is crucial for innovation advancements. In +this paper, we explore the Leap-of-Thought (LoT) abilities within LLMs -- a +non-sequential, creative paradigm involving strong associations and knowledge +leaps. To this end, we study LLMs on the popular Oogiri game which needs +participants to have good creativity and strong associative thinking for +responding unexpectedly and humorously to the given image, text, or both, and +thus is suitable for LoT study. Then to investigate LLMs' LoT ability in the +Oogiri game, we first build a multimodal and multilingual Oogiri-GO dataset +which contains over 130,000 samples from the Oogiri game, and observe the +insufficient LoT ability or failures of most existing LLMs on the Oogiri game. +Accordingly, we introduce a creative Leap-of-Thought (CLoT) paradigm to improve +LLM's LoT ability. CLoT first formulates the Oogiri-GO dataset into +LoT-oriented instruction tuning data to train pretrained LLM for achieving +certain LoT humor generation and discrimination abilities. Then CLoT designs an +explorative self-refinement that encourages the LLM to generate more creative +LoT data via exploring parallels between seemingly unrelated concepts and +selects high-quality data to train itself for self-refinement. CLoT not only +excels in humor generation in the Oogiri game but also boosts creative +abilities in various tasks like cloud guessing game and divergent association +task. These findings advance our understanding and offer a pathway to improve +LLMs' creative capacities for innovative applications across domains. The +dataset, code, and models will be released online. +https://zhongshsh.github.io/CLoT/. + +
+
+ comment: Technical report +
+
+
+
+
+ + ♻ ☆ TongueSAM: A Universal Tongue Segmentation Model Based on SAM with Zero-Shot + + +
+ Tongue segmentation serves as the primary step in automated TCM tongue diagnosis, and it plays a significant role in the diagnostic results. Currently, numerous deep learning based methods have achieved promising results. However, when confronted with tongue images that differ from the training set or possess challenging backgrounds, these methods demonstrate limited performance. To address this issue, this paper proposes a universal tongue segmentation model named TongueSAM based on SAM (Segment Anything Model). SAM is a large-scale pretrained interactive segmentation model known for its powerful zero-shot generalization capability. Applying SAM to tongue segmentation leverages its prior knowledge learned from natural images, enabling zero-shot segmentation for various types of tongue images. In this study, a Prompt Generator based on object detection is integrated into SAM to enable an end-to-end automated tongue segmentation method. Experiments demonstrate that TongueSAM achieves exceptional performance across various tongue segmentation datasets, particularly in the zero-shot setting. Even when dealing with challenging-background tongue images, TongueSAM achieves a mIoU of 95.23\% under zero-shot conditions, surpassing other segmentation methods. To the best of our knowledge, this is the first application of a large-scale pretrained model to tongue segmentation. The project mentioned in this paper is currently publicly available. + +
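+ A hedged sketch of the pipeline outlined above: an object detector proposes a tongue bounding box, which is then passed to SAM as a box prompt for zero-shot segmentation. The detector call and the checkpoint path are placeholders; the SAM calls follow the public segment-anything API, but this is not the authors' implementation.
+
+ import numpy as np
+ from segment_anything import sam_model_registry, SamPredictor
+
+ def segment_tongue(image_rgb, detect_tongue_box, checkpoint="sam_vit_b.pth"):
+     sam = sam_model_registry["vit_b"](checkpoint=checkpoint)
+     predictor = SamPredictor(sam)
+     predictor.set_image(image_rgb)                        # HxWx3 uint8 RGB image
+     box = np.asarray(detect_tongue_box(image_rgb))        # [x0, y0, x1, y1] from the detector
+     masks, _, _ = predictor.predict(box=box, multimask_output=False)
+     return masks[0]                                       # binary tongue mask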
+
+
+
+
+ + ♻ ☆ Leveraging Single-View Images for Unsupervised 3D Point Cloud Completion + + +
+ Point clouds captured by scanning devices are often incomplete due to +occlusion. To overcome this limitation, point cloud completion methods have +been developed to predict the complete shape of an object based on its partial +input. These methods can be broadly classified as supervised or unsupervised. +However, both categories require a large number of 3D complete point clouds, +which may be difficult to capture. In this paper, we propose Cross-PCC, an +unsupervised point cloud completion method without requiring any 3D complete +point clouds. We only utilize 2D images of the complete objects, which are +easier to capture than 3D complete and clean point clouds. Specifically, to +take advantage of the complementary information from 2D images, we use a +single-view RGB image to extract 2D features and design a fusion module to fuse +the 2D and 3D features extracted from the partial point cloud. To guide the +shape of predicted point clouds, we project the predicted points of the object +to the 2D plane and use the foreground pixels of its silhouette maps to +constrain the position of the projected points. To reduce the outliers of the +predicted point clouds, we propose a view calibrator to move the points +projected to the background into the foreground by the single-view silhouette +image. To the best of our knowledge, our approach is the first point cloud +completion method that does not require any 3D supervision. The experimental +results of our method are superior to those of the state-of-the-art +unsupervised methods by a large margin. Moreover, our method even achieves +comparable performance to some supervised methods. We will make the source code +publicly available at https://github.com/ltwu6/cross-pcc. + +
+
+ comment: Accepted in IEEE Transactions on Multimedia +
+
+
+
+
+ + ♻ ☆ Improving Depth Gradient Continuity in Transformers: A Comparative Study + on Monocular Depth Estimation with CNN + + +
+ Monocular depth estimation is an ongoing challenge in computer vision. Recent +progress with Transformer models has demonstrated notable advantages over +conventional CNNs in this area. However, there's still a gap in understanding +how these models prioritize different regions in 2D images and how these +regions affect depth estimation performance. To explore the differences between +Transformers and CNNs, we employ a sparse pixel approach to contrastively +analyze the distinctions between the two. Our findings suggest that while +Transformers excel in handling global context and intricate textures, they lag +behind CNNs in preserving depth gradient continuity. To further enhance the +performance of Transformer models in monocular depth estimation, we propose the +Depth Gradient Refinement (DGR) module that refines depth estimation through +high-order differentiation, feature fusion, and recalibration. Additionally, we +leverage optimal transport theory, treating depth maps as spatial probability +distributions, and employ the optimal transport distance as a loss function to +optimize our model. Experimental results demonstrate that models integrated +with the plug-and-play Depth Gradient Refinement (DGR) module and the proposed +loss function enhance performance without increasing complexity and +computational costs on both outdoor KITTI and indoor NYU-Depth-v2 datasets. +This research not only offers fresh insights into the distinctions between +Transformers and CNNs in depth estimation but also paves the way for novel +depth estimation methodologies. + +
+
+
+
+
+ + ♻ ☆ A Non-Invasive Interpretable NAFLD Diagnostic Method Combining TCM + Tongue Features + + +
+ Non-alcoholic fatty liver disease (NAFLD) is a clinicopathological syndrome characterized by hepatic steatosis resulting from the exclusion of alcohol and other identifiable liver-damaging factors. It has emerged as a leading cause of chronic liver disease worldwide. Currently, the conventional methods for NAFLD detection are expensive and not suitable for users to perform daily diagnostics. To address this issue, this study proposes a non-invasive and interpretable NAFLD diagnostic method that requires only user-provided indicators: Gender, Age, Height, Weight, Waist Circumference, Hip Circumference, and a tongue image. This method involves merging patients' physiological indicators with tongue features, which are then input into a fusion network named SelectorNet. SelectorNet combines attention mechanisms with feature selection mechanisms, enabling it to autonomously learn the ability to select important features. The experimental results show that the proposed method achieves an accuracy of 77.22\% using only non-invasive data, and it also provides compelling interpretability matrices. This study contributes to the early diagnosis of NAFLD and the intelligent advancement of TCM tongue diagnosis. The project mentioned in this paper is currently publicly available. + +
+
+
+
+
+ + ♻ ☆ Adaptive Confidence Threshold for ByteTrack in Multi-Object Tracking + + +
+ We investigate the application of ByteTrack in the realm of multiple object tracking. ByteTrack, a simple tracking algorithm, enables the simultaneous tracking of multiple objects by strategically incorporating detections with a low confidence threshold. Conventionally, objects are initially associated with high confidence threshold detections. When the association between objects and detections becomes ambiguous, ByteTrack extends the association to lower confidence threshold detections. One notable drawback of the existing ByteTrack approach is its reliance on a fixed threshold to differentiate between high- and low-confidence detections. In response to this limitation, we introduce a novel and adaptive approach. Our proposed method entails a dynamic adjustment of the confidence threshold, leveraging insights derived from the overall detections. Through experimentation, we demonstrate the effectiveness of our adaptive confidence threshold technique while maintaining a running time comparable to ByteTrack. + +
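+ The following sketch illustrates the idea of deriving the high/low confidence split from the current frame's detections rather than a fixed value. The percentile rule and the floor are assumptions standing in for the paper's actual adjustment scheme.
+
+ import numpy as np
+
+ def split_detections(scores, high_pct=75, low_floor=0.1):
+     """Partition one frame's detection scores into high- and low-confidence groups."""
+     scores = np.asarray(scores, dtype=float)
+     if scores.size == 0:
+         return np.array([], dtype=int), np.array([], dtype=int)
+     high_thr = np.percentile(scores, high_pct)        # adapts to the overall detections
+     low_thr = max(low_floor, 0.5 * high_thr)          # assumed relation between thresholds
+     high_idx = np.where(scores >= high_thr)[0]        # first association stage (as in ByteTrack)
+     low_idx = np.where((scores >= low_thr) & (scores < high_thr))[0]  # second stage
+     return high_idx, low_idx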
+
+ comment: The 12th International Conference on Control, Automation and + Information Sciences (ICCAIS 2023), November 27th to 29th, 2023 in Hanoi +
+
+
+
+
+ + ♻ ☆ Learning Task-Specific Strategies for Accelerated MRI + + +
+ Compressed sensing magnetic resonance imaging (CS-MRI) seeks to recover visual information from subsampled measurements for diagnostic tasks. Traditional CS-MRI methods often separately address measurement subsampling, image reconstruction, and task prediction, resulting in suboptimal end-to-end performance. In this work, we propose TACKLE as a unified co-design framework for jointly optimizing subsampling, reconstruction, and prediction strategies for performance on downstream tasks. The naive approach of simply appending a task prediction module and training with a task-specific loss leads to suboptimal downstream performance. Instead, we develop a training procedure where a backbone architecture is first trained for a generic pre-training task (image reconstruction in our case), and then fine-tuned for different downstream tasks with a prediction head. Experimental results on multiple public MRI datasets show that TACKLE achieves improved performance on various tasks over traditional CS-MRI methods. We also demonstrate that TACKLE is robust to distribution shifts by showing that it generalizes to a new dataset we experimentally collected using different acquisition setups from the training data. Without additional fine-tuning, TACKLE leads to both numerical and visual improvements compared to existing baselines. We have further implemented a learned 4$\times$-accelerated sequence on a Siemens 3T MRI Skyra scanner. Compared to the fully-sampled scan that takes 335 seconds, our optimized sequence takes only 84 seconds, achieving the desired four-fold time reduction while maintaining high performance. + +
+
+
+
+
+ + ♻ ☆ 3D Brain and Heart Volume Generative Models: A Survey + + +
+ Generative models such as generative adversarial networks and autoencoders +have gained a great deal of attention in the medical field due to their +excellent data generation capability. This paper provides a comprehensive +survey of generative models for three-dimensional (3D) volumes, focusing on the +brain and heart. A new and elaborate taxonomy of unconditional and conditional +generative models is proposed to cover diverse medical tasks for the brain and +heart: unconditional synthesis, classification, conditional synthesis, +segmentation, denoising, detection, and registration. We provide relevant +background, examine each task and also suggest potential future directions. A +list of the latest publications will be updated on Github to keep up with the +rapid influx of papers at +https://github.com/csyanbin/3D-Medical-Generative-Survey. + +
+
+ comment: Accepted at ACM Computing Surveys (CSUR) 2023 +
+
+
+
+
+ + ♻ ☆ Divide, Evaluate, and Refine: Evaluating and Improving Text-to-Image + Alignment with Iterative VQA Feedback + + +
+ The field of text-conditioned image generation has made unparalleled progress with the recent advent of latent diffusion models. While remarkable, as the complexity of the given text input increases, state-of-the-art diffusion models may still fail to generate images that accurately convey the semantics of the given prompt. Furthermore, such misalignments are often left undetected by pretrained multi-modal models such as CLIP. To address these problems, in this paper we explore a simple yet effective decompositional approach towards both evaluation and improvement of text-to-image alignment. In particular, we first introduce a Decompositional-Alignment-Score which, given a complex prompt, decomposes it into a set of disjoint assertions. The alignment of each assertion with the generated images is then measured using a VQA model. Finally, alignment scores for different assertions are combined a posteriori to give the final text-to-image alignment score. Experimental analysis reveals that the proposed alignment metric shows significantly higher correlation with human ratings than traditional CLIP and BLIP scores. Furthermore, we also find that the assertion-level alignment scores provide useful feedback which can then be used in a simple iterative procedure to gradually increase the expression of different assertions in the final image outputs. Human user studies indicate that the proposed approach surpasses the previous state-of-the-art by 8.7% in overall text-to-image alignment accuracy. The project page for our paper is available at https://1jsingh.github.io/divide-evaluate-and-refine + +
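+ A minimal sketch of the decompose-then-verify loop described above: the prompt is split into simple assertions, each assertion is checked against the generated image with a VQA model, and the per-assertion scores are combined afterwards. decompose_prompt and vqa_yes_probability are placeholders for an LLM-based splitter and an off-the-shelf VQA model, not the authors' code.
+
+ def alignment_score(prompt, image, decompose_prompt, vqa_yes_probability):
+     assertions = decompose_prompt(prompt)          # e.g. "a dog is present", "the dog is red"
+     scores = [vqa_yes_probability(image, f"Is it true that {a}?") for a in assertions]
+     overall = sum(scores) / len(scores)            # combined a posteriori into one score
+     return overall, scores                         # per-assertion scores double as feedback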
+
+
+
+
+
+
+
+ + Information Retrieval 8 + +
+
+
+ + ☆ Boosting legal case retrieval by query content selection with large + language models SIGIR + + +
+ Legal case retrieval, which aims to retrieve relevant cases to a given query +case, benefits judgment justice and attracts increasing attention. Unlike +generic retrieval queries, legal case queries are typically long and the +definition of relevance is closely related to legal-specific elements. +Therefore, legal case queries may suffer from noise and sparsity of salient +content, which hinders retrieval models from perceiving correct information in +a query. While previous studies have paid attention to improving retrieval +models and understanding relevance judgments, we focus on enhancing legal case +retrieval by utilizing the salient content in legal case queries. We first +annotate the salient content in queries manually and investigate how sparse and +dense retrieval models attend to those content. Then we experiment with various +query content selection methods utilizing large language models (LLMs) to +extract or summarize salient content and incorporate it into the retrieval +models. Experimental results show that reformulating long queries using LLMs +improves the performance of both sparse and dense models in legal case +retrieval. + +
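+ A sketch of the query-reformulation step the experiments evaluate: an LLM extracts or summarizes the salient content of a long case description before it is handed to a sparse or dense retriever. Both llm and retriever are placeholders, and the instruction wording is an assumption.
+
+ def retrieve_with_selected_content(raw_query, llm, retriever, top_k=10):
+     instruction = ("Summarize the legally salient facts and disputes in the following "
+                    "case description, keeping only content useful for finding similar cases:\n")
+     reformulated = llm(instruction + raw_query)     # shorter, salient-content query
+     return retriever.search(reformulated, k=top_k)  # works for both sparse and dense models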
+
+ comment: This work was accepted as a SIGIR-AP 2023 main conference paper
+
+
+
+
+ + ☆ DBCopilot: Scaling Natural Language Querying to Massive Databases + + +
+ Text-to-SQL simplifies database interactions by enabling non-experts to convert their natural language (NL) questions into Structured Query Language (SQL) queries. While recent advances in large language models (LLMs) have improved the zero-shot text-to-SQL paradigm, existing methods face scalability challenges when dealing with massive, dynamically changing databases. This paper introduces DBCopilot, a framework that addresses these challenges by employing a compact and flexible copilot model for routing across massive databases. Specifically, DBCopilot decouples the text-to-SQL process into schema routing and SQL generation, leveraging a lightweight sequence-to-sequence neural network-based router to formulate database connections and navigate natural language questions through databases and tables. The routed schemas and questions are then fed into LLMs for efficient SQL generation. Furthermore, DBCopilot also introduces a reverse schema-to-question generation paradigm, which can learn and adapt the router over massive databases automatically without requiring manual intervention. Experimental results demonstrate that DBCopilot is a scalable and effective solution for real-world text-to-SQL tasks, providing a significant advancement in handling large-scale schemas. + +
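+ A sketch of the two-stage decoupling described above: a lightweight router first maps the question to a candidate database and tables, and only the routed schema is serialized into the LLM prompt for SQL generation. The router, catalog, and llm objects are placeholders, not DBCopilot's actual interfaces.
+
+ def answer_question(question, router, schema_catalog, llm):
+     db, tables = router(question)                      # seq2seq routing over massive schemas
+     schema_text = schema_catalog.describe(db, tables)  # serialize only the routed schema
+     prompt = f"Schema:\n{schema_text}\n\nQuestion: {question}\nSQL:"
+     return llm(prompt)                                 # SQL generation on the reduced context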
+
+ comment: Code and data are available at https://github.com/tshu-w/DBCopilot +
+
+
+
+
+ + ☆ Rethinking E-Commerce Search + + +
+ E-commerce search and recommendation usually operate on structured data such +as product catalogs and taxonomies. However, creating better search and +recommendation systems often requires a large variety of unstructured data +including customer reviews and articles on the web. Traditionally, the solution +has always been converting unstructured data into structured data through +information extraction, and conducting search over the structured data. +However, this is a costly approach that often has low quality. In this paper, +we envision a solution that does entirely the opposite. Instead of converting +unstructured data (web pages, customer reviews, etc) to structured data, we +instead convert structured data (product inventory, catalogs, taxonomies, etc) +into textual data, which can be easily integrated into the text corpus that +trains LLMs. Then, search and recommendation can be performed through a Q/A +mechanism through an LLM instead of using traditional information retrieval +methods over structured data. + +
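+ A toy example of the direction argued for above: verbalize a structured catalog record into text so it can join the corpus an LLM is trained or grounded on, instead of extracting structure from text. The record fields below are invented for illustration.
+
+ def product_to_text(product: dict) -> str:
+     """Turn one catalog record into a natural-language passage."""
+     parts = [f"{product['title']} is a {product['category']} priced at ${product['price']:.2f}."]
+     for attr, value in product.get("attributes", {}).items():
+         parts.append(f"Its {attr} is {value}.")
+     return " ".join(parts)
+
+ print(product_to_text({"title": "Trail Runner X", "category": "running shoe",
+                        "price": 89.99, "attributes": {"weight": "240 g", "drop": "6 mm"}}))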
+
+
+
+
+ + ☆ Sports Recommender Systems: Overview and Research Issues + + +
+ Sports recommender systems receive increasing attention due to their potential for fostering healthy living, improving personal well-being, and increasing performance in sports. These systems support people in sports, for example, by the recommendation of healthy and performance-boosting food items, the recommendation of training practices, talent and team recommendation, and the recommendation of specific tactics in competitions. With applications in the virtual world, for example, the recommendation of maps or opponents in e-sports, these systems already transcend conventional sports scenarios where physical presence is needed. On the basis of different working examples, we present an overview of sports recommender systems applications and techniques. Overall, we analyze the related state of the art and discuss open research issues. + +
+
+ comment: Article under review in the Journal of Intelligent Information + Systems (Springer JIIS) +
+
+
+
+
+ + ♻ ☆ An In-depth Analysis of Passage-Level Label Transfer for Contextual + Document Ranking + + +
+ Pre-trained contextual language models such as BERT, GPT, and XLNet work quite well for document retrieval tasks. Such models are fine-tuned based on the query-document/query-passage level relevance labels to capture the ranking signals. However, the documents are longer than the passages and such document ranking models suffer from the token limitation (512) of BERT. Researchers have proposed ranking strategies that either truncate the documents beyond the token limit or chunk the documents into units that can fit into BERT. In the latter case, the relevance labels are either directly transferred from the original query-document pair or learned through some external model. In this paper, we conduct a detailed study of the design decisions about splitting and label transfer on retrieval effectiveness and efficiency. We find that direct transfer of relevance labels from documents to passages introduces label noise that strongly affects retrieval effectiveness for large training datasets. We also find that query processing times are adversely affected by fine-grained splitting schemes. As a remedy, we propose a careful passage-level labelling scheme using weak supervision that delivers improved performance (3-14% in terms of nDCG score) over most of the recently proposed models for ad-hoc retrieval while maintaining manageable computational complexity on four diverse document retrieval datasets. + +
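+ A sketch of the direct label-transfer scheme whose noise the paper analyzes: a long document is chunked into BERT-sized passages and every passage inherits the document-level relevance label. The 512-token budget and stride follow common practice and are assumptions here.
+
+ def document_to_passage_examples(query, doc_tokens, doc_label, max_len=512, stride=256):
+     examples = []
+     for start in range(0, len(doc_tokens), stride):
+         passage = doc_tokens[start:start + max_len]
+         examples.append((query, passage, doc_label))   # direct transfer of the document label
+         if start + max_len >= len(doc_tokens):
+             break
+     return examples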
+
+ comment: Performance analysis of contextual ranking strategies for ad-hoc document retrieval
+
+
+
+
+ + ♻ ☆ A Comprehensive Review of Visual-Textual Sentiment Analysis from Social + Media Networks + + +
+ Social media networks have become a significant aspect of people's lives, +serving as a platform for their ideas, opinions and emotions. Consequently, +automated sentiment analysis (SA) is critical for recognising people's feelings +in ways that other information sources cannot. The analysis of these feelings +revealed various applications, including brand evaluations, YouTube film +reviews and healthcare applications. As social media continues to develop, +people post a massive amount of information in different forms, including text, +photos, audio and video. Thus, traditional SA algorithms have become limited, +as they do not consider the expressiveness of other modalities. By including +such characteristics from various material sources, these multimodal data +streams provide new opportunities for optimising the expected results beyond +text-based SA. Our study focuses on the forefront field of multimodal SA, which +examines visual and textual data posted on social media networks. Many people +are more likely to utilise this information to express themselves on these +platforms. To serve as a resource for academics in this rapidly growing field, +we introduce a comprehensive overview of textual and visual SA, including data +pre-processing, feature extraction techniques, sentiment benchmark datasets, +and the efficacy of multiple classification methodologies suited to each field. +We also provide a brief introduction of the most frequently utilised data +fusion strategies and a summary of existing research on visual-textual SA. +Finally, we highlight the most significant challenges and investigate several +important sentiment applications. + +
+
+
+
+
+ + ♻ ☆ PEFA: Parameter-Free Adapters for Large-scale Embedding-based Retrieval + Models WSDM 2024 + + +
+ Embedding-based Retrieval Models (ERMs) have emerged as a promising framework for large-scale text retrieval problems due to powerful large language models. Nevertheless, fine-tuning ERMs to reach state-of-the-art results can be expensive due to the extreme scale of data as well as the complexity of multi-stage pipelines (e.g., pre-training, fine-tuning, distillation). In this work, we propose the PEFA framework, namely ParamEter-Free Adapters, for fast tuning of ERMs without any backward pass in the optimization. At the index building stage, PEFA equips the ERM with a non-parametric k-nearest neighbor (kNN) component. At the inference stage, PEFA performs a convex combination of two scoring functions, one from the ERM and the other from the kNN. Based on the neighborhood definition, the PEFA framework induces two realizations, namely PEFA-XL (i.e., extra large) using double ANN indices and PEFA-XS (i.e., extra small) using a single ANN index. Empirically, PEFA achieves significant improvement on two retrieval applications. For document retrieval, regarding the Recall@100 metric, PEFA improves not only pre-trained ERMs on Trivia-QA by an average of 13.2%, but also fine-tuned ERMs on NQ-320K by an average of 5.5%, respectively. For product search, PEFA improves the Recall@100 of the fine-tuned ERMs by an average of 5.3% and 14.5%, for PEFA-XS and PEFA-XL, respectively. Our code is available at https://github.com/amzn/pecos/tree/mainline/examples/pefa-wsdm24. + +
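+ A minimal sketch of the inference-time scoring described above: the ERM's similarity score is convexly combined with a non-parametric kNN score built over indexed training queries with known relevant documents. The toy data layout and the mixing weight lam are assumptions, not the released implementation.
+
+ import numpy as np
+
+ def pefa_style_score(q, doc_embs, train_embs, train_to_doc, lam=0.5, k=5):
+     """Score every document for one query embedding q."""
+     erm = doc_embs @ q                          # ERM similarity per document
+     sims = train_embs @ q                       # similarity to indexed training queries
+     knn = np.zeros(len(doc_embs))
+     for i in np.argsort(-sims)[:k]:             # the k nearest indexed queries vote
+         knn[train_to_doc[i]] += sims[i]         # for their known relevant documents
+     return lam * erm + (1.0 - lam) * knn        # convex combination of the two scores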
+
+ comment: Accepted by WSDM 2024
+
+
+
+
+ + ♻ ☆ Learning Category Trees for ID-Based Recommendation: Exploring the Power + of Differentiable Vector Quantization + + +
+ Category information plays a crucial role in enhancing the quality and +personalization of recommender systems. Nevertheless, the availability of item +category information is not consistently present, particularly in the context +of ID-based recommendations. In this work, we propose a novel approach to +automatically learn and generate entity (i.e., user or item) category trees for +ID-based recommendation. Specifically, we devise a differentiable vector +quantization framework for automatic category tree generation, namely CAGE, +which enables the simultaneous learning and refinement of categorical code +representations and entity embeddings in an end-to-end manner, starting from +the randomly initialized states. With its high adaptability, CAGE can be easily +integrated into both sequential and non-sequential recommender systems. We +validate the effectiveness of CAGE on various recommendation tasks including +list completion, collaborative filtering, and click-through rate prediction, +across different recommendation models. We release the code and data for others +to reproduce the reported results. + +
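+ The building block behind the approach above is differentiable vector quantization of entity embeddings into categorical codes. A minimal straight-through VQ layer (in the spirit of VQ-VAE) illustrates that mechanism; the codebook size, dimensions, and the omitted tree construction are assumptions, not the authors' code.
+
+ import torch
+ import torch.nn as nn
+
+ class VectorQuantizer(nn.Module):
+     def __init__(self, num_codes=64, dim=32):
+         super().__init__()
+         self.codebook = nn.Embedding(num_codes, dim)
+
+     def forward(self, z):                              # z: (batch, dim) entity embeddings
+         dists = torch.cdist(z, self.codebook.weight)   # distance to every code vector
+         codes = dists.argmin(dim=1)                    # discrete category assignment
+         q = self.codebook(codes)
+         commit_loss = ((q.detach() - z) ** 2).mean()   # pull embeddings toward their codes
+         q = z + (q - z).detach()                       # straight-through gradient estimator
+         return q, codes, commit_loss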
+
+
+
+
+
+
+
+ + Machine Learning 149 + +
+
+
+ + ☆ OneLLM: One Framework to Align All Modalities with Language + + +
+ Multimodal large language models (MLLMs) have gained significant attention +due to their strong multimodal understanding capability. However, existing +works rely heavily on modality-specific encoders, which usually differ in +architecture and are limited to common modalities. In this paper, we present +OneLLM, an MLLM that aligns eight modalities to language using a unified +framework. We achieve this through a unified multimodal encoder and a +progressive multimodal alignment pipeline. In detail, we first train an image +projection module to connect a vision encoder with LLM. Then, we build a +universal projection module (UPM) by mixing multiple image projection modules +and dynamic routing. Finally, we progressively align more modalities to LLM +with the UPM. To fully leverage the potential of OneLLM in following +instructions, we also curated a comprehensive multimodal instruction dataset, +including 2M items from image, audio, video, point cloud, depth/normal map, IMU +and fMRI brain activity. OneLLM is evaluated on 25 diverse benchmarks, +encompassing tasks such as multimodal captioning, question answering and +reasoning, where it delivers excellent performance. Code, data, model and +online demo are available at https://github.com/csuhan/OneLLM + +
+
+ comment: Code: https://github.com/csuhan/OneLLM +
+
+
+
+
+ + ☆ Memory Triggers: Unveiling Memorization in Text-To-Image Generative + Models through Word-Level Duplication + + +
+ Diffusion-based models, such as the Stable Diffusion model, have +revolutionized text-to-image synthesis with their ability to produce +high-quality, high-resolution images. These advancements have prompted +significant progress in image generation and editing tasks. However, these +models also raise concerns due to their tendency to memorize and potentially +replicate exact training samples, posing privacy risks and enabling adversarial +attacks. Duplication in training datasets is recognized as a major factor +contributing to memorization, and various forms of memorization have been +studied so far. This paper focuses on two distinct and underexplored types of +duplication that lead to replication during inference in diffusion-based +models, particularly in the Stable Diffusion model. We delve into these +lesser-studied duplication phenomena and their implications through two case +studies, aiming to contribute to the safer and more responsible use of +generative models in various applications. + +
+
+
+
+
+ + ☆ On the Role of Edge Dependency in Graph Generative Models + + +
+ In this work, we introduce a novel evaluation framework for generative models +of graphs, emphasizing the importance of model-generated graph overlap +(Chanpuriya et al., 2021) to ensure both accuracy and edge-diversity. We +delineate a hierarchy of graph generative models categorized into three levels +of complexity: edge independent, node independent, and fully dependent models. +This hierarchy encapsulates a wide range of prevalent methods. We derive +theoretical bounds on the number of triangles and other short-length cycles +producible by each level of the hierarchy, contingent on the model overlap. We +provide instances demonstrating the asymptotic optimality of our bounds. +Furthermore, we introduce new generative models for each of the three +hierarchical levels, leveraging dense subgraph discovery (Gionis & Tsourakakis, +2015). Our evaluation, conducted on real-world datasets, focuses on assessing +the output quality and overlap of our proposed models in comparison to other +popular models. Our results indicate that our simple, interpretable models +provide competitive baselines to popular generative models. Through this +investigation, we aim to propel the advancement of graph generative models by +offering a structured framework and robust evaluation metrics, thereby +facilitating the development of models capable of generating accurate and +edge-diverse graphs. + +
+
+
+
+
+ + ☆ Inverse Design of Vitrimeric Polymers by Molecular Dynamics and + Generative Modeling + + +
+ Vitrimer is a new class of sustainable polymers with the ability of +self-healing through rearrangement of dynamic covalent adaptive networks. +However, a limited choice of constituent molecules restricts their property +space, prohibiting full realization of their potential applications. Through a +combination of molecular dynamics (MD) simulations and machine learning (ML), +particularly a novel graph variational autoencoder (VAE) model, we establish a +method for generating novel vitrimers and guide their inverse design based on +desired glass transition temperature (Tg). We build the first vitrimer dataset +of one million and calculate Tg on 8,424 of them by high-throughput MD +simulations calibrated by a Gaussian process model. The proposed VAE employs +dual graph encoders and a latent dimension overlapping scheme which allows for +individual representation of multi-component vitrimers. By constructing a +continuous latent space containing necessary information of vitrimers, we +demonstrate high accuracy and efficiency of our framework in discovering novel +vitrimers with desirable Tg beyond the training regime. The proposed vitrimers +with reasonable synthesizability cover a wide range of Tg and broaden the +potential widespread usage of vitrimeric materials. + +
+
+
+
+
+ + ☆ What Planning Problems Can A Relational Neural Network Solve? NeurIPS 2023 + + +
+ Goal-conditioned policies are generally understood to be "feed-forward" +circuits, in the form of neural networks that map from the current state and +the goal specification to the next action to take. However, under what +circumstances such a policy can be learned and how efficient the policy will be +are not well understood. In this paper, we present a circuit complexity +analysis for relational neural networks (such as graph neural networks and +transformers) representing policies for planning problems, by drawing +connections with serialized goal regression search (S-GRS). We show that there +are three general classes of planning problems, in terms of the growth of +circuit width and depth as a function of the number of objects and planning +horizon, providing constructive proofs. We also illustrate the utility of this +analysis for designing neural networks for policy learning. + +
+
+ comment: NeurIPS 2023 (Spotlight). Project page: + https://concepts-ai.com/p/goal-regression-width/ +
+
+
+
+
+ + ☆ GeoShapley: A Game Theory Approach to Measuring Spatial Effects in + Machine Learning Models + + +
+ This paper introduces GeoShapley, a game theory approach to measuring spatial +effects in machine learning models. GeoShapley extends the Nobel Prize-winning +Shapley value framework in game theory by conceptualizing location as a player +in a model prediction game, which enables the quantification of the importance +of location and the synergies between location and other features in a model. +GeoShapley is a model-agnostic approach and can be applied to statistical or +black-box machine learning models in various structures. The interpretation of +GeoShapley is directly linked with spatially varying coefficient models for +explaining spatial effects and additive models for explaining non-spatial +effects. Using simulated data, GeoShapley values are validated against known +data-generating processes and are used for cross-comparison of seven +statistical and machine learning models. An empirical example of house price +modeling is used to illustrate GeoShapley's utility and interpretation with +real world data. The method is available as an open-source Python package named +geoshapley. + +
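+ A brute-force illustration of treating location as one player in a cooperative game: the Shapley value of each feature (including the location feature) is its average marginal contribution over all feature orderings, with absent features filled in from a background sample. This generic, exponential-cost sketch only conveys the concept and is not the geoshapley package.
+
+ import math
+ from itertools import permutations
+ import numpy as np
+
+ def shapley_values(model, x, background, feature_names):
+     """Exact Shapley values for one instance x; feasible only for a few features."""
+     n = len(feature_names)
+     def v(subset):                                    # model output when only `subset` is known
+         z = background.mean(axis=0).copy()            # unknown features take background values
+         z[list(subset)] = x[list(subset)]
+         return float(model(z.reshape(1, -1)))
+     phi = np.zeros(n)
+     for order in permutations(range(n)):
+         known = []
+         for i in order:
+             before = v(known)
+             known.append(i)
+             phi[i] += v(known) - before               # marginal contribution of feature i
+     return dict(zip(feature_names, phi / math.factorial(n)))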
+
+ comment: 30 pages, 10 figures, 6 tables +
+
+
+
+
+ + ☆ On the Role of the Action Space in Robot Manipulation Learning and + Sim-to-Real Transfer + + +
+ We study the choice of action space in robot manipulation learning and +sim-to-real transfer. We define metrics that assess the performance, and +examine the emerging properties in the different action spaces. We train over +250 reinforcement learning~(RL) agents in simulated reaching and pushing tasks, +using 13 different control spaces. The choice of action spaces spans popular +choices in the literature as well as novel combinations of common design +characteristics. We evaluate the training performance in simulation and the +transfer to a real-world environment. We identify good and bad characteristics +of robotic action spaces and make recommendations for future designs. Our +findings have important implications for the design of RL algorithms for robot +manipulation tasks, and highlight the need for careful consideration of action +spaces when training and transferring RL agents for real-world robotics. + +
+
+
+
+
+ + ☆ Direct Exoplanet Detection Using Deep Convolutional Image Reconstruction + (ConStruct): A New Algorithm for Post-Processing High-Contrast Images + + +
+ We present a novel machine-learning approach for detecting faint point +sources in high-contrast adaptive optics imaging datasets. The most widely used +algorithms for primary subtraction aim to decouple bright stellar speckle noise +from planetary signatures by subtracting an approximation of the temporally +evolving stellar noise from each frame in an imaging sequence. Our approach +aims to improve the stellar noise approximation and increase the planet +detection sensitivity by leveraging deep learning in a novel direct imaging +post-processing algorithm. We show that a convolutional autoencoder neural +network, trained on an extensive reference library of real imaging sequences, +accurately reconstructs the stellar speckle noise at the location of a +potential planet signal. This tool is used in a post-processing algorithm we +call Direct Exoplanet Detection with Convolutional Image Reconstruction, or +ConStruct. The reliability and sensitivity of ConStruct are assessed using real +Keck/NIRC2 angular differential imaging datasets. Of the 30 unique point +sources we examine, ConStruct yields a higher S/N than traditional PCA-based +processing for 67$\%$ of the cases and improves the relative contrast by up to +a factor of 2.6. This work demonstrates the value and potential of deep +learning to take advantage of a diverse reference library of point spread +function realizations to improve direct imaging post-processing. ConStruct and +its future improvements may be particularly useful as tools for post-processing +high-contrast images from the James Webb Space Telescope and extreme adaptive +optics instruments, both for the current generation and those being designed +for the upcoming 30 meter-class telescopes. + +
+
+
+
+
+ + ☆ An Integration of Pre-Trained Speech and Language Models for End-to-End + Speech Recognition + + +
+ Advances in machine learning have made it possible to perform various text +and speech processing tasks, including automatic speech recognition (ASR), in +an end-to-end (E2E) manner. Since typical E2E approaches require large amounts +of training data and resources, leveraging pre-trained foundation models +instead of training from scratch is gaining attention. Although there have been +attempts to use pre-trained speech and language models in ASR, most of them are +limited to using either. This paper explores the potential of integrating a +pre-trained speech representation model with a large language model (LLM) for +E2E ASR. The proposed model enables E2E ASR by generating text tokens in an +autoregressive manner via speech representations as speech prompts, taking +advantage of the vast knowledge provided by the LLM. Furthermore, the proposed +model can incorporate remarkable developments for LLM utilization, such as +inference optimization and parameter-efficient domain adaptation. Experimental +results show that the proposed model achieves performance comparable to modern +E2E ASR models. + +
+
+ comment: 6 pages, 2 figures, 3 tables, The model is available at + https://huggingface.co/rinna/nue-asr +
+
+
+
+
+ + ☆ Towards small and accurate convolutional neural networks for acoustic + biodiversity monitoring + + +
+ Automated classification of animal sounds is a prerequisite for large-scale monitoring of biodiversity. Convolutional Neural Networks (CNNs) are among the most promising algorithms, but they are slow, often achieve poor classification in the field, and typically require large training data sets. Our objective was to design CNNs that are fast at inference time and achieve good classification performance while learning from moderate-sized data. Recordings from a rainforest ecosystem were used. Start and end points of sounds from 20 bird species were manually annotated. Spectrograms from 10 second segments were used as CNN input. We designed simple CNNs with a frequency unwrapping layer (SIMP-FU models) such that any output unit was connected to all spectrogram frequencies but only to a sub-region of time, the Receptive Field (RF). Our models allowed experimentation with different RF durations. Models either used the time-indexed labels that encode start and end points of sounds or simpler segment-level labels. Models learning from time-indexed labels performed considerably better than their segment-level counterparts. The best classification performance was achieved for models with an intermediate RF duration of 1.5 seconds. The best SIMP-FU models achieved AUCs over 0.95 in 18 of 20 classes on the test set. On compact low-cost hardware the best SIMP-FU models ran up to seven times faster than real-time data acquisition. RF duration was a major driver of classification performance. The optimum of 1.5 s was in the same range as the duration of the sounds. Our models achieved good classification performance while learning from moderate-sized training data. This is explained by the usage of time-indexed labels during training and an adequately sized RF. The results confirm the feasibility of deploying small CNNs with good classification performance on compact low-cost devices. + +
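+ A hedged sketch of the architectural idea described above: the first convolution spans the entire frequency axis ("frequency unwrapping"), so every output unit sees all frequencies, while the temporal receptive field is set by the time-only convolutions that follow; the per-time-step head matches the time-indexed labels. Channel counts, kernel sizes, and the mel resolution are assumptions.
+
+ import torch.nn as nn
+
+ class SimpFU(nn.Module):
+     def __init__(self, n_mels=64, n_classes=20):
+         super().__init__()
+         # Kernel height = n_mels: each output unit is connected to all frequencies.
+         self.unwrap = nn.Conv2d(1, 32, kernel_size=(n_mels, 3), padding=(0, 1))
+         self.temporal = nn.Sequential(                 # time-only convolutions set the RF
+             nn.ReLU(), nn.Conv1d(32, 64, kernel_size=9, padding=4),
+             nn.ReLU(), nn.Conv1d(64, 64, kernel_size=9, padding=4), nn.ReLU(),
+         )
+         self.head = nn.Conv1d(64, n_classes, kernel_size=1)   # per-time-step logits
+
+     def forward(self, spec):               # spec: (batch, 1, n_mels, time)
+         x = self.unwrap(spec).squeeze(2)   # -> (batch, 32, time)
+         return self.head(self.temporal(x)) # -> (batch, n_classes, time)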
+
+
+
+
+ + ☆ Interpretability Illusions in the Generalization of Simplified Models + + +
+ A common method to study deep learning systems is to use simplified model representations -- for example, using singular value decomposition to visualize the model's hidden states in a lower dimensional space. This approach assumes that the results of these simplified representations are faithful to the original model. Here, we illustrate an important caveat to this assumption: even if the simplified representations can accurately approximate the full model on the training set, they may fail to accurately capture the model's behavior out of distribution -- the understanding developed from simplified representations may be an illusion. We illustrate this by training Transformer models on controlled datasets with systematic generalization splits. First, we train models on the Dyck balanced-parenthesis languages. We simplify these models using tools like dimensionality reduction and clustering, and then explicitly test how these simplified proxies match the behavior of the original model on various out-of-distribution test sets. We find that the simplified proxies are generally less faithful out of distribution. In cases where the original model generalizes to novel structures or deeper depths, the simplified versions may fail, or generalize better. This finding holds even if the simplified representations do not directly depend on the training distribution. Next, we study a more naturalistic task: predicting the next character in a dataset of computer code. We find similar generalization gaps between the original model and simplified proxies, and conduct further analysis to investigate which aspects of the code completion task are associated with the largest gaps. Together, our results raise questions about the extent to which mechanistic interpretations derived using tools like SVD can reliably predict what a model will do in novel situations. + +
+
+
+
+
+ + ☆ Efficient Inverse Design Optimization through Multi-fidelity + Simulations, Machine Learning, and Search Space Reduction Strategies + + +
+ This paper introduces a methodology designed to augment the inverse design +optimization process in scenarios constrained by limited compute, through the +strategic synergy of multi-fidelity evaluations, machine learning models, and +optimization algorithms. The proposed methodology is analyzed on two distinct +engineering inverse design problems: airfoil inverse design and the scalar +field reconstruction problem. It leverages a machine learning model trained +with low-fidelity simulation data, in each optimization cycle, thereby +proficiently predicting a target variable and discerning whether a +high-fidelity simulation is necessitated, which notably conserves computational +resources. Additionally, the machine learning model is strategically deployed +prior to optimization to reduce the search space, thereby further accelerating +convergence toward the optimal solution. The methodology has been employed to +enhance two optimization algorithms, namely Differential Evolution and Particle +Swarm Optimization. Comparative analyses illustrate performance improvements +across both algorithms. Notably, this method is adeptly adaptable across any +inverse design application, facilitating a harmonious synergy between a +representative low-fidelity machine learning model, and high-fidelity +simulation, and can be seamlessly applied across any variety of +population-based optimization algorithms. + +
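+ A sketch of the evaluation gate described above, assuming a minimization objective: a cheap surrogate trained on low-fidelity data predicts the target first, and the expensive high-fidelity simulation is run only when the candidate looks competitive. The surrogate interface (mean and uncertainty) and the skip rule are assumptions for illustration.
+
+ def evaluate(candidate, surrogate, high_fidelity_sim, best_so_far, margin=0.05):
+     mean, std = surrogate.predict(candidate)       # e.g. a Gaussian-process surrogate
+     if mean - std > best_so_far * (1 + margin):    # even the optimistic estimate is poor:
+         return mean                                # keep the cheap estimate, skip the simulation
+     return high_fidelity_sim(candidate)            # otherwise pay for the accurate evaluation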
+
+
+
+
+ + ☆ MIRACLE: Inverse Reinforcement and Curriculum Learning Model for Human-inspired Mobile Robot Navigation + + +
+ In emergency scenarios, mobile robots must navigate like humans, interpreting +stimuli to locate potential victims rapidly without interfering with first +responders. Existing socially-aware navigation algorithms face computational +and adaptability challenges. To overcome these, we propose a solution, MIRACLE +-- an inverse reinforcement and curriculum learning model, that employs +gamified learning to gather stimuli-driven human navigational data. This data +is then used to train a Deep Inverse Maximum Entropy Reinforcement Learning +model, reducing reliance on demonstrator abilities. Testing reveals a low loss +of 2.7717 within a 400-sized environment, signifying human-like response +replication. Current databases lack comprehensive stimuli-driven data, +necessitating our approach. By doing so, we enable robots to navigate emergency +situations with human-like perception, enhancing their life-saving +capabilities. + +
+
+
+
+
+ + ☆ Editable Stain Transformation Of Histological Images Using Unpaired GANs + + +
+ Double staining in histopathology, particularly for metaplastic breast +cancer, typically employs H&E and P63 dyes. However, P63's tissue damage and +high cost necessitate alternative methods. This study introduces xAI-CycleGAN, +an advanced architecture combining Mask CycleGAN with explainability features +and structure-preserving capabilities for transforming H&E stained breast +tissue images into P63-like images. The architecture allows for output editing, +enhancing resemblance to actual images and enabling further model refinement. +We showcase xAI-CycleGAN's efficacy in maintaining structural integrity and +generating high-quality images. Additionally, a histopathologist survey +indicates the generated images' realism is often comparable to actual images, +validating our model's high-quality output. + +
+
+ comment: 13 pages, 5 figures, AIRCAD ICIAP 2023 in Udine +
+
+
+
+
+ + ☆ MACCA: Offline Multi-agent Reinforcement Learning with Causal Credit + Assignment + + +
+ Offline Multi-agent Reinforcement Learning (MARL) is valuable in scenarios +where online interaction is impractical or risky. While independent learning in +MARL offers flexibility and scalability, accurately assigning credit to +individual agents in offline settings poses challenges due to partial +observability and emergent behavior. Directly transferring the online credit +assignment method to offline settings results in suboptimal outcomes due to the +absence of real-time feedback and intricate agent interactions. Our approach, +MACCA, characterizing the generative process as a Dynamic Bayesian Network, +captures relationships between environmental variables, states, actions, and +rewards. Estimating this model on offline data, MACCA can learn each agent's +contribution by analyzing the causal relationship of their individual rewards, +ensuring accurate and interpretable credit assignment. Additionally, the +modularity of our approach allows it to seamlessly integrate with various +offline MARL methods. Theoretically, we proved that under the setting of the +offline dataset, the underlying causal structure and the function for +generating the individual rewards of agents are identifiable, which laid the +foundation for the correctness of our modeling. Experimentally, we tested MACCA +in two environments, including discrete and continuous action settings. The +results show that MACCA outperforms SOTA methods and improves performance upon +their backbones. + +
+
+ comment: 16 pages, 4 figures +
+
+
+
+
+ + ☆ Transformer-Powered Surrogates Close the ICF Simulation-Experiment Gap + with Extremely Limited Data + + +
+ Recent advances in machine learning, specifically transformer architecture, +have led to significant advancements in commercial domains. These powerful +models have demonstrated superior capability to learn complex relationships and +often generalize better to new data and problems. This paper presents a novel +transformer-powered approach for enhancing prediction accuracy in multi-modal +output scenarios, where sparse experimental data is supplemented with +simulation data. The proposed approach integrates transformer-based +architecture with a novel graph-based hyper-parameter optimization technique. +The resulting system not only effectively reduces simulation bias, but also +achieves superior prediction accuracy compared to the prior method. We +demonstrate the efficacy of our approach on inertial confinement fusion +experiments, where only 10 shots of real-world data are available, as well as +synthetic versions of these experiments. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ MotionCtrl: A Unified and Flexible Motion Controller for Video + Generation + + +
+ Motions in a video primarily consist of camera motion, induced by camera +movement, and object motion, resulting from object movement. Accurate control +of both camera and object motion is essential for video generation. However, +existing works either mainly focus on one type of motion or do not clearly +distinguish between the two, limiting their control capabilities and diversity. +Therefore, this paper presents MotionCtrl, a unified and flexible motion +controller for video generation designed to effectively and independently +control camera and object motion. The architecture and training strategy of +MotionCtrl are carefully devised, taking into account the inherent properties +of camera motion, object motion, and imperfect training data. Compared to +previous methods, MotionCtrl offers three main advantages: 1) It effectively +and independently controls camera motion and object motion, enabling more +fine-grained motion control and facilitating flexible and diverse combinations +of both types of motion. 2) Its motion conditions are determined by camera +poses and trajectories, which are appearance-free and minimally impact the +appearance or shape of objects in generated videos. 3) It is a relatively +generalizable model that can adapt to a wide array of camera poses and +trajectories once trained. Extensive qualitative and quantitative experiments +have been conducted to demonstrate the superiority of MotionCtrl over existing +methods. + +
+
+ comment: Project Page: https://wzhouxiff.github.io/projects/MotionCtrl/ +
+
+
+
+
+ + ☆ Not All Large Language Models (LLMs) Succumb to the "Reversal Curse": A + Comparative Study of Deductive Logical Reasoning in BERT and GPT Models + + +
+ The "Reversal Curse" refers to the scenario where auto-regressive decoder +large language models (LLMs), such as ChatGPT, trained on "A is B" fail to +learn "B is A", demonstrating a basic failure of logical deduction. This raises +a red flag in the use of GPT models for certain general tasks such as +constructing knowledge graphs, considering their adherence to this symmetric +principle. In our study, we examined a bidirectional LLM, BERT, and found that +it is immune to the reversal curse. Driven by ongoing efforts to construct +biomedical knowledge graphs with LLMs, we also embarked on evaluating more +complex but essential deductive reasoning capabilities. This process included +first training encoder and decoder language models to master the intersection +($\cap$) and union ($\cup$) operations on two sets and then moving on to assess +their capability to infer different combinations of union ($\cup$) and +intersection ($\cap$) operations on three newly created sets. The findings +showed that while both encoder and decoder language models, trained for tasks +involving two sets (union/intersection), were proficient in such scenarios, +they encountered difficulties when dealing with operations that included three +sets (various combinations of union and intersection). Our research highlights +the distinct characteristics of encoder and decoder models in simple and +complex logical reasoning. In practice, the choice between BERT and GPT should +be guided by the specific requirements and nature of the task at hand, +leveraging their respective strengths in bidirectional context comprehension +and sequence prediction. + +
+
+
+
+
+ + ☆ Multimodal Data and Resource Efficient Device-Directed Speech Detection + with Large Foundation Models + + +
+ Interactions with virtual assistants typically start with a trigger phrase +followed by a command. In this work, we explore the possibility of making these +interactions more natural by eliminating the need for a trigger phrase. Our +goal is to determine whether a user addressed the virtual assistant based on +signals obtained from the streaming audio recorded by the device microphone. We +address this task by combining 1-best hypotheses and decoder signals from an +automatic speech recognition system with acoustic representations from an audio +encoder as input features to a large language model (LLM). In particular, we +are interested in data and resource efficient systems that require only a small +amount of training data and can operate in scenarios with only a single frozen +LLM available on a device. For this reason, our model is trained on 80k or less +examples of multimodal data using a combination of low-rank adaptation and +prefix tuning. We compare the proposed system to unimodal baselines and show +that the multimodal approach achieves lower equal-error-rates (EERs), while +using only a fraction of the training data. We also show that low-dimensional +specialized audio representations lead to lower EERs than high-dimensional +general audio representations. + +
+
+
+
+
+ + ☆ Evaluation of Active Feature Acquisition Methods for Static Feature + Settings + + +
+ Active feature acquisition (AFA) agents, crucial in domains like healthcare +where acquiring features is often costly or harmful, determine the optimal set +of features for a subsequent classification task. As deploying an AFA agent +introduces a shift in missingness distribution, it's vital to assess its +expected performance at deployment using retrospective data. In a companion +paper, we introduce a semi-offline reinforcement learning (RL) framework for +active feature acquisition performance evaluation (AFAPE) where features are +assumed to be time-dependent. Here, we study and extend the AFAPE problem to +cover static feature settings, where features are time-invariant, and hence +provide more flexibility to the AFA agents in deciding the order of the +acquisitions. In this static feature setting, we derive and adapt new inverse +probability weighting (IPW), direct method (DM), and double reinforcement +learning (DRL) estimators within the semi-offline RL framework. These +estimators can be applied when the missingness in the retrospective dataset +follows a missing-at-random (MAR) pattern. They also can be applied to +missing-not-at-random (MNAR) patterns in conjunction with appropriate existing +missing data techniques. We illustrate the improved data efficiency offered by +the semi-offline RL estimators in synthetic and real-world data experiments +under synthetic MAR and MNAR missingness. + +
+
+ comment: 38 pages, 7 figures, 5 tables. arXiv admin note: substantial text + overlap with arXiv:2312.01530 +
+
+
+
+
+ + ☆ Physical Symbolic Optimization NeurIPS 2023 + + +
+ We present a framework for constraining the automatic sequential generation +of equations to obey the rules of dimensional analysis by construction. +Combining this approach with reinforcement learning, we built $\Phi$-SO, a +Physical Symbolic Optimization method for recovering analytical functions from +physical data leveraging units constraints. Our symbolic regression algorithm +achieves state-of-the-art results in contexts in which variables and constants +have known physical units, outperforming all other methods on SRBench's Feynman +benchmark in the presence of noise (exceeding 0.1%) and showing resilience even +in the presence of significant (10%) levels of noise. + +
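As an illustration of the units constraint in dimensional analysis (a sketch in its spirit, not the $\Phi$-SO code), physical units can be tracked as exponent vectors over the SI base units: addition requires identical vectors, while multiplication adds them, so dimensionally invalid candidate expressions can be rejected during generation.

```python
import numpy as np

# Units as exponent vectors over (m, s, kg): e.g. velocity = m^1 s^-1.
LENGTH   = np.array([1, 0, 0])
TIME     = np.array([0, 1, 0])
MASS     = np.array([0, 0, 1])
VELOCITY = LENGTH - TIME
ENERGY   = MASS + 2 * LENGTH - 2 * TIME    # kg m^2 s^-2

def mul(u, v):
    return u + v                            # units multiply -> exponents add

def add(u, v):
    if not np.array_equal(u, v):            # only same-unit quantities can be summed
        raise ValueError("dimensionally invalid addition")
    return u

# Kinetic-energy-like candidate m * v * v has energy units -> accepted.
assert np.array_equal(mul(MASS, mul(VELOCITY, VELOCITY)), ENERGY)

# A candidate like "m + v" is rejected by construction.
try:
    add(MASS, VELOCITY)
except ValueError as e:
    print("rejected candidate:", e)
```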
+
+ comment: 6 pages, 2 figures, 1 table. Accepted to NeurIPS 2023, Machine + Learning for Physical Sciences workshop +
+
+
+
+
+ + ☆ DreamComposer: Controllable 3D Object Generation via Multi-View + Conditions + + +
+ Utilizing pre-trained 2D large-scale generative models, recent works are +capable of generating high-quality novel views from a single in-the-wild image. +However, due to the lack of information from multiple views, these works +encounter difficulties in generating controllable novel views. In this paper, +we present DreamComposer, a flexible and scalable framework that can enhance +existing view-aware diffusion models by injecting multi-view conditions. +Specifically, DreamComposer first uses a view-aware 3D lifting module to obtain +3D representations of an object from multiple views. Then, it renders the +latent features of the target view from 3D representations with the multi-view +feature fusion module. Finally the target view features extracted from +multi-view inputs are injected into a pre-trained diffusion model. Experiments +show that DreamComposer is compatible with state-of-the-art diffusion models +for zero-shot novel view synthesis, further enhancing them to generate +high-fidelity novel view images with multi-view conditions, ready for +controllable 3D object reconstruction and various other applications. + +
+
+ comment: Project Page: https://yhyang-myron.github.io/DreamComposer/ +
+
+
+
+
+ + ☆ DiffusionSat: A Generative Foundation Model for Satellite Imagery + + +
+ Diffusion models have achieved state-of-the-art results on many modalities +including images, speech, and video. However, existing models are not tailored +to support remote sensing data, which is widely used in important applications +including environmental monitoring and crop-yield prediction. Satellite images +are significantly different from natural images -- they can be multi-spectral, +irregularly sampled across time -- and existing diffusion models trained on +images from the Web do not support them. Furthermore, remote sensing data is +inherently spatio-temporal, requiring conditional generation tasks not +supported by traditional methods based on captions or images. In this paper, we +present DiffusionSat, to date the largest generative foundation model trained +on a collection of publicly available large, high-resolution remote sensing +datasets. As text-based captions are sparsely available for satellite images, +we incorporate the associated metadata such as geolocation as conditioning +information. Our method produces realistic samples and can be used to solve +multiple generative tasks including temporal generation, superresolution given +multi-spectral inputs and in-painting. Our method outperforms previous +state-of-the-art methods for satellite image generation and is the first +large-scale $\textit{generative}$ foundation model for satellite imagery. + +
+
+
+
+
+ + ☆ MMM: Generative Masked Motion Model + + +
+ Recent advances in text-to-motion generation using diffusion and +autoregressive models have shown promising results. However, these models often +suffer from a trade-off between real-time performance, high fidelity, and +motion editability. To address this gap, we introduce MMM, a novel yet simple +motion generation paradigm based on Masked Motion Model. MMM consists of two +key components: (1) a motion tokenizer that transforms 3D human motion into a +sequence of discrete tokens in latent space, and (2) a conditional masked +motion transformer that learns to predict randomly masked motion tokens, +conditioned on the pre-computed text tokens. By attending to motion and text +tokens in all directions, MMM explicitly captures inherent dependency among +motion tokens and semantic mapping between motion and text tokens. During +inference, this allows parallel and iterative decoding of multiple motion +tokens that are highly consistent with fine-grained text descriptions, +therefore simultaneously achieving high-fidelity and high-speed motion +generation. In addition, MMM has innate motion editability. By simply placing +mask tokens in the place that needs editing, MMM automatically fills the gaps +while guaranteeing smooth transitions between editing and non-editing parts. +Extensive experiments on the HumanML3D and KIT-ML datasets demonstrate that MMM +surpasses current leading methods in generating high-quality motion (evidenced +by superior FID scores of 0.08 and 0.429), while offering advanced editing +features such as body-part modification, motion in-betweening, and the +synthesis of long motion sequences. In addition, MMM is two orders of magnitude +faster on a single mid-range GPU than editable motion diffusion models. Our +project page is available at \url{https://exitudio.github.io/MMM-page}. + +
+
+
+
+
+ + ☆ Invariance & Causal Representation Learning: Prospects and Limitations + + +
+ In causal models, a given mechanism is assumed to be invariant to changes of +other mechanisms. While this principle has been utilized for inference in +settings where the causal variables are observed, theoretical insights when the +variables of interest are latent are largely missing. We assay the connection +between invariance and causal representation learning by establishing +impossibility results which show that invariance alone is insufficient to +identify latent causal variables. Together with practical considerations, we +use these theoretical findings to highlight the need for additional constraints +in order to identify representations by exploiting invariance. + +
+
+
+
+
+ + ☆ Blueprinting the Future: Automatic Item Categorization using + Hierarchical Zero-Shot and Few-Shot Classifiers + + +
+ In the testing industry, precise item categorization is pivotal to align exam questions with the designated content domains outlined in the assessment blueprint. Traditional methods either entail manual classification, which is laborious and error-prone, or utilize machine learning requiring extensive training data, often leading to model underfit or overfit issues. This study unveils a novel approach employing the zero-shot and few-shot Generative Pretrained Transformer (GPT) classifier for hierarchical item categorization, minimizing the need for training data and instead leveraging human-like language descriptions to define categories. Through a structured Python dictionary, the hierarchical nature of examination blueprints is navigated seamlessly, allowing for a tiered classification of items across multiple levels. An initial simulation with artificial data demonstrates the efficacy of this method, achieving an average accuracy of 92.91% as measured by the F1 score. This method was further applied to real exam items from the 2022 In-Training Examination (ITE) conducted by the American Board of Family Medicine (ABFM), swiftly reclassifying 200 items according to a newly formulated blueprint in 15 minutes, a task that traditionally could span several days among editors and physicians. This innovative approach not only drastically cuts down classification time but also ensures a consistent, principle-driven categorization, minimizing human biases and discrepancies. The ability to refine classifications by adjusting definitions adds to its robustness and sustainability.
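The tiered walk over a nested blueprint dictionary can be pictured with the sketch below (our illustration; the `classify` function is a hypothetical stand-in for a zero-shot GPT call, here replaced by a keyword heuristic so the example runs offline, and the blueprint content is invented).

```python
# Tiered zero-shot classification over a nested blueprint (illustrative sketch only).

blueprint = {
    "Cardiovascular": {
        "Hypertension": {},
        "Heart failure": {},
    },
    "Respiratory": {
        "Asthma": {},
        "COPD": {},
    },
}

def classify(item_text, options):
    """Placeholder for an LLM call: in practice, prompt the model with the item text
    and the option descriptions and parse its single-choice answer."""
    return max(options, key=lambda o: sum(w.lower() in item_text.lower()
                                          for w in o.split()))

def categorize(item_text, node, path=()):
    if not node:                        # leaf category reached
        return path
    choice = classify(item_text, list(node.keys()))
    return categorize(item_text, node[choice], path + (choice,))

item = "Wheezing and shortness of breath in a patient with asthma; respiratory exam ..."
print(categorize(item, blueprint))      # -> ('Respiratory', 'Asthma')
```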
+
+
+
+
+ + ☆ Personalized Face Inpainting with Diffusion Models by Parallel Visual + Attention + + +
+ Face inpainting is important in various applications, such as photo +restoration, image editing, and virtual reality. Despite the significant +advances in face generative models, ensuring that a person's unique facial +identity is maintained during the inpainting process is still an elusive goal. +Current state-of-the-art techniques, exemplified by MyStyle, necessitate +resource-intensive fine-tuning and a substantial number of images for each new +identity. Furthermore, existing methods often fall short in accommodating +user-specified semantic attributes, such as beard or expression. To improve +inpainting results, and reduce the computational complexity during inference, +this paper proposes the use of Parallel Visual Attention (PVA) in conjunction +with diffusion models. Specifically, we insert parallel attention matrices to +each cross-attention module in the denoising network, which attends to features +extracted from reference images by an identity encoder. We train the added +attention modules and identity encoder on CelebAHQ-IDI, a dataset proposed for +identity-preserving face inpainting. Experiments demonstrate that PVA attains +unparalleled identity resemblance in both face inpainting and face inpainting +with language guidance tasks, in comparison to various benchmarks, including +MyStyle, Paint by Example, and Custom Diffusion. Our findings reveal that PVA +ensures good identity preservation while offering effective +language-controllability. Additionally, in contrast to Custom Diffusion, PVA +requires just 40 fine-tuning steps for each new identity, which translates to a +significant speed increase of over 20 times. + +
+
+
+
+
+ + ☆ On the Diversity and Realism of Distilled Dataset: An Efficient Dataset + Distillation Paradigm + + +
+ Contemporary machine learning requires training large neural networks on massive datasets and thus faces the challenge of high computational demands. Dataset distillation, as a recent emerging strategy, aims to compress real-world datasets for efficient training. However, this line of research currently struggles with large-scale and high-resolution datasets, hindering its practicality and feasibility. To this end, we re-examine the existing dataset distillation methods and identify three properties required for large-scale real-world applications, namely, realism, diversity, and efficiency. As a remedy, we propose RDED, a novel computationally-efficient yet effective data distillation paradigm, to enable both diversity and realism of the distilled data. Extensive empirical results over various neural architectures and datasets demonstrate the advancement of RDED: we can distill the full ImageNet-1K to a small dataset comprising 10 images per class within 7 minutes, achieving a notable 42% top-1 accuracy with ResNet-18 on a single RTX-4090 GPU (while the SOTA only achieves 21% but requires 6 hours).
+
+ comment: 17 pages, 20 figures +
+
+
+
+
+ + ☆ Clustering by Contour coreset and variational quantum eigensolver + + +
+ Recent work has proposed solving the k-means clustering problem on quantum computers via the Quantum Approximate Optimization Algorithm (QAOA) and coreset techniques. Although the current method demonstrates the possibility of quantum k-means clustering, it does not ensure high accuracy and consistency across a wide range of datasets. The existing coreset techniques are designed for classical algorithms, and there has been no quantum-tailored coreset technique designed to boost the accuracy of quantum algorithms. In this work, we propose solving the k-means clustering problem with the variational quantum eigensolver (VQE) and a customised coreset method, the Contour coreset, which has been formulated with a specific focus on quantum algorithms. Extensive simulations with synthetic and real-life data demonstrate that our VQE+Contour Coreset approach outperforms existing QAOA+Coreset k-means clustering approaches with higher accuracy and lower standard deviation. Our work shows that quantum-tailored coreset techniques have the potential to significantly boost the performance of quantum algorithms when compared to using generic off-the-shelf coreset techniques.
+
+ comment: 33 pages +
+
+
+
+
+ + ☆ Kandinsky 3.0 Technical Report + + +
+ We present Kandinsky 3.0, a large-scale text-to-image generation model based on latent diffusion, continuing the series of text-to-image Kandinsky models and reflecting our progress towards higher quality and realism of image generation. Compared to previous versions of Kandinsky 2.x, Kandinsky 3.0 leverages a two times larger U-Net backbone, a ten times larger text encoder and removes diffusion mapping. We describe the architecture of the model, the data collection procedure, the training technique, and the production system of user interaction. We focus on the key components that, as we identified through a large number of experiments, had the most significant impact on improving the quality of our model compared to the others. In our side-by-side comparisons, Kandinsky 3.0 shows better text understanding and performs better on specific domains. Project page: https://ai-forever.github.io/Kandinsky-3
+
+ comment: Project page: https://ai-forever.github.io/Kandinsky-3 +
+
+
+
+
+ + ☆ Towards Sobolev Training + + +
+ The increasing use of stochastic models for describing complex phenomena warrants surrogate models that capture the reference model characteristics at a fraction of the computational cost, foregoing potentially expensive Monte Carlo simulation. The predominant approach of fitting a large neural network and then pruning it to a reduced size has shortcomings that are commonly neglected. The resulting surrogate models often fail to capture the sensitivities and uncertainties inherent in the original model. In particular, (higher-order) derivative information of such surrogates could differ drastically. Given a large enough network, we expect this derivative information to match. However, the pruned model will almost certainly not share this behavior.
+ In this paper, we propose to find surrogate models by using sensitivity information throughout the learning and pruning process. We build on work using Interval Adjoint Significance Analysis for pruning and combine it with recent advancements in Sobolev Training to accurately model the original sensitivity information in the pruned neural network based surrogate model. We experimentally underpin the method on an example of pricing a multidimensional Basket option modelled through a stochastic differential equation with Brownian motion. The proposed method is, however, not limited to the domain of quantitative finance, which was chosen as a case study for intuitive interpretations of the sensitivities. It serves as a foundation for building further surrogate modelling techniques considering sensitivity information.
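For readers unfamiliar with Sobolev Training, the core idea is to supervise the surrogate with derivative (sensitivity) targets in addition to function values. A minimal PyTorch sketch on a 1D toy function follows (our illustration, not the paper's option-pricing setup; the network size and loss weight are placeholders).

```python
import torch

torch.manual_seed(0)

# Toy reference model and its analytic sensitivity (stand-ins for the expensive model).
f  = lambda x: torch.sin(2 * x)
df = lambda x: 2 * torch.cos(2 * x)

net = torch.nn.Sequential(torch.nn.Linear(1, 64), torch.nn.Tanh(), torch.nn.Linear(64, 1))
opt = torch.optim.Adam(net.parameters(), lr=1e-2)

x = torch.linspace(-2, 2, 256).unsqueeze(1)
y, dy = f(x), df(x)

for step in range(1000):
    x_req = x.clone().requires_grad_(True)
    pred = net(x_req)
    # Derivative of the surrogate w.r.t. its input, obtained via autograd.
    dpred = torch.autograd.grad(pred.sum(), x_req, create_graph=True)[0]
    # Sobolev loss: match values *and* first derivatives of the reference model.
    loss = torch.mean((pred - y) ** 2) + 0.5 * torch.mean((dpred - dy) ** 2)
    opt.zero_grad()
    loss.backward()
    opt.step()

print(f"final Sobolev loss: {loss.item():.4f}")
```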
+
+
+
+
+ + ☆ Speculative Exploration on the Concept of Artificial Agents Conducting + Autonomous Research + + +
+ This paper engages in a speculative exploration of the concept of an +artificial agent capable of conducting research. Initially, it examines how the +act of research can be conceptually characterized, aiming to provide a starting +point for discussions about what it means to create such agents. The focus then +shifts to the core components of research: question formulation, hypothesis +generation, and hypothesis verification. This discussion includes a +consideration of the potential and challenges associated with enabling machines +to autonomously perform these tasks. Subsequently, this paper briefly considers +the overlapping themes and interconnections that underlie them. Finally, the +paper presents preliminary thoughts on prototyping as an initial step towards +uncovering the challenges involved in developing these research-capable agents. + +
+
+
+
+
+ + ☆ Learning From Scenarios for Stochastic Repairable Scheduling + + +
+ When optimizing problems with uncertain parameter values in a linear +objective, decision-focused learning enables end-to-end learning of these +values. We are interested in a stochastic scheduling problem, in which +processing times are uncertain, which brings uncertain values in the +constraints, and thus repair of an initial schedule may be needed. Historical +realizations of the stochastic processing times are available. We show how +existing decision-focused learning techniques based on stochastic smoothing can +be adapted to this scheduling problem. We include an extensive experimental +evaluation to investigate in which situations decision-focused learning +outperforms the state of the art for such situations: scenario-based stochastic +optimization. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Schrodinger Bridges Beat Diffusion Models on Text-to-Speech Synthesis + + +
+ In text-to-speech (TTS) synthesis, diffusion models have achieved promising +generation quality. However, because of the pre-defined data-to-noise diffusion +process, their prior distribution is restricted to a noisy representation, +which provides little information of the generation target. In this work, we +present a novel TTS system, Bridge-TTS, making the first attempt to substitute +the noisy Gaussian prior in established diffusion-based TTS methods with a +clean and deterministic one, which provides strong structural information of +the target. Specifically, we leverage the latent representation obtained from +text input as our prior, and build a fully tractable Schrodinger bridge between +it and the ground-truth mel-spectrogram, leading to a data-to-data process. +Moreover, the tractability and flexibility of our formulation allow us to +empirically study the design spaces such as noise schedules, as well as to +develop stochastic and deterministic samplers. Experimental results on the +LJ-Speech dataset illustrate the effectiveness of our method in terms of both +synthesis quality and sampling efficiency, significantly outperforming our +diffusion counterpart Grad-TTS in 50-step/1000-step synthesis and strong fast +TTS models in few-step scenarios. Project page: https://bridge-tts.github.io/ + +
+
+
+
+
+ + ☆ Precision of Individual Shapley Value Explanations + + +
+ Shapley values are extensively used in explainable artificial intelligence (XAI) as a framework to explain predictions made by complex machine learning (ML) models. In this work, we focus on conditional Shapley values for predictive models fitted to tabular data and explain the prediction $f(\boldsymbol{x}^{*})$ for a single observation $\boldsymbol{x}^{*}$ at a time. Numerous Shapley value estimation methods have been proposed and empirically compared on an average basis in the XAI literature. However, less focus has been devoted to analyzing the precision of the Shapley value explanations on an individual basis. We extend our work in Olsen et al. (2023) by demonstrating and discussing that the explanations are systematically less precise for observations on the outer region of the training data distribution for all used estimation methods. This is expected from a statistical point of view, but to the best of our knowledge, it has not been systematically addressed in the Shapley value literature. This is crucial knowledge for Shapley value practitioners, who should exercise greater care when applying Shapley value explanations to such observations.
+
+ comment: Submitted to the 23rd European Young Statisticians Meeting (EYSM) +
+
+
+
+
+ + ☆ Exploring Answer Information Methods for Question Generation with + Transformers + + +
+ There has been substantial work on question generation in which different methods of providing target answers as input have been employed. This experimentation has mostly been carried out for RNN-based models. We use three different methods and their combinations for incorporating answer information and explore their effect on several automatic evaluation metrics. The methods used are answer prompting, a custom product method that combines answer embeddings and encoder outputs, selecting sentences from the input paragraph that contain answer-related information, and a separate cross-attention block in the decoder that attends to the answer. We observe that answer prompting without any of the additional methods obtains the best ROUGE and METEOR scores. Additionally, we use a custom metric to calculate how many of the generated questions have the same answer as the one used to generate them.
+
+
+
+
+ + ☆ From Detection to Action Recognition: An Edge-Based Pipeline for Robot + Human Perception + + +
+ Mobile service robots are proving to be increasingly effective in a range of +applications, such as healthcare, monitoring Activities of Daily Living (ADL), +and facilitating Ambient Assisted Living (AAL). These robots heavily rely on +Human Action Recognition (HAR) to interpret human actions and intentions. +However, for HAR to function effectively on service robots, it requires prior +knowledge of human presence (human detection) and identification of individuals +to monitor (human tracking). In this work, we propose an end-to-end pipeline +that encompasses the entire process, starting from human detection and +tracking, leading to action recognition. The pipeline is designed to operate in +near real-time while ensuring all stages of processing are performed on the +edge, reducing the need for centralised computation. To identify the most +suitable models for our mobile robot, we conducted a series of experiments +comparing state-of-the-art solutions based on both their detection performance +and efficiency. To evaluate the effectiveness of our proposed pipeline, we +proposed a dataset comprising daily household activities. By presenting our +findings and analysing the results, we demonstrate the efficacy of our approach +in enabling mobile robots to understand and respond to human behaviour in +real-world scenarios relying mainly on the data from their RGB cameras. + +
+
+ comment: 7 pages, 10 figures, 2 tables +
+
+
+
+
+ + ☆ Molecule Joint Auto-Encoding: Trajectory Pretraining with 2D and 3D + Diffusion NeurIPS 2023 + + +
+ Recently, artificial intelligence for drug discovery has raised increasing +interest in both machine learning and chemistry domains. The fundamental +building block for drug discovery is molecule geometry and thus, the molecule's +geometrical representation is the main bottleneck to better utilize machine +learning techniques for drug discovery. In this work, we propose a pretraining +method for molecule joint auto-encoding (MoleculeJAE). MoleculeJAE can learn +both the 2D bond (topology) and 3D conformation (geometry) information, and a +diffusion process model is applied to mimic the augmented trajectories of such +two modalities, based on which, MoleculeJAE will learn the inherent chemical +structure in a self-supervised manner. Thus, the pretrained geometrical +representation in MoleculeJAE is expected to benefit downstream +geometry-related tasks. Empirically, MoleculeJAE proves its effectiveness by +reaching state-of-the-art performance on 15 out of 20 tasks by comparing it +with 12 competitive baselines. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ☆ Search Strategies for Self-driving Laboratories with Pending Experiments NeurIPS 2023 + + +
+ Self-driving laboratories (SDLs) consist of multiple stations that perform +material synthesis and characterisation tasks. To minimize station downtime and +maximize experimental throughput, it is practical to run experiments in +asynchronous parallel, in which multiple experiments are being performed at +once in different stages. Asynchronous parallelization of experiments, however, +introduces delayed feedback (i.e. "pending experiments"), which is known to +reduce Bayesian optimiser performance. Here, we build a simulator for a +multi-stage SDL and compare optimisation strategies for dealing with delayed +feedback and asynchronous parallelized operation. Using data from a real SDL, +we build a ground truth Bayesian optimisation simulator from 177 previously run +experiments for maximizing the conductivity of functional coatings. We then +compare search strategies such as expected improvement, noisy expected +improvement, 4-mode exploration and random sampling. We evaluate their +performance in terms of amount of delay and problem dimensionality. Our +simulation results showcase the trade-off between the asynchronous parallel +operation and delayed feedback. + +
+
+ comment: Accepted at NeurIPS 2023, AI4Mat +
+
+
+
+
+ + ☆ Subnetwork-to-go: Elastic Neural Network with Dynamic Training and + Customizable Inference + + +
+ Deploying neural networks to different devices or platforms is in general +challenging, especially when the model size is large or model complexity is +high. Although there exist ways for model pruning or distillation, it is +typically required to perform a full round of model training or finetuning +procedure in order to obtain a smaller model that satisfies the model size or +complexity constraints. Motivated by recent works on dynamic neural networks, +we propose a simple way to train a large network and flexibly extract a +subnetwork from it given a model size or complexity constraint during +inference. We introduce a new way to allow a large model to be trained with +dynamic depth and width during the training phase, and after the large model is +trained we can select a subnetwork from it with arbitrary depth and width +during the inference phase with a relatively better performance compared to +training the subnetwork independently from scratch. Experiment results on a +music source separation model show that our proposed method can effectively +improve the separation performance across different subnetwork sizes and +complexities with a single large model, and training the large model takes +significantly shorter time than training all the different subnetworks. + +
+
+ comment: 5 pages, 3 figures +
+
+
+
+
+ + ☆ Data is Overrated: Perceptual Metrics Can Lead Learning in the Absence + of Training Data NeurIPS 2023 + + +
+ Perceptual metrics are traditionally used to evaluate the quality of natural +signals, such as images and audio. They are designed to mimic the perceptual +behaviour of human observers and usually reflect structures found in natural +signals. This motivates their use as loss functions for training generative +models such that models will learn to capture the structure held in the metric. +We take this idea to the extreme in the audio domain by training a compressive +autoencoder to reconstruct uniform noise, in lieu of natural data. We show that +training with perceptual losses improves the reconstruction of spectrograms and +re-synthesized audio at test time over models trained with a standard Euclidean +loss. This demonstrates better generalisation to unseen natural signals when +using perceptual metrics. + +
+
+ comment: Machine Learning for Audio Workshop, NeurIPS 2023 +
+
+
+
+
+ + ☆ Run LoRA Run: Faster and Lighter LoRA Implementations + + +
+ LoRA is a technique that reduces the number of trainable parameters in a neural network by introducing low-rank adapters to linear layers. This technique is used both for fine-tuning (LoRA, QLoRA) and full training (ReLoRA). This paper presents the RunLoRA framework for efficient implementations of LoRA that significantly improves the speed of neural network training and fine-tuning using low-rank adapters. The proposed implementation optimizes the computation of LoRA operations based on the dimensions of the corresponding linear layer, the layer input dimensions, and the LoRA rank, choosing the best forward and backward computation graph based on FLOP and time estimates, resulting in faster training without sacrificing accuracy. The experimental results show up to a 17% speedup on the Llama family of models.
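The flavour of this choice can be illustrated with a back-of-the-envelope FLOP count for one LoRA forward pass (our sketch of the selection mechanism only, not the RunLoRA code; the actual framework compares many more forward and backward graph variants, where the trade-offs are less one-sided than in this forward-only example).

```python
def matmul_flops(m, k, n):
    """Approximate FLOPs of an (m x k) @ (k x n) product."""
    return 2 * m * k * n

def lora_forward_flops(n_tokens, d_in, d_out, rank):
    # Variant 1: y += (x @ A) @ B  -- keep the low-rank factors separate.
    factored = matmul_flops(n_tokens, d_in, rank) + matmul_flops(n_tokens, rank, d_out)
    # Variant 2: y += x @ (A @ B)  -- materialise the full d_in x d_out update first.
    materialised = matmul_flops(d_in, rank, d_out) + matmul_flops(n_tokens, d_in, d_out)
    return {"(xA)B": factored, "x(AB)": materialised}

for n_tokens in (16, 4096):
    costs = lora_forward_flops(n_tokens, d_in=4096, d_out=4096, rank=16)
    best = min(costs, key=costs.get)
    print(f"n_tokens={n_tokens}: "
          + ", ".join(f"{k}={v:.2e}" for k, v in costs.items())
          + f" -> choose {best}")
```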
+
+
+
+
+ + ☆ Compressed Context Memory For Online Language Model Interaction + + +
+ This paper presents a novel context compression method for Transformer +language models in online scenarios such as ChatGPT, where the context +continually expands. As the context lengthens, the attention process requires +more memory and computational resources, which in turn reduces the throughput +of the language model. To this end, we propose a compressed context memory +system that continually compresses the growing context into a compact memory +space. The compression process simply involves integrating a lightweight +conditional LoRA into the language model's forward pass during inference. Based +on the compressed context memory, the language model can perform inference with +reduced memory and attention operations. Through evaluations on conversation, +personalization, and multi-task learning, we demonstrate that our approach +achieves the performance level of a full context model with $5\times$ smaller +context memory space. Codes are available at +https://github.com/snu-mllab/context-memory. + +
+
+
+
+
+ + ☆ Approximating Solutions to the Knapsack Problem using the Lagrangian + Dual Framework + + +
+ The Knapsack Problem is a classic problem in combinatorial optimisation. +Solving these problems may be computationally expensive. Recent years have seen +a growing interest in the use of deep learning methods to approximate the +solutions to such problems. A core problem is how to enforce or encourage +constraint satisfaction in predicted solutions. A promising approach for +predicting solutions to constrained optimisation problems is the Lagrangian +Dual Framework which builds on the method of Lagrangian Relaxation. In this +paper we develop neural network models to approximate Knapsack Problem +solutions using the Lagrangian Dual Framework while improving constraint +satisfaction. We explore the problems of output interpretation and model +selection within this context. Experimental results show strong constraint +satisfaction with a minor reduction of optimality as compared to a baseline +neural network which does not explicitly model the constraints. + +
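One way to picture the Lagrangian relaxation at the heart of the Lagrangian Dual Framework is the toy sketch below (our simplification: it directly optimises a relaxed solution for one instance rather than training a network that maps instances to solutions as in the paper; the item values, weights, and step sizes are invented).

```python
import torch

torch.manual_seed(0)
values   = torch.tensor([10., 7., 4., 3.])
weights  = torch.tensor([6., 5., 3., 1.])
capacity = 7.0

logits = torch.zeros(4, requires_grad=True)     # relaxed item-selection variables
lam    = torch.tensor(0.0)                      # Lagrange multiplier for the capacity constraint
opt    = torch.optim.Adam([logits], lr=0.1)

for step in range(1000):
    x = torch.sigmoid(logits)                   # x_i in (0, 1)
    violation = (weights * x).sum() - capacity
    # Lagrangian objective: maximise value, penalise capacity violation.
    loss = -(values * x).sum() + lam * violation
    opt.zero_grad()
    loss.backward()
    opt.step()
    # Dual (subgradient) update: raise lambda while the constraint is violated.
    with torch.no_grad():
        lam = torch.clamp(lam + 0.01 * violation.detach(), min=0.0)

x = (torch.sigmoid(logits) > 0.5).float()
print("selected items:", x.tolist(),
      "value:", (values * x).sum().item(),
      "weight:", (weights * x).sum().item(),
      "lambda:", round(lam.item(), 3))
```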
+
+
+
+
+ + ☆ SVQ: Sparse Vector Quantization for Spatiotemporal Forecasting + + +
+ Spatiotemporal forecasting tasks, such as weather forecasting and traffic +prediction, offer significant societal benefits. These tasks can be effectively +approached as image forecasting problems using computer vision models. Vector +quantization (VQ) is a well-known method for discrete representation that +improves the latent space, leading to enhanced generalization and transfer +learning capabilities. One of the main challenges in using VQ for +spatiotemporal forecasting is how to balance between keeping enough details and +removing noises from the original patterns for better generalization. We +address this challenge by developing sparse vector quantization, or {\bf SVQ} +for short, that leverages sparse regression to make better trade-off between +the two objectives. The main innovation of this work is to approximate sparse +regression by a two-layer MLP and a randomly fixed or learnable matrix, +dramatically improving its computational efficiency. Through experiments +conducted on diverse datasets in multiple fields including weather forecasting, +traffic flow prediction, and video forecasting, we unequivocally demonstrate +that our proposed method consistently enhances the performance of base models +and achieves state-of-the-art results across all benchmarks. + +
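A minimal PyTorch sketch of the quantisation step as we read it from the abstract (our interpretation; the softmax is a simple stand-in for the sparsity-inducing regression, and the codebook size, hidden width, and normalisation choices are placeholders): a small two-layer MLP maps each latent vector to mixing weights over a large, randomly fixed codebook, and the quantised output is the weighted combination of codes.

```python
import torch
import torch.nn as nn

class SparseVectorQuantizer(nn.Module):
    """Approximate regression onto a frozen random codebook with a two-layer MLP."""

    def __init__(self, dim, num_codes=1024, hidden=256):
        super().__init__()
        # Randomly initialised, fixed codebook (could also be made learnable).
        self.register_buffer("codebook", torch.randn(num_codes, dim))
        self.mlp = nn.Sequential(
            nn.Linear(dim, hidden), nn.GELU(), nn.Linear(hidden, num_codes)
        )

    def forward(self, z):
        # Mixing weights over codes; softmax keeps them non-negative and normalised.
        w = torch.softmax(self.mlp(z), dim=-1)           # (..., num_codes)
        return w @ self.codebook                         # (..., dim)

svq = SparseVectorQuantizer(dim=64)
z = torch.randn(8, 16, 64)                               # e.g. a batch of latent tokens
print(svq(z).shape)                                      # torch.Size([8, 16, 64])
```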
+
+
+
+
+ + ☆ An AI for Scientific Discovery Route between Amorphous Networks and + Mechanical Behavior + + +
+ "AI for science" is widely recognized as a future trend in the development of +scientific research. Currently, although machine learning algorithms have +played a crucial role in scientific research with numerous successful cases, +relatively few instances exist where AI assists researchers in uncovering the +underlying physical mechanisms behind a certain phenomenon and subsequently +using that mechanism to improve machine learning algorithms' efficiency. This +article uses the investigation into the relationship between extreme Poisson's +ratio values and the structure of amorphous networks as a case study to +illustrate how machine learning methods can assist in revealing underlying +physical mechanisms. Upon recognizing that the Poisson's ratio relies on the +low-frequency vibrational modes of dynamical matrix, we can then employ a +convolutional neural network, trained on the dynamical matrix instead of +traditional image recognition, to predict the Poisson's ratio of amorphous +networks with a much higher efficiency. Through this example, we aim to +showcase the role that artificial intelligence can play in revealing +fundamental physical mechanisms, which subsequently improves the machine +learning algorithms significantly. + +
+
+ comment: 15 pages, 5 figures +
+
+
+
+
+ + ☆ Generalized Contrastive Divergence: Joint Training of Energy-Based Model + and Diffusion Model through Inverse Reinforcement Learning NeurIPS 2023 + + +
+ We present Generalized Contrastive Divergence (GCD), a novel objective +function for training an energy-based model (EBM) and a sampler simultaneously. +GCD generalizes Contrastive Divergence (Hinton, 2002), a celebrated algorithm +for training EBM, by replacing Markov Chain Monte Carlo (MCMC) distribution +with a trainable sampler, such as a diffusion model. In GCD, the joint training +of EBM and a diffusion model is formulated as a minimax problem, which reaches +an equilibrium when both models converge to the data distribution. The minimax +learning with GCD bears interesting equivalence to inverse reinforcement +learning, where the energy corresponds to a negative reward, the diffusion +model is a policy, and the real data is expert demonstrations. We present +preliminary yet promising results showing that joint training is beneficial for +both EBM and a diffusion model. GCD enables EBM training without MCMC while +improving the sample quality of a diffusion model. + +
+
+ comment: NeurIPS 2023 Workshop on Diffusion Models +
+
+
+
+
+ + ☆ Diffused Task-Agnostic Milestone Planner + + +
+ Addressing decision-making problems using sequence modeling to predict future trajectories has shown promising results in recent years. In this paper, we take a step further to leverage the sequence predictive method in wider areas such as long-term planning, vision-based control, and multi-task decision-making. To this end, we propose a method to utilize a diffusion-based generative sequence model to plan a series of milestones in a latent space and to have an agent follow the milestones to accomplish a given task. The proposed method can learn control-relevant, low-dimensional latent representations of milestones, which makes it possible to efficiently perform long-term planning and vision-based control. Furthermore, our approach exploits the generation flexibility of the diffusion model, which makes it possible to plan diverse trajectories for multi-task decision-making. We demonstrate the proposed method across offline reinforcement learning (RL) benchmarks and a visual manipulation environment. The results show that our approach outperforms offline RL methods in solving long-horizon, sparse-reward tasks and multi-task problems, while also achieving state-of-the-art performance on the most challenging vision-based manipulation benchmark.
+
+ comment: 37th Conference on Neural Information Processing Systems +
+
+
+
+
+ + ☆ An Infinite-Width Analysis on the Jacobian-Regularised Training of a + Neural Network + + +
+ The recent theoretical analysis of deep neural networks in their +infinite-width limits has deepened our understanding of initialisation, feature +learning, and training of those networks, and brought new practical techniques +for finding appropriate hyperparameters, learning network weights, and +performing inference. In this paper, we broaden this line of research by +showing that this infinite-width analysis can be extended to the Jacobian of a +deep neural network. We show that a multilayer perceptron (MLP) and its +Jacobian at initialisation jointly converge to a Gaussian process (GP) as the +widths of the MLP's hidden layers go to infinity and characterise this GP. We +also prove that in the infinite-width limit, the evolution of the MLP under the +so-called robust training (i.e., training with a regulariser on the Jacobian) +is described by a linear first-order ordinary differential equation that is +determined by a variant of the Neural Tangent Kernel. We experimentally show +the relevance of our theoretical claims to wide finite networks, and +empirically analyse the properties of kernel regression solution to obtain an +insight into Jacobian regularisation. + +
+
+ comment: 72 pages, 21 figures +
+
+
+
+
+ + ☆ Teaching Specific Scientific Knowledge into Large Language Models + through Additional Training + + +
+ Through additional training, we explore embedding specialized scientific +knowledge into the Llama 2 Large Language Model (LLM). Key findings reveal that +effective knowledge integration requires reading texts from multiple +perspectives, especially in instructional formats. We utilize text augmentation +to tackle the scarcity of specialized texts, including style conversions and +translations. Hyperparameter optimization proves crucial, with different size +models (7b, 13b, and 70b) reasonably undergoing additional training. Validating +our methods, we construct a dataset of 65,000 scientific papers. Although we +have succeeded in partially embedding knowledge, the study highlights the +complexities and limitations of incorporating specialized information into +LLMs, suggesting areas for further improvement. + +
+
+
+
+
+ + ☆ On the variants of SVM methods applied to GPR data to classify tack coat + characteristics in French pavements: two experimental case studies + + +
+ Among the commonly used non-destructive techniques, Ground Penetrating Radar (GPR) is one of the most widely adopted today for assessing pavement conditions in France. However, conventional radar systems and their forward processing methods have shown their limitations for the physical and geometrical characterization of very thin layers such as tack coats. Nevertheless, the use of Machine Learning methods applied to GPR with an inverse approach showed that it was numerically possible to identify the tack coat characteristics despite masking effects due to the low time-frequency resolution noted in the raw B-scans. Thus, we propose in this paper to apply the inverse approach based on Machine Learning, already validated in previous works on numerical data, to two experimental cases with different pavement structures. The first case corresponds to a validation on known pavement structures at Gustave Eiffel University (Nantes, France) with its pavement fatigue carousel, and the second case focuses on a new real road in the Vendée department (France). In both case studies, the performance of SVM/SVR methods demonstrated the efficiency of supervised learning for classifying and estimating the emulsion proportioning in the tack coats.
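For readers unfamiliar with the supervised-learning side of this inverse approach, it looks roughly like the scikit-learn sketch below (synthetic stand-in features and targets; the real pipeline extracts features from GPR B-scans and the dosage values and class labels here are invented): an SVM classifies the tack coat condition and an SVR regresses the emulsion proportioning.

```python
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, SVR

rng = np.random.default_rng(0)

# Synthetic stand-ins: each row is a feature vector extracted from a GPR scan;
# the regression target is the emulsion proportioning (e.g. in kg/m^2).
X = rng.normal(size=(300, 10))
dosage = 0.3 + 0.2 * X[:, 0] - 0.1 * X[:, 1] + 0.02 * rng.normal(size=300)
coat_class = (dosage > 0.3).astype(int)          # e.g. "low" vs "high" dosage class

X_tr, X_te, y_tr, y_te, c_tr, c_te = train_test_split(
    X, dosage, coat_class, test_size=0.25, random_state=0)

clf = make_pipeline(StandardScaler(), SVC(kernel="rbf", C=10.0))
reg = make_pipeline(StandardScaler(), SVR(kernel="rbf", C=10.0, epsilon=0.01))

clf.fit(X_tr, c_tr)
reg.fit(X_tr, y_tr)

print("classification accuracy:", clf.score(X_te, c_te))
print("regression R^2:", reg.score(X_te, y_te))
```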
+
+
+
+
+ + ☆ Interpretable Mechanistic Representations for Meal-level Glycemic + Control in the Wild ML4H + + +
+ Diabetes encompasses a complex landscape of glycemic control that varies +widely among individuals. However, current methods do not faithfully capture +this variability at the meal level. On the one hand, expert-crafted features +lack the flexibility of data-driven methods; on the other hand, learned +representations tend to be uninterpretable which hampers clinical adoption. In +this paper, we propose a hybrid variational autoencoder to learn interpretable +representations of CGM and meal data. Our method grounds the latent space to +the inputs of a mechanistic differential equation, producing embeddings that +reflect physiological quantities, such as insulin sensitivity, glucose +effectiveness, and basal glucose levels. Moreover, we introduce a novel method +to infer the glucose appearance rate, making the mechanistic model robust to +unreliable meal logs. On a dataset of CGM and self-reported meals from +individuals with type-2 diabetes and pre-diabetes, our unsupervised +representation discovers a separation between individuals proportional to their +disease severity. Our embeddings produce clusters that are up to 4x better than +naive, expert, black-box, and pure mechanistic features. Our method provides a +nuanced, yet interpretable, embedding space to compare glycemic control within +and across individuals, directly learnable from in-the-wild data. + +
+
+ comment: Proceedings of Machine Learning for Health (ML4H) 2023. Code + available at: https://github.com/KeAWang/interpretable-cgm-representations +
+
+
+
+
+ + ☆ Measuring Misogyny in Natural Language Generation: Preliminary Results + from a Case Study on two Reddit Communities EMNLP 2023 + + +
+ Generic `toxicity' classifiers continue to be used for evaluating the +potential for harm in natural language generation, despite mounting evidence of +their shortcomings. We consider the challenge of measuring misogyny in natural +language generation, and argue that generic `toxicity' classifiers are +inadequate for this task. We use data from two well-characterised `Incel' +communities on Reddit that differ primarily in their degrees of misogyny to +construct a pair of training corpora which we use to fine-tune two language +models. We show that an open source `toxicity' classifier is unable to +distinguish meaningfully between generations from these models. We contrast +this with a misogyny-specific lexicon recently proposed by feminist +subject-matter experts, demonstrating that, despite the limitations of simple +lexicon-based approaches, this shows promise as a benchmark to evaluate +language models for misogyny, and that it is sensitive enough to reveal the +known differences in these Reddit communities. Our preliminary findings +highlight the limitations of a generic approach to evaluating harms, and +further emphasise the need for careful benchmark design and selection in +natural language evaluation. + +
+
+ comment: This extended abstract was presented at the Generation, Evaluation + and Metrics workshop at Empirical Methods in Natural Language Processing in + 2023 (GEM@EMNLP 2023) in Singapore +
+
+
+
+
+ + ☆ Deep Learning for Koopman-based Dynamic Movement Primitives + + +
+ The challenge of teaching robots to perform dexterous manipulation, dynamic locomotion, or whole-body manipulation from a small number of demonstrations is an important research field that has attracted interest from across the robotics community. In this work, we propose a novel approach that joins the theories of Koopman Operators and Dynamic Movement Primitives for Learning from Demonstration. Our approach, named ADMD, projects nonlinear dynamical systems into linear latent spaces such that a solution reproduces the desired complex motion. The use of an autoencoder in our approach enables generalizability and scalability, while the constraint to a linear system provides interpretability. Our results are comparable to Extended Dynamic Mode Decomposition on the LASA Handwriting dataset, despite training on only a small fraction of the letters.
+
+
+
+
+ + ☆ GCFA:Geodesic Curve Feature Augmentation via Shape Space Theory + + +
+ Deep learning has yielded remarkable outcomes in various domains. However, the challenge of requiring large-scale labeled samples still persists in deep learning. Thus, data augmentation has been introduced as a critical strategy to train deep learning models. However, data augmentation suffers from information loss and poor performance in small-sample environments. To overcome these drawbacks, we propose a feature augmentation method based on shape space theory, i.e., Geodesic curve feature augmentation, called GCFA for brevity. First, we extract features from the images with a neural network model. Then, the image features are projected into a pre-shape space. In the pre-shape space, a Geodesic curve is built to fit the features. Finally, the features generated along the Geodesic curve are used to train various machine learning models. The GCFA module can be seamlessly integrated with most machine learning methods, and the proposed method is simple, effective and insensitive to small sample sizes. Several examples demonstrate that the GCFA method can greatly improve the performance of the data preprocessing model in a small-sample environment.
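A compact numpy sketch of the geometric core as we read it from the abstract (the paper's projection and curve-fitting details may differ; the feature dimension and sample count are placeholders): map feature vectors to the pre-shape space by centring and unit-normalising them, then sample new features along the geodesic (great-circle) arc between two pre-shapes.

```python
import numpy as np

def to_preshape(f):
    """Project a feature vector onto the pre-shape sphere: centre, then unit-normalise."""
    f = f - f.mean()
    return f / np.linalg.norm(f)

def geodesic_points(p, q, num=5):
    """Points along the great-circle (geodesic) arc between two pre-shapes p and q."""
    theta = np.arccos(np.clip(p @ q, -1.0, 1.0))
    ts = np.linspace(0.0, 1.0, num)
    return np.stack([
        (np.sin((1 - t) * theta) * p + np.sin(t * theta) * q) / np.sin(theta)
        for t in ts
    ])

rng = np.random.default_rng(0)
f1, f2 = rng.normal(size=128), rng.normal(size=128)   # two extracted image features
augmented = geodesic_points(to_preshape(f1), to_preshape(f2), num=7)
print(augmented.shape, np.linalg.norm(augmented, axis=1))  # points stay on the sphere
```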
+
+
+
+
+ + ☆ Complementary Benefits of Contrastive Learning and Self-Training Under + Distribution Shift NeurIPS 2023 + + +
+ Self-training and contrastive learning have emerged as leading techniques for +incorporating unlabeled data, both under distribution shift (unsupervised +domain adaptation) and when it is absent (semi-supervised learning). However, +despite the popularity and compatibility of these techniques, their efficacy in +combination remains unexplored. In this paper, we undertake a systematic +empirical investigation of this combination, finding that (i) in domain +adaptation settings, self-training and contrastive learning offer significant +complementary gains; and (ii) in semi-supervised learning settings, +surprisingly, the benefits are not synergistic. Across eight distribution shift +datasets (e.g., BREEDs, WILDS), we demonstrate that the combined method obtains +3--8% higher accuracy than either approach independently. We then theoretically +analyze these techniques in a simplified model of distribution shift, +demonstrating scenarios under which the features produced by contrastive +learning can yield a good initialization for self-training to further amplify +gains and achieve optimal performance, even when either method alone would +fail. + +
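Operationally, the combination can be pictured with the sketch below (our simplification: the contrastive encoder is replaced by an identity stand-in and the data are synthetic; the paper's experiments use deep encoders and benchmark distribution shifts): a classifier fitted on labelled source features is iteratively retrained on its own confident pseudo-labels for shifted target data.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)

def encode(x):
    """Stand-in for a frozen contrastive encoder (identity here for brevity)."""
    return x

# Labelled source data and unlabelled, shifted target data (synthetic stand-ins).
x_src = rng.normal(size=(500, 2)) + np.where(rng.random(500) < 0.5, -2, 2)[:, None]
y_src = (x_src[:, 0] > 0).astype(int)
x_tgt = x_src + np.array([1.0, -0.5])            # covariate shift
y_tgt = y_src.copy()                             # same underlying labels

clf = LogisticRegression().fit(encode(x_src), y_src)

for round_ in range(3):                          # self-training rounds
    proba = clf.predict_proba(encode(x_tgt))
    conf = proba.max(axis=1)
    mask = conf > 0.9                            # keep only confident pseudo-labels
    pseudo = proba.argmax(axis=1)
    X_aug = np.vstack([encode(x_src), encode(x_tgt)[mask]])
    y_aug = np.concatenate([y_src, pseudo[mask]])
    clf = LogisticRegression().fit(X_aug, y_aug)
    acc = (clf.predict(encode(x_tgt)) == y_tgt).mean()
    print(f"round {round_}: {mask.sum()} pseudo-labels, target accuracy {acc:.3f}")
```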
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ☆ On the Nystrom Approximation for Preconditioning in Kernel Machines + + +
+ Kernel methods are a popular class of nonlinear predictive models in machine +learning. Scalable algorithms for learning kernel models need to be iterative +in nature, but convergence can be slow due to poor conditioning. Spectral +preconditioning is an important tool to speed-up the convergence of such +iterative algorithms for training kernel models. However computing and storing +a spectral preconditioner can be expensive which can lead to large +computational and storage overheads, precluding the application of kernel +methods to problems with large datasets. A Nystrom approximation of the +spectral preconditioner is often cheaper to compute and store, and has +demonstrated success in practical applications. In this paper we analyze the +trade-offs of using such an approximated preconditioner. Specifically, we show +that a sample of logarithmic size (as a function of the size of the dataset) +enables the Nystrom-based approximated preconditioner to accelerate gradient +descent nearly as well as the exact preconditioner, while also reducing the +computational and storage overheads. + +
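To fix notation, the Nystrom approximation analysed here picks m landmark points, forms the corresponding kernel columns C and landmark block W, and approximates K ≈ C W^† C^T. A small numpy sketch of that approximation follows (illustrative only; the paper's preconditioned solver built on top of it is not shown, and the kernel, data, and landmark count are placeholders).

```python
import numpy as np

rng = np.random.default_rng(0)

def rbf_kernel(A, B, gamma=0.5):
    sq = ((A[:, None, :] - B[None, :, :]) ** 2).sum(-1)
    return np.exp(-gamma * sq)

X = rng.normal(size=(1000, 5))
K = rbf_kernel(X, X)                              # exact n x n kernel matrix

# Nystrom approximation from m randomly chosen landmark points.
m = 60
idx = rng.choice(len(X), size=m, replace=False)
C = rbf_kernel(X, X[idx])                         # n x m kernel columns
W = C[idx]                                        # m x m landmark block
K_nys = C @ np.linalg.pinv(W) @ C.T               # rank-m approximation of K

rel_err = np.linalg.norm(K - K_nys) / np.linalg.norm(K)
print(f"relative Frobenius error with {m} landmarks: {rel_err:.3f}")
```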
+
+
+
+
+ + ☆ Benchmarking Continual Learning from Cognitive Perspectives + + +
+ Continual learning addresses the problem of continuously acquiring and
+transferring knowledge without catastrophic forgetting of old concepts. While
+humans achieve continual learning via diverse neurocognitive mechanisms, there
+is a mismatch between cognitive properties and evaluation methods of continual
+learning models. First, the measurement of continual learning models mostly
+relies on evaluation metrics at a micro-level, which cannot characterize
+cognitive capacities of the model. Second, the measurement is method-specific,
+emphasizing model strengths in one aspect while obscuring potential weaknesses
+in other respects. To address these issues, we propose to integrate model
+cognitive capacities and evaluation metrics into a unified evaluation paradigm.
+We first characterize model capacities via desiderata derived from cognitive
+properties supporting human continual learning. The desiderata concern (1)
+adaptability to task sequences of varying lengths; (2) sensitivity to dynamic
+task variations; and (3) efficiency in memory usage and training time
+consumption. Then we design evaluation protocols for each desideratum to assess
+cognitive capacities of recent continual learning models. Experimental results
+show that no method we consider satisfies all the desiderata, and all remain
+far from realizing truly continual learning. Although some methods exhibit
+some degree of adaptability and efficiency, no method is able to identify task
+relationships when encountering dynamic task variations, or achieve a trade-off
+in learning similarities and differences between tasks. Inspired by these
+results, we discuss possible factors that influence model performance in these
+desiderata and provide guidance for the improvement of continual learning
+models.
+
+
+ comment: 12 pages, 11 figures +
+
+
+
+
+ + ☆ Balanced Marginal and Joint Distributional Learning via Mixture + Cramer-Wold Distance + + +
+ In the process of training a generative model, it becomes essential to +measure the discrepancy between two high-dimensional probability distributions: +the generative distribution and the ground-truth distribution of the observed +dataset. Recently, there has been growing interest in an approach that involves +slicing high-dimensional distributions, with the Cramer-Wold distance emerging +as a promising method. However, we have identified that the Cramer-Wold +distance primarily focuses on joint distributional learning, whereas +understanding marginal distributional patterns is crucial for effective +synthetic data generation. In this paper, we introduce a novel measure of +dissimilarity, the mixture Cramer-Wold distance. This measure enables us to +capture both marginal and joint distributional information simultaneously, as +it incorporates a mixture measure with point masses on standard basis vectors. +Building upon the mixture Cramer-Wold distance, we propose a new generative +model called CWDAE (Cramer-Wold Distributional AutoEncoder), which shows +remarkable performance in generating synthetic data when applied to real +tabular datasets. Furthermore, our model offers the flexibility to adjust the +level of data privacy with ease. + +
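As a rough, self-contained sketch of the slicing idea only (not the paper's exact Cramer-Wold kernel formulation), the snippet below mixes random projection directions, which probe the joint distribution, with point masses on the standard basis vectors, which probe the marginals; a simple sorted-sample 1D discrepancy stands in for the smoothed 1D distance, and the mixture weight is a placeholder.

```python
import torch
import torch.nn.functional as F

def sliced_1d(x, y):
    """Crude 1D discrepancy between two equal-size samples: mean squared gap of sorted values."""
    return ((torch.sort(x).values - torch.sort(y).values) ** 2).mean()

def mixture_sliced_distance(X, Y, n_random=64, basis_weight=0.5):
    """Random directions capture joint structure; standard basis directions capture marginals."""
    d = X.shape[1]
    dirs = F.normalize(torch.randn(n_random, d), dim=1)
    joint = torch.stack([sliced_1d(X @ u, Y @ u) for u in dirs]).mean()
    marginal = torch.stack([sliced_1d(X[:, j], Y[:, j]) for j in range(d)]).mean()
    return (1 - basis_weight) * joint + basis_weight * marginal

# Hypothetical generative-training step: push synthetic samples toward the observed data.
real = torch.randn(256, 8)
fake = torch.randn(256, 8, requires_grad=True)
loss = mixture_sliced_distance(real, fake)
loss.backward()
```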
+
+
+
+
+ + ☆ Dyport: Dynamic Importance-based Hypothesis Generation Benchmarking + Technique + + +
+ This paper presents a novel benchmarking framework, Dyport, for evaluating
+biomedical hypothesis generation systems. Utilizing curated datasets, our
+approach tests these systems under realistic conditions, enhancing the
+relevance of our evaluations. We integrate knowledge from the curated databases
+into a dynamic graph, accompanied by a method to quantify discovery importance.
+This assesses not only the accuracy of hypotheses but also their potential
+impact in biomedical research, which significantly extends traditional link
+prediction benchmarks. Applicability of our benchmarking process is
+demonstrated on several link prediction systems applied to biomedical semantic
+knowledge graphs. Being flexible, our benchmarking system is designed for broad
+application in hypothesis generation quality verification, aiming to expand the
+scope of scientific discovery within the biomedical research community.
+Availability and implementation: the Dyport framework is fully open-source. All
+code and datasets are available at: https://github.com/IlyaTyagin/Dyport
+
+
+
+
+
+ + ☆ Cooperative Probabilistic Trajectory Forecasting under Occlusion + + +
+ Perception and planning under occlusion are essential for safety-critical
+tasks. Occlusion-aware planning often requires communicating information about
+the occluded object to the ego agent for safe navigation. However,
+communicating rich sensor information under adverse conditions, such as
+communication loss and limited bandwidth, may not always be feasible. Further,
+in GPS-denied environments and indoor navigation, localizing and sharing
+occluded objects can be challenging. To overcome this, relative pose estimation
+between connected agents sharing a common field of view can be a
+computationally effective way of communicating information about surrounding
+objects. In this paper, we design an end-to-end network that cooperatively
+estimates the current state of an occluded pedestrian in the reference frame of
+the ego agent and then predicts the trajectory with safety guarantees.
+Experimentally, we show that the uncertainty-aware trajectory prediction of
+the occluded pedestrian by the ego agent closely matches the ground-truth
+trajectory obtained assuming no occlusion. The current research holds promise for
+uncertainty-aware navigation among multiple connected agents under occlusion.
+
+
+ comment: 10 pages, 13 figures, 1 table +
+
+
+
+
+ + ☆ Enhancing Molecular Property Prediction via Mixture of Collaborative + Experts + + +
+ The Molecular Property Prediction (MPP) task involves predicting biochemical
+properties based on molecular features, such as molecular graph structures,
+contributing to the discovery of lead compounds in drug development. To address
+data scarcity and imbalance in MPP, some studies have adopted Graph Neural
+Networks (GNN) as an encoder to extract commonalities from molecular graphs.
+However, these approaches often use a separate predictor for each task,
+neglecting the shared characteristics among predictors corresponding to
+different tasks. In response to this limitation, we introduce the GNN-MoCE
+architecture. It employs the Mixture of Collaborative Experts (MoCE) as
+predictors, exploiting task commonalities while confronting the homogeneity
+issue in the expert pool and the decision dominance dilemma within the expert
+group. To enhance expert diversity for collaboration among all experts, the
+Expert-Specific Projection method is proposed to assign a unique projection
+perspective to each expert. To balance decision-making influence for
+collaboration within the expert group, the Expert-Specific Loss is presented to
+integrate individual expert loss into the weighted decision loss of the group
+for more equitable training. Benefiting from the enhancements of MoCE in expert
+creation, dynamic expert group formation, and expert collaboration, our model
+demonstrates superior performance over traditional methods on 24 MPP datasets,
+especially in tasks with limited data or high imbalance.
+
+
+ comment: 11 pages, 8 figures +
+
+
+
+
+ + ☆ OMNIINPUT: A Model-centric Evaluation Framework through Output + Distribution + + +
+ We propose a novel model-centric evaluation framework, OmniInput, to evaluate +the quality of an AI/ML model's predictions on all possible inputs (including +human-unrecognizable ones), which is crucial for AI safety and reliability. +Unlike traditional data-centric evaluation based on pre-defined test sets, the +test set in OmniInput is self-constructed by the model itself and the model +quality is evaluated by investigating its output distribution. We employ an +efficient sampler to obtain representative inputs and the output distribution +of the trained model, which, after selective annotation, can be used to +estimate the model's precision and recall at different output values and a +comprehensive precision-recall curve. Our experiments demonstrate that +OmniInput enables a more fine-grained comparison between models, especially +when their performance is almost the same on pre-defined datasets, leading to +new findings and insights for how to train more robust, generalizable models. + +
+
+
+
+
+ + ☆ STEP CATFormer: Spatial-Temporal Effective Body-Part Cross Attention + Transformer for Skeleton-based Action Recognition BMVC 2023 + + +
+ Graph convolutional networks (GCNs) have been widely used and achieved
+remarkable results in skeleton-based action recognition. We view the key to
+skeleton-based action recognition as the skeleton evolving across frames, so we
+focus on how graph convolutional networks learn different topologies
+and effectively aggregate joint features at global and local
+temporal scales. In this work, we propose three Channel-wise Topology Graph
+Convolution modules based on Channel-wise Topology Refinement Graph Convolution
+(CTR-GCN). Combining CTR-GCN with two joint cross-attention modules captures
+upper-lower body-part and hand-foot relationship skeleton features. After
+that, to capture how human skeletons change across frames, we design
+Temporal Attention Transformers to extract skeleton features effectively. The
+Temporal Attention Transformers learn the temporal features of human skeleton
+sequences. Finally, we fuse the temporal feature outputs with an MLP for
+classification. We develop a powerful graph convolutional network named Spatial
+Temporal Effective Body-part Cross Attention Transformer, which achieves
+notably high performance on the NTU RGB+D and NTU RGB+D 120 datasets. Our code
+and models are available at https://github.com/maclong01/STEP-CATFormer
+
+
+ comment: Accepted to BMVC 2023: Computer Vision for Games and Games for + Computer Vision (CVG). 9 pages +
+
+
+
+
+ + ☆ Anomaly Detection for Scalable Task Grouping in Reinforcement + Learning-based RAN Optimization + + +
+ The use of learning-based methods for optimizing cellular radio access +networks (RAN) has received increasing attention in recent years. This +coincides with a rapid increase in the number of cell sites worldwide, driven +largely by dramatic growth in cellular network traffic. Training and +maintaining learned models that work well across a large number of cell sites +has thus become a pertinent problem. This paper proposes a scalable framework +for constructing a reinforcement learning policy bank that can perform RAN +optimization across a large number of cell sites with varying traffic patterns. +Central to our framework is a novel application of anomaly detection techniques +to assess the compatibility between sites (tasks) and the policy bank. This +allows our framework to intelligently identify when a policy can be reused for +a task, and when a new policy needs to be trained and added to the policy bank. +Our results show that our approach to compatibility assessment leads to an +efficient use of computational resources, by allowing us to construct a +performant policy bank without exhaustively training on all tasks, which makes +it applicable under real-world constraints. + +
+
+
+
+
+ + ☆ Low-Cost High-Power Membership Inference by Boosting Relativity + + +
+ We present a robust membership inference attack (RMIA) that amplifies the +distinction between population data and the training data on any target model, +by effectively leveraging both reference models and reference data in our +likelihood ratio test. Our algorithm exhibits superior test power +(true-positive rate) when compared to prior methods, even at extremely low +false-positive error rates (as low as 0). Also, under computation constraints, +where only a limited number of reference models (as few as 1) are available, +our method performs exceptionally well, unlike some prior attacks that approach +random guessing in such scenarios. Our method lays the groundwork for +cost-effective and practical yet powerful and robust privacy risk analysis of +machine learning algorithms. + +
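For intuition only, here is a schematic of the likelihood-ratio idea (not the exact RMIA test statistic): a candidate point is scored by comparing the target model's probability on it against reference models trained without it, then calibrated against population points; the models are assumed to be sklearn-style classifiers exposing predict_proba, and the decision fraction gamma is a placeholder.

```python
import numpy as np

def likelihood_ratio(model_prob, reference_probs):
    """How much more likely the point is under the target model than under the reference models."""
    return model_prob / (np.mean(reference_probs) + 1e-12)

def score_point(target_model, reference_models, x, y):
    t = target_model.predict_proba([x])[0][y]
    r = [m.predict_proba([x])[0][y] for m in reference_models]
    return likelihood_ratio(t, r)

def is_member(target_model, reference_models, x, y, population, gamma=0.9):
    """Declare membership if the candidate's ratio beats the ratio of most population points."""
    s = score_point(target_model, reference_models, x, y)
    pop_scores = [score_point(target_model, reference_models, z, zy) for z, zy in population]
    return np.mean(s > np.array(pop_scores)) >= gamma
```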
+
+
+
+
+ + ☆ f-FERM: A Scalable Framework for Robust Fair Empirical Risk Minimization + + +
+ Training and deploying machine learning models that meet fairness criteria +for protected groups are fundamental in modern artificial intelligence. While +numerous constraints and regularization terms have been proposed in the +literature to promote fairness in machine learning tasks, most of these methods +are not amenable to stochastic optimization due to the complex and nonlinear +structure of constraints and regularizers. Here, the term "stochastic" refers +to the ability of the algorithm to work with small mini-batches of data. +Motivated by the limitation of existing literature, this paper presents a +unified stochastic optimization framework for fair empirical risk minimization +based on f-divergence measures (f-FERM). The proposed stochastic algorithm +enjoys theoretical convergence guarantees. In addition, our experiments +demonstrate the superiority of fairness-accuracy tradeoffs offered by f-FERM +for almost all batch sizes (ranging from full-batch to batch size of one). +Moreover, we show that our framework can be extended to the case where there is +a distribution shift from training to the test data. Our extension is based on +a distributionally robust optimization reformulation of f-FERM objective under +$L_p$ norms as uncertainty sets. Again, in this distributionally robust +setting, f-FERM not only enjoys theoretical convergence guarantees but also +outperforms other baselines in the literature in the tasks involving +distribution shifts. An efficient stochastic implementation of $f$-FERM is +publicly available. + +
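For intuition only, here is a toy mini-batch fairness penalty using one member of the f-divergence family (a KL term between each group's average predicted distribution and the batch average); it is not the paper's f-FERM objective and, unlike f-FERM, this naive batch estimate is not claimed to give unbiased stochastic gradients.

```python
import torch
import torch.nn.functional as F

def kl_fairness_penalty(logits, groups, eps=1e-8):
    """KL divergence between each group's mean predicted distribution and the batch mean."""
    probs = F.softmax(logits, dim=1)
    overall = probs.mean(dim=0)
    penalty = 0.0
    for g in torch.unique(groups):
        p_g = probs[groups == g].mean(dim=0)
        penalty = penalty + (p_g * ((p_g + eps) / (overall + eps)).log()).sum()
    return penalty

# Hypothetical training step: task loss plus a weighted fairness penalty on a mini-batch.
model = torch.nn.Linear(16, 2)
x, y = torch.randn(64, 16), torch.randint(0, 2, (64,))
groups = torch.randint(0, 2, (64,))          # protected-group labels for the batch
logits = model(x)
loss = F.cross_entropy(logits, y) + 0.1 * kl_fairness_penalty(logits, groups)
loss.backward()
```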
+
+
 comment: 23 pages, 5 figures
+
+
+
+
+
+ + ☆ CAFE: Towards Compact, Adaptive, and Fast Embedding for Large-scale + Recommendation Models + + +
+ The growing memory demands of embedding tables in Deep Learning
+Recommendation Models (DLRMs) pose great challenges for model training and
+deployment. Existing embedding compression solutions cannot simultaneously meet
+three key design requirements: memory efficiency, low latency, and adaptability
+to dynamic data distribution. This paper presents CAFE, a Compact, Adaptive,
+and Fast Embedding compression framework that addresses the above requirements.
+The design philosophy of CAFE is to dynamically allocate more memory resources
+to important features (called hot features), and allocate less memory to
+unimportant ones. In CAFE, we propose a fast and lightweight sketch data
+structure, named HotSketch, to capture feature importance and report hot
+features in real time. We assign each reported hot feature a unique
+embedding. For the non-hot features, we allow multiple features to share one
+embedding by using a hash embedding technique. Guided by our design philosophy,
+we further propose a multi-level hash embedding framework to optimize the
+embedding tables of non-hot features. We theoretically analyze the accuracy of
+HotSketch, and analyze model convergence under embedding deviation. Extensive
+experiments show that CAFE significantly outperforms existing embedding
+compression methods, yielding 3.92% and 3.68% higher testing AUC on the Criteo
+Kaggle and CriteoTB datasets at a compression ratio of 10000x. The
+source code of CAFE is available on GitHub.
+
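A minimal sketch of the hot/non-hot embedding split: hot feature IDs (here supplied directly, standing in for what a frequency sketch such as HotSketch would report) get dedicated embedding rows, while the long tail shares rows through hashing; the dimensions and bucket count are placeholders.

```python
import torch
import torch.nn as nn

class HotColdEmbedding(nn.Module):
    """Dedicated embeddings for 'hot' feature IDs, hash-shared embeddings for everything else."""
    def __init__(self, hot_ids, dim, n_shared_buckets):
        super().__init__()
        self.hot_index = {fid: i for i, fid in enumerate(hot_ids)}
        self.hot = nn.Embedding(len(hot_ids), dim)
        self.shared = nn.Embedding(n_shared_buckets, dim)
        self.n_shared = n_shared_buckets

    def forward(self, feature_ids):
        out = []
        for fid in feature_ids.tolist():
            if fid in self.hot_index:
                out.append(self.hot(torch.tensor(self.hot_index[fid])))
            else:
                # Non-hot features collide into a smaller shared table via hashing.
                out.append(self.shared(torch.tensor(hash(fid) % self.n_shared)))
        return torch.stack(out)

# Hypothetical usage: IDs 7 and 42 were reported as hot by the frequency sketch.
emb = HotColdEmbedding(hot_ids=[7, 42], dim=16, n_shared_buckets=1000)
vectors = emb(torch.tensor([7, 42, 123456, 999]))
```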
+
+
+
+
+ + ☆ Seller-side Outcome Fairness in Online Marketplaces + + +
+ This paper aims to investigate and achieve seller-side fairness within online +marketplaces, where many sellers and their items are not sufficiently exposed +to customers in an e-commerce platform. This phenomenon raises concerns +regarding the potential loss of revenue associated with less exposed items as +well as less marketplace diversity. We introduce the notion of seller-side +outcome fairness and build an optimization model to balance collected +recommendation rewards and the fairness metric. We then propose a +gradient-based data-driven algorithm based on the duality and bandit theory. +Our numerical experiments on real e-commerce data sets show that our algorithm +can lift seller fairness measures while not hurting metrics like collected +Gross Merchandise Value (GMV) and total purchases. + +
+
+
+
+
+ + ☆ Customizable Combination of Parameter-Efficient Modules for Multi-Task + Learning + + +
+ Modular and composable transfer learning is an emerging direction in the
+field of Parameter Efficient Fine-Tuning, as it enables neural networks to
+better organize various aspects of knowledge, leading to improved cross-task
+generalization. In this paper, we introduce a novel approach, Customized
+Polytropon (C-Poly), that combines task-common skills and task-specific skills,
+with the skill parameters compactly parameterized using low-rank
+techniques. Each task is associated with a customizable number of exclusive
+specialized skills and also benefits from skills shared with peer tasks. A
+skill assignment matrix is jointly learned. To evaluate our approach, we
+conducted extensive experiments on the Super-NaturalInstructions and the
+SuperGLUE benchmarks. Our findings demonstrate that C-Poly outperforms
+fully-shared, task-specific, and skill-indistinguishable baselines,
+significantly enhancing the sample efficiency in multi-task learning scenarios.
+
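An illustrative sketch of combining shared and task-specific low-rank "skill" modules with a learned assignment matrix; the dimensions, sigmoid gating, and module layout are assumptions for exposition and may differ from the actual C-Poly parameterization.

```python
import torch
import torch.nn as nn

class LowRankSkill(nn.Module):
    """One low-rank (LoRA-style) delta applied on top of a frozen base linear layer."""
    def __init__(self, d_in, d_out, rank=8):
        super().__init__()
        self.A = nn.Parameter(torch.randn(d_in, rank) * 0.01)
        self.B = nn.Parameter(torch.zeros(rank, d_out))

    def forward(self, x):
        return x @ self.A @ self.B

class SkillCombiner(nn.Module):
    """Mixes shared and task-specific skills with a learned per-task assignment matrix."""
    def __init__(self, base, n_tasks, n_shared, n_specific, rank=8):
        super().__init__()
        self.base = base
        d_in, d_out = base.in_features, base.out_features
        n_skills = n_shared + n_tasks * n_specific
        self.skills = nn.ModuleList(LowRankSkill(d_in, d_out, rank) for _ in range(n_skills))
        self.assign = nn.Parameter(torch.zeros(n_tasks, n_skills))  # learned skill assignment

    def forward(self, x, task_id):
        weights = torch.sigmoid(self.assign[task_id])
        delta = sum(w * skill(x) for w, skill in zip(weights, self.skills))
        return self.base(x) + delta

layer = SkillCombiner(nn.Linear(32, 32), n_tasks=4, n_shared=2, n_specific=1)
y = layer(torch.randn(8, 32), task_id=0)
```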
+
+ comment: 22 pages, 9 figures +
+
+
+
+
+ + ☆ Generalizable Neural Physics Solvers by Baldwinian Evolution + + +
+ Physics-informed neural networks (PINNs) are at the forefront of scientific +machine learning, making possible the creation of machine intelligence that is +cognizant of physical laws and able to accurately simulate them. In this paper, +the potential of discovering PINNs that generalize over an entire family of +physics tasks is studied, for the first time, through a biological lens of the +Baldwin effect. Drawing inspiration from the neurodevelopment of precocial +species that have evolved to learn, predict and react quickly to their +environment, we envision PINNs that are pre-wired with connection strengths +inducing strong biases towards efficient learning of physics. To this end, +evolutionary selection pressure (guided by proficiency over a family of tasks) +is coupled with lifetime learning (to specialize on a smaller subset of those +tasks) to produce PINNs that demonstrate fast and physics-compliant prediction +capabilities across a range of empirically challenging problem instances. The +Baldwinian approach achieves an order of magnitude improvement in prediction +accuracy at a fraction of the computation cost compared to state-of-the-art +results with PINNs meta-learned by gradient descent. This paper marks a leap +forward in the meta-learning of PINNs as generalizable physics solvers. + +
+
+
+
+
+ + ☆ Multicoated and Folded Graph Neural Networks with Strong Lottery Tickets + + +
+ The Strong Lottery Ticket Hypothesis (SLTH) demonstrates the existence of +high-performing subnetworks within a randomly initialized model, discoverable +through pruning a convolutional neural network (CNN) without any weight +training. A recent study, called Untrained GNNs Tickets (UGT), expanded SLTH +from CNNs to shallow graph neural networks (GNNs). However, discrepancies +persist when comparing baseline models with learned dense weights. +Additionally, there remains an unexplored area in applying SLTH to deeper GNNs, +which, despite delivering improved accuracy with additional layers, suffer from +excessive memory requirements. To address these challenges, this work utilizes +Multicoated Supermasks (M-Sup), a scalar pruning mask method, and implements it +in GNNs by proposing a strategy for setting its pruning thresholds adaptively. +In the context of deep GNNs, this research uncovers the existence of untrained +recurrent networks, which exhibit performance on par with their trained +feed-forward counterparts. This paper also introduces the Multi-Stage Folding +and Unshared Masks methods to expand the search space in terms of both +architecture and parameters. Through the evaluation of various datasets, +including the Open Graph Benchmark (OGB), this work establishes a triple-win +scenario for SLTH-based GNNs: by achieving high sparsity, competitive +performance, and high memory efficiency with up to 98.7\% reduction, it +demonstrates suitability for energy-efficient graph processing. + +
+
+ comment: 9 pages, accepted in the Second Learning on Graphs Conference (LoG + 2023) +
+
+
+
+
+ + ☆ Deep Multimodal Fusion for Surgical Feedback Classification + + +
+ Quantification of real-time informal feedback delivered by an experienced
+surgeon to a trainee during surgery is important for skill improvements in
+surgical training. Such feedback in the live operating room is inherently
+multimodal, consisting of verbal conversations (e.g., questions and answers) as
+well as non-verbal elements (e.g., through visual cues like pointing to
+anatomic elements). In this work, we leverage a clinically-validated
+five-category classification of surgical feedback: "Anatomic", "Technical",
+"Procedural", "Praise" and "Visual Aid". We then develop a multi-label machine
+learning model to classify these five categories of surgical feedback from
+inputs of text, audio, and video modalities. The ultimate goal of our work is
+to help automate the annotation of real-time contextual surgical feedback at
+scale. Our automated classification of surgical feedback achieves AUCs ranging
+from 71.5 to 77.6, with fusion improving performance by 3.1%. We also show
+that high-quality manual transcriptions of feedback audio from experts improve
+AUCs to between 76.5 and 96.2, which demonstrates a clear path toward future
+improvements. Empirically, we find that a staged training strategy, which
+first pre-trains each modality separately and then trains them jointly, is
+more effective than training all modalities together from the start. We also
+present intuitive findings on the importance of modalities for different
+feedback categories. This work offers an important first look at the
+feasibility of automated classification of real-world live surgical feedback
+based on text, audio, and video modalities.
+
+
+
+
+
+ + ☆ Accelerated Gradient Algorithms with Adaptive Subspace Search for + Instance-Faster Optimization + + +
+ Gradient-based minimax optimal algorithms have greatly promoted the
+development of continuous optimization and machine learning. One seminal work
+due to Yurii Nesterov [Nes83a] established $\tilde{\mathcal{O}}(\sqrt{L/\mu})$
+gradient complexity for minimizing an $L$-smooth $\mu$-strongly convex
+objective. However, an ideal algorithm would adapt to the explicit complexity
+of a particular objective function and incur faster rates for simpler problems,
+prompting us to reconsider two shortcomings of existing optimization modeling
+and analysis. (i) Worst-case optimality is neither instance optimality
+nor what is observed in practice. (ii) The traditional $L$-smoothness condition
+may not be the primary abstraction/characterization for modern practical
+problems.
+ In this paper, we open up a new way to design and analyze gradient-based
+algorithms with direct applications in machine learning, including linear
+regression and beyond. We introduce two factors $(\alpha, \tau_{\alpha})$ to
+refine the description of the degenerated condition of the optimization
+problems based on the observation that the singular values of the Hessian often
+drop sharply. We design adaptive algorithms that solve simpler problems,
+without prior knowledge, using fewer gradient or analogous oracle accesses. The
+algorithms also improve the state-of-the-art complexities for several problems
+in machine learning, thereby solving the open problem of how to design faster
+algorithms in light of the known complexity lower bounds. Specifically, when
+the nuclear norm is bounded by $\mathcal{O}(1)$, we achieve an optimal
+$\tilde{\mathcal{O}}(\mu^{-1/3})$ (vs. $\tilde{\mathcal{O}}(\mu^{-1/2})$)
+gradient complexity for linear regression. We hope this work will prompt a
+rethinking of how the difficulty of modern optimization problems is understood.
+
+
+ comment: Optimization for Machine Learning +
+
+
+
+
+ + ☆ SDSRA: A Skill-Driven Skill-Recombination Algorithm for Efficient Policy + Learning + + +
+ In this paper, we introduce the Skill-Driven Skill Recombination Algorithm
+(SDSRA), an innovative framework that significantly enhances the efficiency of
+achieving maximum entropy in reinforcement learning tasks. We find that SDSRA
+achieves faster convergence than the traditional Soft Actor-Critic (SAC)
+algorithm and produces improved policies. By integrating skill-based strategies
+within the robust Actor-Critic framework, SDSRA demonstrates remarkable
+adaptability and performance across a wide array of complex and diverse
+benchmarks.
+
+
+
+
+
+ + ☆ Bootstrap Your Own Variance + + +
+ Understanding model uncertainty is important for many applications. We
+propose Bootstrap Your Own Variance (BYOV), combining Bootstrap Your Own Latent
+(BYOL), a negative-free Self-Supervised Learning (SSL) algorithm, with Bayes by
+Backprop (BBB), a Bayesian method for estimating model posteriors. We find that
+the learned predictive std of BYOV vs. a supervised BBB model is well captured
+by a Gaussian distribution, providing preliminary evidence that the learned
+parameter posterior is useful for label-free uncertainty estimation. BYOV
+improves upon the deterministic BYOL baseline (+2.83% test ECE, +1.03% test
+Brier) and presents better calibration and reliability when tested with various
+augmentations (e.g., +2.4% test ECE, +1.2% test Brier for Salt & Pepper noise).
+
+
+
+
+
+ + ☆ Constrained Bayesian Optimization Under Partial Observations: Balanced + Improvements and Provable Convergence + + +
+ The partially observable constrained optimization problems (POCOPs) impede +data-driven optimization techniques since an infeasible solution of POCOPs can +provide little information about the objective as well as the constraints. We +endeavor to design an efficient and provable method for expensive POCOPs under +the framework of constrained Bayesian optimization. Our method consists of two +key components. Firstly, we present an improved design of the acquisition +functions that introduces balanced exploration during optimization. We +rigorously study the convergence properties of this design to demonstrate its +effectiveness. Secondly, we propose a Gaussian process embedding different +likelihoods as the surrogate model for a partially observable constraint. This +model leads to a more accurate representation of the feasible regions compared +to traditional classification-based models. Our proposed method is empirically +studied on both synthetic and real-world problems. The results demonstrate the +competitiveness of our method for solving POCOPs. + +
+
+ comment: 26 pages, 8 figures, under review +
+
+
+
+
+ + ☆ Domain Invariant Representation Learning and Sleep Dynamics Modeling for + Automatic Sleep Staging + + +
+ Sleep staging has become a critical task in diagnosing and treating sleep
+disorders to prevent sleep-related diseases. With rapidly growing large-scale
+public sleep databases and advances in machine learning, significant progress
+has been made toward automatic sleep staging. However, previous studies face
+several critical problems: the heterogeneity of subjects'
+physiological signals, the inability to extract meaningful information from
+unlabeled sleep signal data to improve predictive performances, the difficulty
+in modeling correlations between sleep stages, and the lack of an effective
+mechanism to quantify predictive uncertainty. In this study, we propose a
+neural-network-based automatic sleep staging model, named DREAM, which learns
+domain-generalized representations from physiological signals and models sleep
+dynamics. DREAM learns sleep-related and subject-invariant representations from
+diverse subjects' sleep signal segments and models sleep dynamics by capturing
+interactions between sequential signal segments and between sleep stages. In
+the experiments, we demonstrate that DREAM outperforms the existing sleep
+staging methods on three datasets. The case study demonstrates that our model
+can learn a generalized decision function, resulting in good prediction
+performance for new subjects, especially when there are differences
+between testing and training subjects. Experiments with unlabeled data show the
+benefit of leveraging unlabeled EEG data. Further, uncertainty quantification
+demonstrates that DREAM provides prediction uncertainty, making the model
+reliable and helping sleep experts in real-world applications.
+
+
+
+
+
+ + ☆ Corporate Bankruptcy Prediction with Domain-Adapted BERT + + +
+ This study applies BERT, a representative contextualized language model, to
+corporate disclosure data to predict impending bankruptcies. Prior literature
+on bankruptcy prediction mainly focuses on developing more sophisticated
+prediction methodologies with financial variables. However, in our study, we
+focus on improving the quality of the input dataset. Specifically, we employ a
+BERT model to perform sentiment analysis on MD&A disclosures. We show that BERT
+outperforms dictionary-based predictions and Word2Vec-based predictions in
+terms of adjusted R-square in logistic regression, k-nearest neighbor (kNN-5),
+and linear kernel support vector machine (SVM). Further, instead of
+pre-training the BERT model from scratch, we apply self-learning with
+confidence-based filtering to corporate disclosure data (10-K). We achieve an
+accuracy of 91.56% and demonstrate that the domain adaptation procedure brings
+a significant improvement in prediction accuracy.
+
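A minimal sketch of turning disclosure text into a sentiment feature with an off-the-shelf transformer sentiment pipeline and feeding it to a logistic regression; the model name, the single-feature design, and the toy labels are placeholders, not the paper's domain-adapted setup.

```python
from transformers import pipeline
from sklearn.linear_model import LogisticRegression

# Placeholder sentiment model; the paper instead domain-adapts BERT on 10-K filings.
sentiment = pipeline("sentiment-analysis",
                     model="distilbert-base-uncased-finetuned-sst-2-english")

def mdna_sentiment_score(paragraphs):
    """Average signed sentiment score over the MD&A paragraphs of one filing."""
    results = sentiment(paragraphs)
    signed = [r["score"] if r["label"] == "POSITIVE" else -r["score"] for r in results]
    return sum(signed) / len(signed)

# Hypothetical usage: one sentiment feature per firm (financial ratios could be appended).
filings = [["Revenue declined sharply due to ongoing litigation."],
           ["Strong cash flow and record operating margins."]]
X = [[mdna_sentiment_score(p)] for p in filings]
y = [1, 0]  # 1 = bankrupt within the prediction horizon (toy labels)
clf = LogisticRegression().fit(X, y)
```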
+
+
+
+
+ + ☆ Understanding the Role of Optimization in Double Descent NeurIPS + + +
+ The phenomenon of model-wise double descent, where the test error peaks and +then reduces as the model size increases, is an interesting topic that has +attracted the attention of researchers due to the striking observed gap between +theory and practice \citep{Belkin2018ReconcilingMM}. Additionally, while double +descent has been observed in various tasks and architectures, the peak of +double descent can sometimes be noticeably absent or diminished, even without +explicit regularization, such as weight decay and early stopping. In this +paper, we investigate this intriguing phenomenon from the optimization +perspective and propose a simple optimization-based explanation for why double +descent sometimes occurs weakly or not at all. To the best of our knowledge, we +are the first to demonstrate that many disparate factors contributing to +model-wise double descent (initialization, normalization, batch size, learning +rate, optimization algorithm) are unified from the viewpoint of optimization: +model-wise double descent is observed if and only if the optimizer can find a +sufficiently low-loss minimum. These factors directly affect the condition +number of the optimization problem or the optimizer and thus affect the final +minimum found by the optimizer, reducing or increasing the height of the double +descent peak. We conduct a series of controlled experiments on random feature +models and two-layer neural networks under various optimization settings, +demonstrating this optimization-based unified view. Our results suggest the +following implication: Double descent is unlikely to be a problem for +real-world machine learning setups. Additionally, our results help explain the +gap between weak double descent peaks in practice and strong peaks observable +in carefully designed setups. + +
+
+ comment: NeurIPS Workshop 2023 Optimization for Machine Learning +
+
+
+
+
+ + ☆ A Scalable and Generalizable Pathloss Map Prediction + + +
+ Large-scale channel prediction, i.e., estimation of the pathloss from +geographical/morphological/building maps, is an essential component of wireless +network planning. Ray tracing (RT)-based methods have been widely used for many +years, but they require significant computational effort that may become +prohibitive with the increased network densification and/or use of higher +frequencies in B5G/6G systems. In this paper, we propose a data-driven, +model-free pathloss map prediction (PMP) method, called PMNet. PMNet uses a +supervised learning approach: it is trained on a limited amount of RT (or +channel measurement) data and map data. Once trained, PMNet can predict +pathloss over location with high accuracy (an RMSE level of $10^{-2}$) in a few +milliseconds. We further extend PMNet by employing transfer learning (TL). TL +allows PMNet to learn a new network scenario quickly (x5.6 faster training) and +efficiently (using x4.5 less data) by transferring knowledge from a pre-trained +model, while retaining accuracy. Our results demonstrate that PMNet is a +scalable and generalizable ML-based PMP method, showing its potential to be +used in several network optimization applications. + +
+
+
+
+
+ + ☆ PECANN: Parallel Efficient Clustering with Graph-Based Approximate + Nearest Neighbor Search + + +
+ This paper studies density-based clustering of point sets. These methods use +dense regions of points to detect clusters of arbitrary shapes. In particular, +we study variants of density peaks clustering, a popular type of algorithm that +has been shown to work well in practice. Our goal is to cluster large +high-dimensional datasets, which are prevalent in practice. Prior solutions are +either sequential, and cannot scale to large data, or are specialized for +low-dimensional data. + This paper unifies the different variants of density peaks clustering into a +single framework, PECANN, by abstracting out several key steps common to this +class of algorithms. One such key step is to find nearest neighbors that +satisfy a predicate function, and one of the main contributions of this paper +is an efficient way to do this predicate search using graph-based approximate +nearest neighbor search (ANNS). To provide ample parallelism, we propose a +doubling search technique that enables points to find an approximate nearest +neighbor satisfying the predicate in a small number of rounds. Our technique +can be applied to many existing graph-based ANNS algorithms, which can all be +plugged into PECANN. + We implement five clustering algorithms with PECANN and evaluate them on +synthetic and real-world datasets with up to 1.28 million points and up to 1024 +dimensions on a 30-core machine with two-way hyper-threading. Compared to the +state-of-the-art FASTDP algorithm for high-dimensional density peaks +clustering, which is sequential, our best algorithm is 45x-734x faster while +achieving competitive ARI scores. Compared to the state-of-the-art parallel +DPC-based algorithm, which is optimized for low dimensions, we show that PECANN +is two orders of magnitude faster. As far as we know, our work is the first to +evaluate DPC variants on large high-dimensional real-world image and text +embedding datasets. + +
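A schematic of the doubling predicate search described above, with a brute-force k-NN standing in for the graph-based ANNS index: request k neighbors, keep those satisfying the predicate, and double k until one is found; the starting k and the example predicate are placeholders.

```python
import numpy as np

def knn(query, points, k):
    """Brute-force k nearest neighbors; a graph-based ANNS index would replace this in practice."""
    d = np.linalg.norm(points - query, axis=1)
    idx = np.argsort(d)[:k]
    return idx, d[idx]

def doubling_predicate_search(query, points, predicate, k0=4):
    """Find an (approximate) nearest neighbor satisfying `predicate`, doubling k each round."""
    k = k0
    while True:
        idx, dist = knn(query, points, min(k, len(points)))
        for i, dd in zip(idx, dist):
            if predicate(i):
                return i, dd
        if k >= len(points):
            return None, np.inf   # no point satisfies the predicate
        k *= 2

# Example predicate from density peaks clustering: a neighbor must have higher density.
points = np.random.rand(1000, 16)
density = np.random.rand(1000)        # densities would normally be estimated from kNN distances
q = 0
i, dd = doubling_predicate_search(points[q], points, lambda j: density[j] > density[q])
```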
+
+
+
+
+ + ♻ ☆ A unified framework for information-theoretic generalization bounds + + +
+ This paper presents a general methodology for deriving information-theoretic +generalization bounds for learning algorithms. The main technical tool is a +probabilistic decorrelation lemma based on a change of measure and a relaxation +of Young's inequality in $L_{\psi_p}$ Orlicz spaces. Using the decorrelation +lemma in combination with other techniques, such as symmetrization, couplings, +and chaining in the space of probability measures, we obtain new upper bounds +on the generalization error, both in expectation and in high probability, and +recover as special cases many of the existing generalization bounds, including +the ones based on mutual information, conditional mutual information, +stochastic chaining, and PAC-Bayes inequalities. In addition, the +Fernique-Talagrand upper bound on the expected supremum of a subgaussian +process emerges as a special case. + +
+
+ comment: 19 pages; final version accepted to Neural Information Processing + Systems +
+
+
+
+
+ + ♻ ☆ Causal Estimation of Exposure Shifts with Neural Networks: Evaluating + the Health Benefits of Stricter Air Quality Standards in the US + + +
+ In policy research, one of the most critical analytic tasks is to estimate
+the causal effect of a policy-relevant shift to the distribution of a
+continuous exposure/treatment on an outcome of interest. We call this problem
+shift-response function (SRF) estimation. Existing neural network methods
+involving robust causal-effect estimators lack theoretical guarantees and
+practical implementations for SRF estimation. Motivated by a key
+policy-relevant question in public health, we develop a neural network method
+and its theoretical underpinnings to estimate SRFs with robustness and
+efficiency guarantees. We then apply our method to data consisting of 68
+million individuals and 27 million deaths across the U.S. to estimate the
+causal effect from revising the US National Ambient Air Quality Standards
+(NAAQS) for PM 2.5 from 12 $\mu g/m^3$ to 9 $\mu g/m^3$. This change has been
+recently proposed by the US Environmental Protection Agency (EPA). Our goal is
+to estimate, for the first time, the reduction in deaths that would result from
+this anticipated revision using causal methods for SRFs. Our proposed method,
+called Targeted Regularization for Exposure Shifts with Neural
+Networks (TRESNET), contributes to the neural network literature for causal
+inference in two ways: first, it proposes a targeted regularization loss with
+theoretical properties that ensure double robustness and achieves asymptotic
+efficiency specific for SRF estimation; second, it enables loss functions from
+the exponential family of distributions to accommodate non-continuous outcome
+distributions (such as hospitalization or mortality counts). We complement our
+application with benchmark experiments that demonstrate TRESNET's broad
+applicability and competitiveness.
+
+
+
+
+
+ + ♻ ☆ Targeted Separation and Convergence with Kernel Discrepancies + + +
+ Maximum mean discrepancies (MMDs) like the kernel Stein discrepancy (KSD) +have grown central to a wide range of applications, including hypothesis +testing, sampler selection, distribution approximation, and variational +inference. In each setting, these kernel-based discrepancy measures are +required to (i) separate a target P from other probability measures or even +(ii) control weak convergence to P. In this article we derive new sufficient +and necessary conditions to ensure (i) and (ii). For MMDs on separable metric +spaces, we characterize those kernels that separate Bochner embeddable measures +and introduce simple conditions for separating all measures with unbounded +kernels and for controlling convergence with bounded kernels. We use these +results on $\mathbb{R}^d$ to substantially broaden the known conditions for KSD +separation and convergence control and to develop the first KSDs known to +exactly metrize weak convergence to P. Along the way, we highlight the +implications of our results for hypothesis testing, measuring and improving +sample quality, and sampling with Stein variational gradient descent. + +
+
+
+
+
+ + ♻ ☆ Incorporating Crowdsourced Annotator Distributions into Ensemble + Modeling to Improve Classification Trustworthiness for Ancient Greek Papyri + + +
+ Performing classification on noisy, crowdsourced image datasets can prove
+challenging even for the best neural networks. Two issues which complicate the
+problem on such datasets are class imbalance and ground-truth uncertainty in
+labeling. The AL-ALL and AL-PUB datasets - consisting of tightly cropped,
+individual characters from images of ancient Greek papyri - are strongly
+affected by both issues. The application of ensemble modeling to such datasets
+can help identify images where the ground-truth is questionable and quantify
+the trustworthiness of those samples. As such, we apply stacked generalization
+consisting of nearly identical ResNets with different loss functions: one
+utilizing sparse cross-entropy (CXE) and the other Kullback-Leibler divergence
+(KLD). Both networks use labels drawn from a crowd-sourced consensus. This
+consensus is derived from a Normalized Distribution of Annotations (NDA) based
+on all annotations for a given character in the dataset. For the second
+network, the KLD is calculated with respect to the NDA. For our ensemble model,
+we apply a k-nearest neighbors model to the outputs of the CXE and KLD
+networks. Individually, the ResNet models have approximately 93% accuracy,
+while the ensemble model achieves an accuracy of > 95%, increasing the
+classification trustworthiness. We also perform an analysis of the Shannon
+entropy of the various models' output distributions to measure classification
+uncertainty. Our results suggest that entropy is useful for predicting model
+misclassifications.
+
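A compact sketch of the stacking step: concatenate the class-probability outputs of the two base networks (random placeholders below stand in for the CXE and KLD ResNets) and fit a k-nearest-neighbors meta-model on a held-out split; the class count and k are placeholders.

```python
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

def stack_probs(cxe_probs, kld_probs):
    """Meta-features: concatenated per-class probabilities from the CXE and KLD networks."""
    return np.hstack([cxe_probs, kld_probs])

# These would come from the two ResNets on a validation split; random values keep this runnable.
n_val, n_classes = 500, 24
cxe_probs = np.random.dirichlet(np.ones(n_classes), size=n_val)
kld_probs = np.random.dirichlet(np.ones(n_classes), size=n_val)
labels = np.random.randint(0, n_classes, size=n_val)

meta = KNeighborsClassifier(n_neighbors=5)
meta.fit(stack_probs(cxe_probs, kld_probs), labels)
ensemble_pred = meta.predict(stack_probs(cxe_probs[:10], kld_probs[:10]))
```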
+
+
+
+
+ + ♻ ☆ Improved Convergence of Score-Based Diffusion Models via + Prediction-Correction + + +
+ Score-based generative models (SGMs) are powerful tools to sample from +complex data distributions. Their underlying idea is to (i) run a forward +process for time $T_1$ by adding noise to the data, (ii) estimate its score +function, and (iii) use such estimate to run a reverse process. As the reverse +process is initialized with the stationary distribution of the forward one, the +existing analysis paradigm requires $T_1\to\infty$. This is however +problematic: from a theoretical viewpoint, for a given precision of the score +approximation, the convergence guarantee fails as $T_1$ diverges; from a +practical viewpoint, a large $T_1$ increases computational costs and leads to +error propagation. This paper addresses the issue by considering a version of +the popular predictor-corrector scheme: after running the forward process, we +first estimate the final distribution via an inexact Langevin dynamics and then +revert the process. Our key technical contribution is to provide convergence +guarantees which require to run the forward process only for a fixed finite +time $T_1$. Our bounds exhibit a mild logarithmic dependence on the input +dimension and the subgaussian norm of the target distribution, have minimal +assumptions on the data, and require only to control the $L^2$ loss on the +score approximation, which is the quantity minimized in practice. + +
+
+ comment: 33 pages; included discretization result; typos corrected +
+
+
+
+
+ + ♻ ☆ KPI Extraction from Maintenance Work Orders -- A Comparison of Expert + Labeling, Text Classification and AI-Assisted Tagging for Computing Failure + Rates of Wind Turbines + + +
+ Maintenance work orders are commonly used to document information about wind
+turbine operation and maintenance. This includes details about proactive and
+reactive wind turbine downtimes, such as preventative and corrective
+maintenance. However, the information contained in maintenance work orders is
+often unstructured and difficult to analyze, presenting challenges for
+decision-makers wishing to use it for optimizing operation and maintenance. To
+address this issue, this work compares three different approaches to
+calculating reliability key performance indicators from maintenance work
+orders. The first approach involves manual labeling of the maintenance work
+orders by domain experts, using the schema defined in an industrial guideline
+to assign the label accordingly. The second approach involves the development
+of a model that automatically labels the maintenance work orders using text
+classification methods. Through this method, we are able to achieve macro
+average and weighted average F1-scores of 0.75 and 0.85, respectively. The
+third technique uses an AI-assisted tagging tool to tag and structure the raw
+maintenance information, together with a novel rule-based approach for
+extracting relevant maintenance work orders for failure rate calculation. In
+our experiments, the AI-assisted tool leads to an 88% drop in tagging time in
+comparison to the other two approaches, while expert labeling and text
+classification are more accurate in KPI extraction. Overall, our findings make
+extracting maintenance information from maintenance work orders more
+efficient, enable the assessment of reliability key performance indicators,
+and therefore support the optimization of wind turbine operation and
+maintenance.
+
+
+
+
+
+ + ♻ ☆ Consensus, dissensus and synergy between clinicians and specialist + foundation models in radiology report generation + + +
+ Radiology reports are an instrumental part of modern medicine, informing key +clinical decisions such as diagnosis and treatment. The worldwide shortage of +radiologists, however, restricts access to expert care and imposes heavy +workloads, contributing to avoidable errors and delays in report delivery. +While recent progress in automated report generation with vision-language +models offer clear potential in ameliorating the situation, the path to +real-world adoption has been stymied by the challenge of evaluating the +clinical quality of AI-generated reports. In this study, we build a +state-of-the-art report generation system for chest radiographs, +\textit{Flamingo-CXR}, by fine-tuning a well-known vision-language foundation +model on radiology data. To evaluate the quality of the AI-generated reports, a +group of 16 certified radiologists provide detailed evaluations of AI-generated +and human written reports for chest X-rays from an intensive care setting in +the United States and an inpatient setting in India. At least one radiologist +(out of two per case) preferred the AI report to the ground truth report in +over 60$\%$ of cases for both datasets. Amongst the subset of AI-generated +reports that contain errors, the most frequently cited reasons were related to +the location and finding, whereas for human written reports, most mistakes were +related to severity and finding. This disparity suggested potential +complementarity between our AI system and human experts, prompting us to +develop an assistive scenario in which \textit{Flamingo-CXR} generates a +first-draft report, which is subsequently revised by a clinician. This is the +first demonstration of clinician-AI collaboration for report writing, and the +resultant reports are assessed to be equivalent or preferred by at least one +radiologist to reports written by experts alone in 80$\%$ of in-patient cases +and 60$\%$ of intensive care cases. + +
+
+
+
+
+ + ♻ ☆ LLMLingua: Compressing Prompts for Accelerated Inference of Large + Language Models EMNLP 2023 + + +
+ Large language models (LLMs) have been applied in various applications due to +their astonishing capabilities. With advancements in technologies such as +chain-of-thought (CoT) prompting and in-context learning (ICL), the prompts fed +to LLMs are becoming increasingly lengthy, even exceeding tens of thousands of +tokens. To accelerate model inference and reduce cost, this paper presents +LLMLingua, a coarse-to-fine prompt compression method that involves a budget +controller to maintain semantic integrity under high compression ratios, a +token-level iterative compression algorithm to better model the interdependence +between compressed contents, and an instruction tuning based method for +distribution alignment between language models. We conduct experiments and +analysis over four datasets from different scenarios, i.e., GSM8K, BBH, +ShareGPT, and Arxiv-March23; showing that the proposed approach yields +state-of-the-art performance and allows for up to 20x compression with little +performance loss. Our code is available at https://aka.ms/LLMLingua. + +
+
+ comment: Accepted at EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ A Hyperparameter Study for Quantum Kernel Methods + + +
+ Quantum kernel methods are a promising approach in quantum machine learning
+thanks to the guarantees connected to them. Their accessibility for analytic
+considerations also opens up the possibility of prescreening datasets based on
+their potential for a quantum advantage. To do so, earlier works developed the
+geometric difference, which can be understood as a closeness measure between
+two kernel-based machine learning approaches, most importantly between a
+quantum kernel and a classical kernel. This metric links the quantum and
+classical model complexities. Therefore, it raises the question of whether the
+geometric difference, based on its relation to model complexity, can be a
+useful tool in evaluations other than for the potential for quantum advantage.
+In this work, we investigate the effects of hyperparameter choice on the model
+performance and the generalization gap between classical and quantum kernels.
+The importance of hyperparameter optimization is also well known in classical
+machine learning. Especially for the quantum Hamiltonian evolution feature map,
+the scaling of the input data has been shown to be crucial. However, there are
+additional parameters left to be optimized, like the best number of qubits to
+trace out before computing a projected quantum kernel. We investigate the
+influence of these hyperparameters and compare the classically reliable method
+of cross validation with the method of choosing based on the geometric
+difference. Based on a thorough investigation of the hyperparameters across
+11 datasets, we identified commonalities that can be exploited when examining a
+new dataset. In addition, our findings contribute to a better understanding of
+the applicability of the geometric difference.
+
+
+ comment: Updated experimental results, adapted text +
+
+
+
+
+ + ♻ ☆ Dimensionless Anomaly Detection on Multivariate Streams with Variance + Norm and Path Signature + + +
+ In this paper, we propose a dimensionless anomaly detection method for
+multivariate streams. Our method is independent of the units of measurement of
+the different stream channels and is therefore dimensionless. We first propose
+the variance norm, a generalisation of the Mahalanobis distance that rigorously
+handles infinite-dimensional feature spaces and singular empirical covariance
+matrices. We then combine the variance norm with the path signature, an
+infinite collection of iterated integrals that provide global features of
+streams, to propose SigMahaKNN, a method for anomaly detection on
+(multivariate) streams. We show that SigMahaKNN is invariant to stream
+reparametrisation and stream concatenation, and has graded discrimination power
+depending on the truncation level of the path signature. We implement
+SigMahaKNN as open-source software and perform extensive numerical
+experiments, showing significantly improved anomaly detection on streams
+compared to isolation forest and local outlier factor in applications ranging
+from language analysis and handwriting analysis to ship movement path analysis
+and univariate time-series analysis.
+
+
+
+
+
+ + ♻ ☆ Conditions for Length Generalization in Learning Reasoning Skills + + +
+ Reasoning is a fundamental capability of AI agents. Recently, large language
+models (LLMs) have shown remarkable abilities to perform reasoning tasks.
+However, numerous evaluations of the reasoning capabilities of LLMs have also
+shown some limitations. An outstanding limitation is length generalization,
+meaning that when trained on reasoning problems of smaller lengths or sizes,
+the resulting models struggle with problems of larger sizes or lengths. This
+potentially indicates some theoretical limitations of generalization in
+learning reasoning skills. These evaluations and their observations motivated
+us to perform a theoretical study of the length generalization problem. This
+work focuses on reasoning tasks that can be formulated as Markov dynamic
+processes (MDPs) and/or directed acyclic graphs (DAGs). It identifies and
+proves conditions that decide whether the length generalization problem can be
+solved or not for a reasoning task in a particular representation. Experiments
+are also conducted to verify the theoretical results.
+
+
+
+
+
+ + ♻ ☆ Autoencoders for discovering manifold dimension and coordinates in data + from complex dynamical systems + + +
+ While many phenomena in physics and engineering are formally +high-dimensional, their long-time dynamics often live on a lower-dimensional +manifold. The present work introduces an autoencoder framework that combines +implicit regularization with internal linear layers and $L_2$ regularization +(weight decay) to automatically estimate the underlying dimensionality of a +data set, produce an orthogonal manifold coordinate system, and provide the +mapping functions between the ambient space and manifold space, allowing for +out-of-sample projections. We validate our framework's ability to estimate the +manifold dimension for a series of datasets from dynamical systems of varying +complexities and compare to other state-of-the-art estimators. We analyze the +training dynamics of the network to glean insight into the mechanism of +low-rank learning and find that collectively each of the implicit regularizing +layers compound the low-rank representation and even self-correct during +training. Analysis of gradient descent dynamics for this architecture in the +linear case reveals the role of the internal linear layers in leading to faster +decay of a "collective weight variable" incorporating all layers, and the role +of weight decay in breaking degeneracies and thus driving convergence along +directions in which no decay would occur in its absence. We show that this +framework can be naturally extended for applications of state-space modeling +and forecasting by generating a data-driven dynamic model of a spatiotemporally +chaotic partial differential equation using only the manifold coordinates. +Finally, we demonstrate that our framework is robust to hyperparameter choices. + +
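To make the architectural recipe concrete, here is a condensed PyTorch sketch under assumed sizes: a nonlinear encoder followed by a stack of internal linear layers, trained with weight decay, after which the latent dimensionality is read off from the singular-value spectrum of the latent codes; the widths, training budget, and threshold are placeholders, not the paper's settings.

```python
import torch
import torch.nn as nn

class LinearStackAE(nn.Module):
    """Autoencoder whose extra internal linear layers (plus weight decay) encourage low-rank latents."""
    def __init__(self, d_in, d_latent, n_linear=4):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(d_in, 128), nn.GELU(), nn.Linear(128, d_latent))
        self.linears = nn.Sequential(*[nn.Linear(d_latent, d_latent) for _ in range(n_linear)])
        self.decoder = nn.Sequential(nn.Linear(d_latent, 128), nn.GELU(), nn.Linear(128, d_in))

    def forward(self, x):
        z = self.linears(self.encoder(x))
        return self.decoder(z), z

X = torch.randn(1024, 64)
model = LinearStackAE(d_in=64, d_latent=32)
opt = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)  # the L2 (weight decay) term
for _ in range(200):
    x_hat, z = model(X)
    loss = ((x_hat - X) ** 2).mean()
    opt.zero_grad()
    loss.backward()
    opt.step()

# Estimate the manifold dimension from the latent singular values (the threshold is arbitrary here).
with torch.no_grad():
    _, z = model(X)
    s = torch.linalg.svdvals(z - z.mean(dim=0))
    dim_estimate = int((s / s[0] > 1e-2).sum())
```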
+
+
+
+
+ + ♻ ☆ Point-PEFT: Parameter-Efficient Fine-Tuning for 3D Pre-trained Models + + +
+ The popularity of pre-trained large models has revolutionized downstream
+tasks across diverse fields, such as language, vision, and multi-modality. To
+minimize the adaptation cost for downstream tasks, many Parameter-Efficient
+Fine-Tuning (PEFT) techniques have been proposed for language and 2D image
+pre-trained models. However, specialized PEFT methods for 3D pre-trained
+models are still under-explored. To this end, we introduce Point-PEFT, a novel
+framework for adapting point cloud pre-trained models with minimal learnable
+parameters. Specifically, for a pre-trained 3D model, we freeze most of its
+parameters, and only tune the newly added PEFT modules on downstream tasks,
+which consist of a Point-prior Prompt and a Geometry-aware Adapter. The
+Point-prior Prompt adopts a set of learnable prompt tokens, for which we
+propose to construct a memory bank with domain-specific knowledge, and utilize
+a parameter-free attention mechanism to enhance the prompt tokens. The
+Geometry-aware Adapter aims to aggregate point cloud features within spatial
+neighborhoods to capture fine-grained geometric information through local
+interactions. Extensive experiments indicate that our Point-PEFT can achieve
+better performance than full fine-tuning on various downstream tasks, while
+using only 5% of the trainable parameters, demonstrating the efficiency and
+effectiveness of our approach. Code will be released at
+https://github.com/Even-JK/PEFT-3D.
+
+
+
 comment: 10 pages. A specialized PEFT framework for 3D pre-trained models that
 achieves performance competitive with full fine-tuning while significantly
 reducing computational resources. Project page:
 https://github.com/Even-JK/PEFT-3D
+
+
+
+
+
+ + ♻ ☆ Harnessing machine learning for accurate treatment of overlapping + opacity species in general circulation models + + +
+ To understand high-precision observations of exoplanets and brown dwarfs, we
+need detailed and complex general circulation models (GCMs) that incorporate
+hydrodynamics, chemistry, and radiation. For this study, we specifically
+examined the coupling between chemistry and radiation in GCMs and compared
+different methods for the mixing of opacities of different chemical species in
+the correlated-k assumption, when equilibrium chemistry cannot be assumed. We
+propose a fast machine learning method based on DeepSets (DS), which
+effectively combines individual correlated-k opacities (k-tables). We evaluated
+the DS method alongside other published methods such as adaptive equivalent
+extinction (AEE) and random overlap with rebinning and resorting (RORR). We
+integrated these mixing methods into our GCM (expeRT/MITgcm) and assessed their
+accuracy and performance for the example of the hot Jupiter HD 209458 b. Our
+findings indicate that the DS method is both accurate and efficient for GCM
+usage, whereas RORR is too slow. Additionally, we observed that the accuracy of
+AEE depends on its specific implementation and may introduce numerical issues
+in achieving radiative transfer solution convergence. We then applied the DS
+mixing method in a simplified chemical disequilibrium situation, where we
+modeled the rainout of TiO and VO, and confirmed that this rainout
+would hinder the formation of a stratosphere. To further expedite the
+development of consistent disequilibrium chemistry calculations in GCMs, we
+provide documentation and code for coupling the DS mixing method with
+correlated-k radiative transfer solvers. The DS method has been extensively
+tested to be accurate enough for GCMs; however, other methods might be needed
+for accelerating atmospheric retrievals.
+
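A schematic DeepSets module for the mixing step: each species' per-bin opacity vector is embedded by a shared network phi, summed so the result is permutation-invariant over species, and decoded by rho into a mixed opacity; the input layout (g-point opacities plus a mixing ratio) and all dimensions are assumptions, not the published network.

```python
import torch
import torch.nn as nn

class DeepSetMixer(nn.Module):
    """Permutation-invariant mixer: rho( sum_i phi(k_table_i) ) over chemical species."""
    def __init__(self, d_in, d_hidden=64, d_out=1):
        super().__init__()
        self.phi = nn.Sequential(nn.Linear(d_in, d_hidden), nn.ReLU(),
                                 nn.Linear(d_hidden, d_hidden))
        self.rho = nn.Sequential(nn.Linear(d_hidden, d_hidden), nn.ReLU(),
                                 nn.Linear(d_hidden, d_out))

    def forward(self, species_ktables):
        # species_ktables: (n_species, d_in) for one spectral bin, e.g. g-point opacities + mixing ratio
        return self.rho(self.phi(species_ktables).sum(dim=0))

mixer = DeepSetMixer(d_in=17)        # 16 g-points + 1 mixing ratio, as an assumed layout
species = torch.rand(5, 17)          # 5 species present in this grid cell / spectral bin
mixed_opacity = mixer(species)
```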
+
+ comment: Accepted for publication in A&A, language edited version +
+
+
+
+
+ + ♻ ☆ Towards Causal Representations of Climate Model Data + + +
+ Climate models, such as Earth system models (ESMs), are crucial for +simulating future climate change based on projected Shared Socioeconomic +Pathways (SSP) greenhouse gas emissions scenarios. While ESMs are sophisticated +and invaluable, machine learning-based emulators trained on existing simulation +data can project additional climate scenarios much faster and are +computationally efficient. However, they often lack generalizability and +interpretability. This work delves into the potential of causal representation +learning, specifically the \emph{Causal Discovery with Single-parent Decoding} +(CDSD) method, which could render climate model emulation efficient +\textit{and} interpretable. We evaluate CDSD on multiple climate datasets, +focusing on emissions, temperature, and precipitation. Our findings shed light +on the challenges, limitations, and promise of using CDSD as a stepping stone +towards more interpretable and robust climate model emulation. + +
+
+
+
+
+ + ♻ ☆ Towards Ordinal Data Science + + +
+ Order is one of the main instruments to measure the relationship between
+objects in (empirical) data. However, compared to methods that use numerical
+properties of objects, the number of ordinal methods developed is rather small.
+One reason for this is the limited availability of computational resources in
+the last century that would have been required for ordinal computations.
+Another reason -- particularly important for this line of research -- is that
+order-based methods are often seen as too mathematically rigorous to apply
+to real-world data. In this paper, we will therefore discuss different
+means for measuring and 'calculating' with ordinal structures -- a specific
+class of directed graphs -- and show how to infer knowledge from them. Our aim
+is to establish Ordinal Data Science as a fundamentally new research agenda.
+Besides cross-fertilization with other cornerstone machine learning and
+knowledge representation methods, a broad range of disciplines will benefit
+from this endeavor, including psychology, sociology, economics, web science,
+knowledge engineering, and scientometrics.
+
+
+
+ comment: 40 pages, 7 figures, Transactions on Graph Data and Knowledge (TGDK) +
+
+
+
+
+ + ♻ ☆ Is This the Subspace You Are Looking for? An Interpretability Illusion + for Subspace Activation Patching NeurIPS 2023 + + +
+ Mechanistic interpretability aims to understand model behaviors in terms of +specific, interpretable features, often hypothesized to manifest as +low-dimensional subspaces of activations. Specifically, recent studies have +explored subspace interventions (such as activation patching) as a way to +simultaneously manipulate model behavior and attribute the features behind it +to given subspaces. + In this work, we demonstrate that these two aims diverge, potentially leading +to an illusory sense of interpretability. Counterintuitively, even if a +subspace intervention makes the model's output behave as if the value of a +feature was changed, this effect may be achieved by activating a dormant +parallel pathway leveraging another subspace that is causally disconnected from +model outputs. We demonstrate this phenomenon in a distilled mathematical +example, in two real-world domains (the indirect object identification task and +factual recall), and present evidence for its prevalence in practice. In the +context of factual recall, we further show a link to rank-1 fact editing, +providing a mechanistic explanation for previous work observing an +inconsistency between fact editing performance and fact localization. + However, this does not imply that activation patching of subspaces is +intrinsically unfit for interpretability. To contextualize our findings, we +also show what a success case looks like in a task (indirect object +identification) where prior manual circuit analysis informs an understanding of +the location of a feature. We explore the additional evidence needed to argue +that a patched subspace is faithful. + +
+
+ comment: NeurIPS 2023 Workshop on Attributing Model Behavior at Scale +
+
+
+
+
+ + ♻ ☆ Constrained Parameter Regularization + + +
+ Regularization is a critical component in deep learning training, with weight
+decay being a commonly used approach. It applies a constant penalty coefficient
+uniformly across all parameters. This may be unnecessarily restrictive for some
+parameters, while insufficiently restricting others. To dynamically adjust
+penalty coefficients for different parameter groups, we present constrained
+parameter regularization (CPR) as an alternative to traditional weight decay.
+Instead of applying a single constant penalty to all parameters, we enforce an
+upper bound on a statistical measure (e.g., the L$_2$-norm) of parameter
+groups. Consequently, learning becomes a constrained optimization problem, which
+we address with an adaptation of the augmented Lagrangian method. CPR only
+requires two hyperparameters and incurs no measurable runtime overhead.
+Additionally, we propose a simple but efficient mechanism to adapt the upper
+bounds during optimization. We provide empirical evidence of CPR's efficacy
+in experiments on the "grokking" phenomenon, computer vision, and language
+modeling tasks. Our results demonstrate that CPR counteracts the effects of
+grokking and consistently matches or outperforms traditional weight decay.
+
+
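+ The constraint idea can be illustrated with the toy snippet below: each
+ parameter group carries a Lagrange-multiplier-like coefficient that grows only
+ while the group's squared L$_2$-norm exceeds an upper bound. This is a rough
+ sketch of the mechanism, with made-up hyperparameters, not the paper's exact
+ update rules.
+
+ import torch
+
+ def cpr_penalty(groups, lambdas, kappa=1.0, lam_lr=0.1):
+     """groups: list of parameter lists; lambdas: one multiplier per group."""
+     penalty = 0.0
+     for i, params in enumerate(groups):
+         sq_norm = sum((p ** 2).sum() for p in params)
+         violation = sq_norm - kappa
+         # dual ascent on the multiplier, clipped at zero (inequality constraint)
+         lambdas[i] = max(0.0, lambdas[i] + lam_lr * violation.item())
+         penalty = penalty + lambdas[i] * violation
+     return penalty
+
+ # inside a training step one would use: loss = task_loss + cpr_penalty(...)
+ weights = [torch.randn(10, 10, requires_grad=True)]
+ lambdas = [0.0]
+ print(cpr_penalty([weights], lambdas))
+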
+
+
+
+
+ + ♻ ☆ Nash Learning from Human Feedback + + +
+ Reinforcement learning from human feedback (RLHF) has emerged as the main +paradigm for aligning large language models (LLMs) with human preferences. +Typically, RLHF involves the initial step of learning a reward model from human +feedback, often expressed as preferences between pairs of text generations +produced by a pre-trained LLM. Subsequently, the LLM's policy is fine-tuned by +optimizing it to maximize the reward model through a reinforcement learning +algorithm. However, an inherent limitation of current reward models is their +inability to fully represent the richness of human preferences and their +dependency on the sampling distribution. + In this study, we introduce an alternative pipeline for the fine-tuning of +LLMs using pairwise human feedback. Our approach entails the initial learning +of a preference model, which is conditioned on two inputs given a prompt, +followed by the pursuit of a policy that consistently generates responses +preferred over those generated by any competing policy, thus defining the Nash +equilibrium of this preference model. We term this approach Nash learning from +human feedback (NLHF). + In the context of a tabular policy representation, we present a novel +algorithmic solution, Nash-MD, founded on the principles of mirror descent. +This algorithm produces a sequence of policies, with the last iteration +converging to the regularized Nash equilibrium. Additionally, we explore +parametric representations of policies and introduce gradient descent +algorithms for deep-learning architectures. To demonstrate the effectiveness of +our approach, we present experimental results involving the fine-tuning of a +LLM for a text summarization task. We believe NLHF offers a compelling avenue +for preference learning and policy optimization with the potential of advancing +the field of aligning LLMs with human preferences. + +
+
+
+
+
+ + ♻ ☆ Memory-free Online Change-point Detection: A Novel Neural Network + Approach + + +
+ Change-point detection (CPD), which detects abrupt changes in the data +distribution, is recognized as one of the most significant tasks in time series +analysis. Despite the extensive literature on offline CPD, unsupervised online +CPD still suffers from major challenges, including scalability, hyperparameter +tuning, and learning constraints. To mitigate some of these challenges, in this +paper, we propose a novel deep learning approach for unsupervised online CPD +from multi-dimensional time series, named Adaptive LSTM-Autoencoder +Change-Point Detection (ALACPD). ALACPD exploits an LSTM-autoencoder-based +neural network to perform unsupervised online CPD. It continuously adapts to +the incoming samples without keeping the previously received input, thus being +memory-free. We perform an extensive evaluation on several real-world time +series CPD benchmarks. We show that ALACPD, on average, ranks first among +state-of-the-art CPD algorithms in terms of quality of the time series +segmentation, and it is on par with the best performer in terms of the accuracy +of the estimated change-points. The implementation of ALACPD is available +online on Github\footnote{\url{https://github.com/zahraatashgahi/ALACPD}}. + +
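+ The memory-free flavour of online change-point detection can be conveyed with
+ the much simpler stand-in below: only running statistics of a one-step
+ prediction error are kept, nothing older is stored, and a change is flagged
+ when the error is unusually large. This is an illustrative toy, not ALACPD's
+ LSTM-autoencoder.
+
+ import numpy as np
+
+ class OnlineCPD:
+     def __init__(self, alpha=0.01, threshold=4.0):
+         self.mean, self.var, self.prev = 0.0, 1.0, None
+         self.alpha, self.threshold = alpha, threshold
+
+     def update(self, x: float) -> bool:
+         if self.prev is None:
+             self.prev = x
+             return False
+         err = abs(x - self.prev)          # naive one-step "reconstruction" error
+         z = (err - self.mean) / (self.var ** 0.5 + 1e-8)
+         # exponential running statistics: no past samples are retained
+         self.mean = (1 - self.alpha) * self.mean + self.alpha * err
+         self.var = (1 - self.alpha) * self.var + self.alpha * (err - self.mean) ** 2
+         self.prev = x
+         return z > self.threshold         # change-point flag
+
+ series = np.concatenate([np.random.normal(0, 1, 500), np.random.normal(8, 1, 500)])
+ detector = OnlineCPD()
+ flags = [t for t, x in enumerate(series) if detector.update(float(x))]
+ print(flags[:5])   # the true change is at t = 500
+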
+
+
+
+
+ + ♻ ☆ Provably Accelerated Decentralized Gradient Method Over Unbalanced + Directed Graphs + + +
+ We consider the decentralized optimization problem, where a network of $n$ +agents aims to collaboratively minimize the average of their individual smooth +and convex objective functions through peer-to-peer communication in a directed +graph. To tackle this problem, we propose two accelerated gradient tracking +methods, namely APD and APD-SC, for non-strongly convex and strongly convex +objective functions, respectively. We show that APD and APD-SC converge at the +rates $O\left(\frac{1}{k^2}\right)$ and $O\left(\left(1 - +C\sqrt{\frac{\mu}{L}}\right)^k\right)$, respectively, up to constant factors +depending only on the mixing matrix. APD and APD-SC are the first decentralized +methods over unbalanced directed graphs that achieve the same provable +acceleration as centralized methods. Numerical experiments demonstrate the +effectiveness of both methods. + +
+
+ comment: SIAM Journal on Optimization, in press +
+
+
+
+
+ + ♻ ☆ Visual Data-Type Understanding does not emerge from Scaling + Vision-Language Models + + +
+ Recent advances in the development of vision-language models (VLMs) are +yielding remarkable success in recognizing visual semantic content, including +impressive instances of compositional image understanding. Here, we introduce +the novel task of Visual Data-Type Identification, a basic perceptual skill +with implications for data curation (e.g., noisy data-removal from large +datasets, domain-specific retrieval) and autonomous vision (e.g., +distinguishing changing weather conditions from camera lens staining). We +develop two datasets consisting of animal images altered across a diverse set +of 27 visual data-types, spanning four broad categories. An extensive zero-shot +evaluation of 39 VLMs, ranging from 100M to 80B parameters, shows a nuanced +performance landscape. While VLMs are reasonably good at identifying certain +stylistic \textit{data-types}, such as cartoons and sketches, they struggle +with simpler data-types arising from basic manipulations like image rotations +or additive noise. Our findings reveal that (i) model scaling alone yields +marginal gains for contrastively-trained models like CLIP, and (ii) there is a +pronounced drop in performance for the largest auto-regressively trained VLMs +like OpenFlamingo. This finding points to a blind spot in current frontier +VLMs: they excel in recognizing semantic content but fail to acquire an +understanding of visual data-types through scaling. By analyzing the +pre-training distributions of these models and incorporating data-type +information into the captions during fine-tuning, we achieve a significant +enhancement in performance. By exploring this previously uncharted task, we aim +to set the stage for further advancing VLMs to equip them with visual data-type +understanding. Code and datasets are released at +https://github.com/bethgelab/DataTypeIdentification. + +
+
+
+
+
+ + ♻ ☆ Function-Space Optimality of Neural Architectures With Multivariate + Nonlinearities + + +
+ We investigate the function-space optimality (specifically, the Banach-space +optimality) of a large class of shallow neural architectures with multivariate +nonlinearities/activation functions. To that end, we construct a new family of +Banach spaces defined via a regularization operator, the $k$-plane transform, +and a sparsity-promoting norm. We prove a representer theorem that states that +the solution sets to learning problems posed over these Banach spaces are +completely characterized by neural architectures with multivariate +nonlinearities. These optimal architectures have skip connections and are +tightly connected to orthogonal weight normalization and multi-index models, +both of which have received recent interest in the neural network community. +Our framework is compatible with a number of classical nonlinearities including +the rectified linear unit (ReLU) activation function, the norm activation +function, and the radial basis functions found in the theory of +thin-plate/polyharmonic splines. We also show that the underlying spaces are +special instances of reproducing kernel Banach spaces and variation spaces. Our +results shed light on the regularity of functions learned by neural networks +trained on data, particularly with multivariate nonlinearities, and provide new +theoretical motivation for several architectural choices found in practice. + +
+
+
+
+
+ + ♻ ☆ DyEdgeGAT: Dynamic Edge via Graph Attention for Early Fault Detection in + IIoT Systems + + +
+ In the industrial Internet of Things, condition monitoring sensor signals +from complex systems often exhibit strong nonlinear and stochastic +spatial-temporal dynamics under varying operating conditions. Such complex +dynamics make fault detection particularly challenging. Although previously +proposed methods effectively model these dynamics, they often neglect the +dynamic evolution of relationships between sensor signals. Undetected shifts in +these relationships can potentially result in significant system failures. +Another limitation is their inability to effectively distinguish between novel +operating conditions and actual faults. To address this gap, we propose +DyEdgeGAT (Dynamic Edge via Graph Attention), a novel approach capable of +detecting various faults, especially those characterized by relationship +changes at early stages, while distinguishing faults from novel operating +conditions. DyEdgeGAT is a graph-based framework that provides a novel graph +inference scheme for multivariate time series that dynamically constructs edges +to represent and track the evolution of relationships between time series. +Additionally, it addresses a commonly overlooked aspect: the cause-and-effect +relationships within the system, such as between control inputs and +measurements. By incorporating system-independent variables as contexts of +operating conditions into node dynamics extraction, DyEdgeGAT enhances its +robustness against novel operating conditions. We rigorously evaluate +DyEdgeGAT's performance using both a synthetic dataset, designed to simulate +varying levels of fault severity and a real-world industrial-scale benchmark +containing a variety of fault types with different detection complexities. Our +findings demonstrate that DyEdgeGAT is highly effective in fault detection, +showing particular strength in early fault detection while maintaining +robustness under novel operating conditions. + +
+
+ comment: 16 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Zipformer: A faster and better encoder for automatic speech recognition + + +
+ The Conformer has become the most popular encoder model for automatic speech
+recognition (ASR). It adds convolution modules to a transformer to learn both
+local and global dependencies. In this work, we describe a faster, more
+memory-efficient, and better-performing transformer, called Zipformer. Modeling
+changes include: 1) a U-Net-like encoder structure where middle stacks operate
+at lower frame rates; 2) a reorganized block structure with more modules, within
+which we re-use attention weights for efficiency; 3) a modified form of
+LayerNorm, called BiasNorm, that allows us to retain some length information; 4) new
+activation functions, SwooshR and SwooshL, that work better than Swish. We also
+propose a new optimizer, called ScaledAdam, which scales the update by each
+tensor's current scale to keep the relative change about the same, and also
+explicitly learns the parameter scale. It achieves faster convergence and better
+performance than Adam. Extensive experiments on LibriSpeech, Aishell-1, and
+WenetSpeech datasets demonstrate the effectiveness of our proposed Zipformer
+over other state-of-the-art ASR models. Our code is publicly available at
+https://github.com/k2-fsa/icefall.
+
+
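+ The "scale the update by each tensor's current scale" idea attributed to
+ ScaledAdam above can be illustrated with the toy step below, where an
+ Adam-like direction is multiplied by the parameter tensor's RMS so that every
+ tensor changes by roughly the same relative amount. This is a hypothetical
+ sketch, not the icefall implementation.
+
+ import torch
+
+ def scaled_step(param, exp_avg, exp_avg_sq, lr=0.045, betas=(0.9, 0.98), eps=1e-8):
+     grad = param.grad
+     exp_avg.mul_(betas[0]).add_(grad, alpha=1 - betas[0])
+     exp_avg_sq.mul_(betas[1]).addcmul_(grad, grad, value=1 - betas[1])
+     step = exp_avg / (exp_avg_sq.sqrt() + eps)        # Adam-like direction
+     param_rms = param.detach().pow(2).mean().sqrt()   # current scale of this tensor
+     param.data.add_(step, alpha=-lr * float(param_rms + eps))
+
+ p = torch.nn.Parameter(torch.randn(4, 4))
+ p.grad = torch.randn(4, 4)
+ m, v = torch.zeros_like(p), torch.zeros_like(p)
+ scaled_step(p, m, v)
+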
+
+
+
+
+ + ♻ ☆ Continual Driving Policy Optimization with Closed-Loop Individualized + Curricula + + +
+ The safety of autonomous vehicles (AV) has been a long-standing top concern, +stemming from the absence of rare and safety-critical scenarios in the +long-tail naturalistic driving distribution. To tackle this challenge, a surge +of research in scenario-based autonomous driving has emerged, with a focus on +generating high-risk driving scenarios and applying them to conduct +safety-critical testing of AV models. However, limited work has been explored +on the reuse of these extensive scenarios to iteratively improve AV models. +Moreover, it remains intractable and challenging to filter through gigantic +scenario libraries collected from other AV models with distinct behaviors, +attempting to extract transferable information for current AV improvement. +Therefore, we develop a continual driving policy optimization framework +featuring Closed-Loop Individualized Curricula (CLIC), which we factorize into +a set of standardized sub-modules for flexible implementation choices: AV +Evaluation, Scenario Selection, and AV Training. CLIC frames AV Evaluation as a +collision prediction task, where it estimates the chance of AV failures in +these scenarios at each iteration. Subsequently, by re-sampling from historical +scenarios based on these failure probabilities, CLIC tailors individualized +curricula for downstream training, aligning them with the evaluated capability +of AV. Accordingly, CLIC not only maximizes the utilization of the vast +pre-collected scenario library for closed-loop driving policy optimization but +also facilitates AV improvement by individualizing its training with more +challenging cases out of those poorly organized scenarios. Experimental results +clearly indicate that CLIC surpasses other curriculum-based training +strategies, showing substantial improvement in managing risky scenarios, while +still maintaining proficiency in handling simpler cases. + +
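+ The curriculum re-sampling step described above -- drawing scenarios for the
+ next training round in proportion to the predicted chance that the current AV
+ policy fails on them -- can be sketched as follows. The names and the
+ stand-in failure-probability model are placeholders, not the CLIC codebase.
+
+ import numpy as np
+
+ def sample_curriculum(scenarios, fail_probs, batch_size, rng=None):
+     rng = rng or np.random.default_rng(0)
+     weights = np.asarray(fail_probs, dtype=float)
+     weights = weights / weights.sum()        # normalise into a sampling distribution
+     idx = rng.choice(len(scenarios), size=batch_size, replace=True, p=weights)
+     return [scenarios[i] for i in idx]
+
+ library = [f"scenario_{i}" for i in range(1000)]
+ fail_probs = np.random.beta(0.5, 5.0, size=1000)   # stand-in collision-predictor output
+ batch = sample_curriculum(library, fail_probs, batch_size=32)
+ print(batch[:3])
+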
+
+
+
+
+ + ♻ ☆ Systematic Literature Review: Quantum Machine Learning and its + applications + + +
+ Quantum computing is the process of performing calculations using quantum
+mechanics. This field studies the quantum behavior of certain subatomic
+particles for subsequent use in performing calculations, as well as for
+large-scale information processing. These capabilities can give quantum
+computers an advantage in terms of computational time and cost over classical
+computers. Nowadays, there are scientific challenges that are impossible to
+address with classical computation due to computational complexity or the time
+the calculation would take, and quantum computation is one of the possible
+answers. However, current quantum devices do not yet have the necessary number of qubits and
+are not fault-tolerant enough to achieve these goals. Nonetheless, there are
+other fields like machine learning or chemistry where quantum computation could
+be useful with current quantum devices. This manuscript aims to present a
+Systematic Literature Review of the papers published between 2017 and 2023 to
+identify, analyze and classify the different algorithms used in quantum machine
+learning and their applications. Consequently, this study identified 94
+articles that used quantum machine learning techniques and algorithms. The main
+types of algorithms found are quantum implementations of classical machine
+learning algorithms, such as support vector machines or the k-nearest neighbor
+model, and classical deep learning algorithms, like quantum neural networks.
+Many articles attempt to solve problems currently addressed by classical machine
+learning, but using quantum devices and algorithms. Even though results are
+promising, quantum machine learning is far from achieving its full potential.
+Improvements in quantum hardware are required, since existing quantum
+computers lack the quality, speed, and scale to allow quantum computing to
+achieve its full potential.
+
+
+
+ comment: 28 pages, 25 figures +
+
+
+
+
+ + ♻ ☆ A simple probabilistic neural network for machine understanding + + +
+ We discuss probabilistic neural networks with a fixed internal representation +as models for machine understanding. Here understanding is intended as mapping +data to an already existing representation which encodes an {\em a priori} +organisation of the feature space. We derive the internal representation by +requiring that it satisfies the principles of maximal relevance and of maximal +ignorance about how different features are combined. We show that, when hidden +units are binary variables, these two principles identify a unique model -- the +Hierarchical Feature Model (HFM) -- which is fully solvable and provides a +natural interpretation in terms of features. We argue that learning machines +with this architecture enjoy a number of interesting properties, like the +continuity of the representation with respect to changes in parameters and +data, the possibility to control the level of compression and the ability to +support functions that go beyond generalisation. We explore the behaviour of +the model with extensive numerical experiments and argue that models where the +internal representation is fixed reproduce a learning modality which is +qualitatively different from that of traditional models such as Restricted +Boltzmann Machines. + +
+
+ comment: 34 pages, 9 figures. Accepted in JSTAT +
+
+
+
+
+ + ♻ ☆ Time Regularization in Optimal Time Variable Learning + + +
+ Recently, optimal time variable learning in deep neural networks (DNNs) was
+introduced in arXiv:2204.08528. In this manuscript, we extend the concept by
+introducing a regularization term that directly relates to the time horizon in
+discrete dynamical systems. Furthermore, we propose an adaptive pruning
+approach for Residual Neural Networks (ResNets), which reduces network
+complexity without compromising expressiveness, while simultaneously decreasing
+training time. The results are illustrated by applying the proposed concepts to
+classification tasks on the well-known MNIST and Fashion MNIST data sets. Our
+PyTorch code is available at
+https://github.com/frederikkoehne/time_variable_learning.
+
+
+
+
+
+
+ + ♻ ☆ Coherent Soft Imitation Learning + + +
+ Imitation learning methods seek to learn from an expert either through +behavioral cloning (BC) of the policy or inverse reinforcement learning (IRL) +of the reward. Such methods enable agents to learn complex tasks from humans +that are difficult to capture with hand-designed reward functions. Choosing BC +or IRL for imitation depends on the quality and state-action coverage of the +demonstrations, as well as additional access to the Markov decision process. +Hybrid strategies that combine BC and IRL are not common, as initial policy +optimization against inaccurate rewards diminishes the benefit of pretraining +the policy with BC. This work derives an imitation method that captures the +strengths of both BC and IRL. In the entropy-regularized ('soft') reinforcement +learning setting, we show that the behaviour-cloned policy can be used as both +a shaped reward and a critic hypothesis space by inverting the regularized +policy update. This coherency facilitates fine-tuning cloned policies using the +reward estimate and additional interactions with the environment. This approach +conveniently achieves imitation learning through initial behaviour cloning, +followed by refinement via RL with online or offline data sources. The +simplicity of the approach enables graceful scaling to high-dimensional and +vision-based tasks, with stable learning and minimal hyperparameter tuning, in +contrast to adversarial approaches. For the open-source implementation and +simulation results, see https://joemwatson.github.io/csil/. + +
+
+ comment: 51 pages, 49 figures. DeepMind internship report. Accepted as a + spotlight paper at Advances in Neural Information Processing Systems 2023 +
+
+
+
+
+ + ♻ ☆ When accurate prediction models yield harmful self-fulfilling prophecies ML4H 2023 + + +
+ Prediction models are popular in medical research and practice. By predicting
+an outcome of interest for specific patients, these models may help inform
+difficult treatment decisions, and are often hailed as the poster children for
+personalized, data-driven healthcare.
+ We show, however, that using prediction models for decision making can lead to
+harmful decisions, even when the predictions exhibit good discrimination after
+deployment. These models are harmful self-fulfilling prophecies: their
+deployment harms a group of patients, but the worse outcome of these patients
+does not invalidate the predictive power of the model. Our main result is a
+formal characterization of a set of such prediction models. Next, we show that
+models that are well calibrated before and after deployment are useless for
+decision making as they make no change in the data distribution. These results
+point to the need to revise standard practices for validation, deployment and
+evaluation of prediction models that are used in medical decisions.
+
+
+
+ comment: ML4H 2023 Findings Track +
+
+
+
+
+ + ♻ ☆ SwiftSage: A Generative Agent with Fast and Slow Thinking for Complex + Interactive Tasks NeurIPS 2023 + + +
+ We introduce SwiftSage, a novel agent framework inspired by the dual-process +theory of human cognition, designed to excel in action planning for complex +interactive reasoning tasks. SwiftSage integrates the strengths of behavior +cloning and prompting large language models (LLMs) to enhance task completion +performance. The framework comprises two primary modules: the Swift module, +representing fast and intuitive thinking, and the Sage module, emulating +deliberate thought processes. The Swift module is a small encoder-decoder LM +fine-tuned on the oracle agent's action trajectories, while the Sage module +employs LLMs such as GPT-4 for subgoal planning and grounding. We develop a +heuristic method to harmoniously integrate the two modules, resulting in a more +efficient and robust problem-solving process. In 30 tasks from the ScienceWorld +benchmark, SwiftSage significantly outperforms other methods such as SayCan, +ReAct, and Reflexion, demonstrating its effectiveness in solving complex +interactive tasks. + +
+
+ comment: Accepted to NeurIPS 2023 (spotlight). Project website: + https://swiftsage.github.io +
+
+
+
+
+ + ♻ ☆ Towards early diagnosis of Alzheimer's disease: Advances in + immune-related blood biomarkers and computational modeling approaches + + +
+ Alzheimer's disease has an increasing prevalence in the population +world-wide, yet current diagnostic methods based on recommended biomarkers are +only available in specialized clinics. Due to these circumstances, Alzheimer's +disease is usually diagnosed late, which contrasts with the currently available +treatment options that are only effective for patients at an early stage. +Blood-based biomarkers could fill in the gap of easily accessible and low-cost +methods for early diagnosis of the disease. In particular, immune-based +blood-biomarkers might be a promising option, given the recently discovered +cross-talk of immune cells of the central nervous system with those in the +peripheral immune system. With the help of machine learning algorithms and +mechanistic modeling approaches, such as agent-based modeling, an in-depth +analysis of the simulation of cell dynamics is possible as well as of +high-dimensional omics resources indicative of pathway signaling changes. Here, +we give a background on advances in research on brain-immune system cross-talk +in Alzheimer's disease and review recent machine learning and mechanistic +modeling approaches which leverage modern omics technologies for blood-based +immune system-related biomarker discovery. + +
+
+
+
+
+ + ♻ ☆ Publicly available datasets of breast histopathology H&E whole-slide + images: A scoping review + + +
+ Advancements in digital pathology and computing resources have made a +significant impact in the field of computational pathology for breast cancer +diagnosis and treatment. However, access to high-quality labeled +histopathological images of breast cancer is a big challenge that limits the +development of accurate and robust deep learning models. In this scoping +review, we identified the publicly available datasets of breast H&E stained +whole-slide images (WSI) that can be used to develop deep learning algorithms. +We systematically searched nine scientific literature databases and nine +research data repositories and found 17 publicly available datasets containing +10385 H&E WSIs of breast cancer. Moreover, we reported image metadata and +characteristics for each dataset to assist researchers in selecting proper +datasets for specific tasks in breast cancer computational pathology. In +addition, we compiled two lists of breast H&E patches and private datasets as +supplementary resources for researchers. Notably, only 28% of the included +articles utilized multiple datasets, and only 14% used an external validation +set, suggesting that the performance of other developed models may be +susceptible to overestimation. The TCGA-BRCA was used in 52% of the selected +studies. This dataset has a considerable selection bias that can impact the +robustness and generalizability of the trained algorithms. There is also a lack +of consistent metadata reporting of breast WSI datasets that can be an issue in +developing accurate deep learning models, indicating the necessity of +establishing explicit guidelines for documenting breast WSI dataset +characteristics and metadata. + +
+
+ comment: 27 pages (including references), 8 figures, 3 tables, 5 supporting + information materials +
+
+
+
+
+ + ♻ ☆ Estimates on the generalization error of Physics Informed Neural + Networks (PINNs) for approximating PDEs + + +
+ Physics informed neural networks (PINNs) have recently been widely used for +robust and accurate approximation of PDEs. We provide rigorous upper bounds on +the generalization error of PINNs approximating solutions of the forward +problem for PDEs. An abstract formalism is introduced and stability properties +of the underlying PDE are leveraged to derive an estimate for the +generalization error in terms of the training error and number of training +samples. This abstract framework is illustrated with several examples of +nonlinear PDEs. Numerical experiments, validating the proposed theory, are also +presented. + +
+
+
+
+
+ + ♻ ☆ Physics Informed Neural Networks for Simulating Radiative Transfer + + +
+ We propose a novel machine learning algorithm for simulating radiative
+transfer. Our algorithm is based on physics informed neural networks (PINNs),
+which are trained by minimizing the residual of the underlying radiative
+transfer equations. We present extensive experiments and theoretical error
+estimates to demonstrate that PINNs provide a very easy-to-implement, fast,
+robust, and accurate method for simulating radiative transfer. We also present a
+PINN-based algorithm for simulating inverse problems for radiative transfer
+efficiently.
+
+
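+ The residual-minimisation idea behind PINNs can be shown on a toy 1-D problem
+ (du/dx + u = 1 with u(0) = 0, whose solution is u(x) = 1 - exp(-x)); the loss
+ is the squared equation residual at random collocation points plus the
+ boundary mismatch. This is a generic illustration, not the radiative transfer
+ solver of the paper.
+
+ import torch
+ import torch.nn as nn
+
+ net = nn.Sequential(nn.Linear(1, 32), nn.Tanh(),
+                     nn.Linear(32, 32), nn.Tanh(),
+                     nn.Linear(32, 1))
+ opt = torch.optim.Adam(net.parameters(), lr=1e-3)
+
+ for it in range(2000):
+     x = torch.rand(128, 1, requires_grad=True)
+     u = net(x)
+     du = torch.autograd.grad(u, x, torch.ones_like(u), create_graph=True)[0]
+     residual = du + u - 1.0                   # equation residual
+     bc = net(torch.zeros(1, 1))               # boundary condition u(0) = 0
+     loss = (residual ** 2).mean() + (bc ** 2).mean()
+     opt.zero_grad()
+     loss.backward()
+     opt.step()
+
+ print(float(net(torch.tensor([[1.0]]))))      # approaches 1 - exp(-1), about 0.632
+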
+
+
+
+
+ + ♻ ☆ Estimates on the generalization error of Physics Informed Neural + Networks (PINNs) for approximating a class of inverse problems for PDEs + + +
+ Physics informed neural networks (PINNs) have recently been very successfully +applied for efficiently approximating inverse problems for PDEs. We focus on a +particular class of inverse problems, the so-called data assimilation or unique +continuation problems, and prove rigorous estimates on the generalization error +of PINNs approximating them. An abstract framework is presented and conditional +stability estimates for the underlying inverse problem are employed to derive +the estimate on the PINN generalization error, providing rigorous justification +for the use of PINNs in this context. The abstract framework is illustrated +with examples of four prototypical linear PDEs. Numerical experiments, +validating the proposed theory, are also presented. + +
+
+
+
+
+ + ♻ ☆ A General Framework for Sequential Decision-Making under Adaptivity + Constraints + + +
+ We take the first step in studying general sequential decision-making under +two adaptivity constraints: rare policy switch and batch learning. First, we +provide a general class called the Eluder Condition class, which includes a +wide range of reinforcement learning classes. Then, for the rare policy switch +constraint, we provide a generic algorithm to achieve a +$\widetilde{\mathcal{O}}(\log K) $ switching cost with a +$\widetilde{\mathcal{O}}(\sqrt{K})$ regret on the EC class. For the batch +learning constraint, we provide an algorithm that provides a +$\widetilde{\mathcal{O}}(\sqrt{K}+K/B)$ regret with the number of batches $B.$ +This paper is the first work considering rare policy switch and batch learning +under general function classes, which covers nearly all the models studied in +the previous works such as tabular MDP (Bai et al. 2019; Zhang et al. 2020), +linear MDP (Wang et al. 2021; Gao et al. 2021), low eluder dimension MDP (Kong +et al. 2021; Gao et al. 2021), generalized linear function approximation (Qiao +et al. 2023), and also some new classes such as the low $D_\Delta$-type Bellman +eluder dimension problem, linear mixture MDP, kernelized nonlinear regulator +and undercomplete partially observed Markov decision process (POMDP). + +
+
+ comment: 48 pages +
+
+
+
+
+ + ♻ ☆ All the World's a (Hyper)Graph: A Data Drama + + +
+ We introduce Hyperbard, a dataset of diverse relational data representations +derived from Shakespeare's plays. Our representations range from simple graphs +capturing character co-occurrence in single scenes to hypergraphs encoding +complex communication settings and character contributions as hyperedges with +edge-specific node weights. By making multiple intuitive representations +readily available for experimentation, we facilitate rigorous representation +robustness checks in graph learning, graph mining, and network analysis, +highlighting the advantages and drawbacks of specific representations. +Leveraging the data released in Hyperbard, we demonstrate that many solutions +to popular graph mining problems are highly dependent on the representation +choice, thus calling current graph curation practices into question. As an +homage to our data source, and asserting that science can also be art, we +present all our points in the form of a play. + +
+
+ comment: This is the full version of our paper; an abridged version appears in + Digital Scholarship in the Humanities. Landing page for code and data: + https://hyperbard.net/ +
+
+
+
+
+ + ♻ ☆ Exploring the flavor structure of quarks and leptons with reinforcement + learning + + +
+ We propose a method to explore the flavor structure of quarks and leptons
+with reinforcement learning. As a concrete model, we utilize a basic
+value-based algorithm for models with $U(1)$ flavor symmetry. By training
+neural networks on the $U(1)$ charges of quarks and leptons, the agent finds 21
+models to be consistent with experimentally measured masses and mixing angles
+of quarks and leptons. In particular, an intrinsic value of normal ordering
+tends to be larger than that of inverted ordering, and the normal ordering is
+well fitted to the current experimental data, in contrast to the inverted
+ordering. A specific value of the effective mass for neutrinoless double beta
+decay and a sizable leptonic CP violation induced by an angular component of
+the flavon field are predicted by the autonomous behavior of the agent. Our
+findings indicate that reinforcement learning can be a new method for
+understanding the flavor structure.
+
+
+
+ comment: 45 pages, 15 figures, v2: published version +
+
+
+
+
+ + ♻ ☆ Neuroevolution of Physics-Informed Neural Nets: Benchmark Problems and + Comparative Results + + +
+ The potential of learned models for fundamental scientific research and
+discovery is drawing increasing attention worldwide. Physics-informed neural
+networks (PINNs), where the loss function directly embeds governing equations
+of scientific phenomena, are among the key techniques at the forefront of
+recent advances. PINNs are typically trained using stochastic gradient descent
+methods, akin to their deep learning counterparts. However, analysis in this
+paper shows that PINNs' unique loss formulations lead to a high degree of
+complexity and ruggedness that may not be conducive to gradient descent.
+Unlike in standard deep learning, PINN training requires globally optimum
+parameter values that satisfy physical laws as closely as possible. Spurious
+local optima, indicative of erroneous physics, must be avoided. Hence,
+neuroevolution algorithms, with their superior global search capacity, may be a
+better choice for PINNs relative to gradient descent methods. Here, we propose
+a set of five benchmark problems, with open-source codes, spanning diverse
+physical phenomena for novel neuroevolution algorithm development. Using this,
+we compare two neuroevolution algorithms against the commonly used stochastic
+gradient descent, and our baseline results support the claim that
+neuroevolution can surpass gradient descent, ensuring better physics compliance
+in the predicted outputs.
+
+
+
+ comment: 11 pages, 6 figures, 4 tables +
+
+
+
+
+ + ♻ ☆ $\textbf{A}^2\textbf{CiD}^2$: Accelerating Asynchronous Communication in + Decentralized Deep Learning + + +
+ Distributed training of Deep Learning models has been critical to many recent
+successes in the field. Current standard methods primarily rely on synchronous
+centralized algorithms which induce major communication bottlenecks and
+synchronization locks at scale. Decentralized asynchronous algorithms are
+emerging as a potential alternative, but their practical applicability still
+lags. In order to mitigate the increase in communication cost that naturally
+comes with scaling the number of workers, we introduce a principled
+asynchronous, randomized, gossip-based optimization algorithm which works
+thanks to a continuous local momentum named $\textbf{A}^2\textbf{CiD}^2$. Our
+method allows each worker to continuously process mini-batches without
+stopping, and run a peer-to-peer averaging routine in parallel, reducing idle
+time. In addition to inducing a significant communication acceleration at no
+cost other than adding a local momentum variable, minimal adaptation is
+required to incorporate $\textbf{A}^2\textbf{CiD}^2$ into standard asynchronous
+approaches. Our theoretical analysis proves accelerated rates compared to
+previous asynchronous decentralized baselines, and we empirically show that
+using our $\textbf{A}^2\textbf{CiD}^2$ momentum significantly decreases
+communication costs in poorly connected networks. In particular, we show
+consistent improvement on the ImageNet dataset using up to 64 asynchronous
+workers (A100 GPUs) and various communication network topologies.
+
+
+
+
+
+
+ + ♻ ☆ From External to Swap Regret 2.0: An Efficient Reduction and Oblivious + Adversary for Large Action Spaces + + +
+ We provide a novel reduction from swap-regret minimization to external-regret
+minimization, which improves upon the classical reductions of Blum-Mansour
+[BM07] and Stolz-Lugosi [SL05] in that it does not require finiteness of the
+space of actions. We show that, whenever there exists a no-external-regret
+algorithm for some hypothesis class, there must also exist a no-swap-regret
+algorithm for that same class. For the problem of learning with expert advice,
+our result implies that it is possible to guarantee that the swap regret is
+bounded by $\epsilon$ after $\log(N)^{O(1/\epsilon)}$ rounds and with $O(N)$
+per-iteration complexity, where $N$ is the number of experts, while the
+classical reductions of Blum-Mansour and Stolz-Lugosi require $O(N/\epsilon^2)$
+rounds and at least $\Omega(N^2)$ per-iteration complexity. Our result comes
+with an associated lower bound, which -- in contrast to that in [BM07] -- holds
+for oblivious and $\ell_1$-constrained adversaries and learners that can employ
+distributions over experts, showing that the number of rounds must be
+$\tilde\Omega(N/\epsilon^2)$ or exponential in $1/\epsilon$.
+ Our reduction implies that, if no-regret learning is possible in some game,
+then this game must have approximate correlated equilibria, of arbitrarily good
+approximation. This strengthens the folklore implication of no-regret learning
+that approximate coarse correlated equilibria exist. Importantly, it provides a
+sufficient condition for the existence of correlated equilibrium which vastly
+extends the requirement that the action set is finite, thus answering a
+question left open by [DG22; Ass+23]. Moreover, it answers several outstanding
+questions about equilibrium computation and learning in games.
+
+
+
+
+
+
+ + ♻ ☆ Optimal Variable Clustering for High-Dimensional Matrix Valued Data + + +
+ Matrix valued data has become increasingly prevalent in many applications. +Most of the existing clustering methods for this type of data are tailored to +the mean model and do not account for the dependence structure of the features, +which can be very informative, especially in high-dimensional settings or when +mean information is not available. To extract the information from the +dependence structure for clustering, we propose a new latent variable model for +the features arranged in matrix form, with some unknown membership matrices +representing the clusters for the rows and columns. Under this model, we +further propose a class of hierarchical clustering algorithms using the +difference of a weighted covariance matrix as the dissimilarity measure. +Theoretically, we show that under mild conditions, our algorithm attains +clustering consistency in the high-dimensional setting. While this consistency +result holds for our algorithm with a broad class of weighted covariance +matrices, the conditions for this result depend on the choice of the weight. To +investigate how the weight affects the theoretical performance of our +algorithm, we establish the minimax lower bound for clustering under our latent +variable model in terms of some cluster separation metric. Given these results, +we identify the optimal weight in the sense that using this weight guarantees +our algorithm to be minimax rate-optimal. The practical implementation of our +algorithm with the optimal weight is also discussed. Simulation studies show +that our algorithm performs better than existing methods in terms of the +adjusted Rand index (ARI). The method is applied to a genomic dataset and +yields meaningful interpretations. + +
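+ The use of a dependence-based dissimilarity for hierarchical clustering of
+ features can be sketched as below; the simple correlation-distance weight is a
+ placeholder, not the minimax-optimal weight derived in the paper.
+
+ import numpy as np
+ from scipy.cluster.hierarchy import linkage, fcluster
+ from scipy.spatial.distance import squareform
+
+ rng = np.random.default_rng(0)
+ X = rng.normal(size=(200, 12))       # 200 samples, 12 features
+ X[:, 6:] += X[:, :6]                 # induce dependence between feature pairs (i, i+6)
+
+ corr = np.corrcoef(X, rowvar=False)
+ dist = 1.0 - np.abs(corr)            # dissimilarity from the dependence structure
+ np.fill_diagonal(dist, 0.0)
+ Z = linkage(squareform(dist, checks=False), method="average")
+ labels = fcluster(Z, t=6, criterion="maxclust")
+ print(labels)                        # each cluster pairs feature i with its dependent copy i+6
+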
+
+
+
+
+ + ♻ ☆ A Simple and Scalable Graph Neural Network for Large Directed Graphs + + +
+ Node classification is one of the hottest tasks in graph analysis. Though +existing studies have explored various node representations in directed and +undirected graphs, they have overlooked the distinctions of their capabilities +to capture the information of graphs. To tackle the limitation, we investigate +various combinations of node representations (aggregated features vs. adjacency +lists) and edge direction awareness within an input graph (directed vs. +undirected). We address the first empirical study to benchmark the performance +of various GNNs that use either combination of node representations and edge +direction awareness. Our experiments demonstrate that no single combination +stably achieves state-of-the-art results across datasets, which indicates that +we need to select appropriate combinations depending on the dataset +characteristics. In response, we propose a simple yet holistic classification +method A2DUG which leverages all combinations of node representations in +directed and undirected graphs. We demonstrate that A2DUG stably performs well +on various datasets and improves the accuracy up to 11.29 compared with the +state-of-the-art methods. To spur the development of new methods, we publicly +release our complete codebase under the MIT license. + +
+
+
+
+
+ + ♻ ☆ CD-GraB: Coordinating Distributed Example Orders for Provably + Accelerated Training NeurIPS 2023 + + +
+ Recent research on online Gradient Balancing (GraB) has revealed that there +exist permutation-based example orderings for SGD that are guaranteed to +outperform random reshuffling (RR). Whereas RR arbitrarily permutes training +examples, GraB leverages stale gradients from prior epochs to order examples -- +achieving a provably faster convergence rate than RR. However, GraB is limited +by design: while it demonstrates an impressive ability to scale-up training on +centralized data, it does not naturally extend to modern distributed ML +workloads. We therefore propose Coordinated Distributed GraB (CD-GraB), which +uses insights from prior work on kernel thinning to translate the benefits of +provably faster permutation-based example ordering to distributed settings. +With negligible overhead, CD-GraB exhibits a linear speedup in convergence rate +over centralized GraB and outperforms distributed RR on a variety of benchmark +tasks. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Towards Transferable Multi-modal Perception Representation Learning for + Autonomy: NeRF-Supervised Masked AutoEncoder + + +
+ This work proposes a unified self-supervised pre-training framework for +transferable multi-modal perception representation learning via masked +multi-modal reconstruction in Neural Radiance Field (NeRF), namely +NeRF-Supervised Masked AutoEncoder (NS-MAE). Specifically, conditioned on +certain view directions and locations, multi-modal embeddings extracted from +corrupted multi-modal input signals, i.e., Lidar point clouds and images, are +rendered into projected multi-modal feature maps via neural rendering. Then, +original multi-modal signals serve as reconstruction targets for the rendered +multi-modal feature maps to enable self-supervised representation learning. +Extensive experiments show that the representation learned via NS-MAE shows +promising transferability for diverse multi-modal and single-modal (camera-only +and Lidar-only) perception models on diverse 3D perception downstream tasks (3D +object detection and BEV map segmentation) with diverse amounts of fine-tuning +labeled data. Moreover, we empirically find that NS-MAE enjoys the synergy of +both the mechanism of masked autoencoder and neural radiance field. We hope +this study can inspire exploration of more general multi-modal representation +learning for autonomous agents. + +
+
+
+
+
+ + ♻ ☆ TPPoet: Transformer-Based Persian Poem Generation using Minimal Data and + Advanced Decoding Techniques + + +
+ Recent advances in language models (LMs) have demonstrated significant
+efficacy in tasks related to the arts and humanities. While LMs have exhibited
+exceptional performance across a wide range of natural language processing
+tasks, there are notable challenges associated with their utilization on small
+datasets and their ability to replicate more creative human capacities. In this
+study, we aim to address these challenges by training a Persian classical
+poetry generation model using a transformer architecture on a specialized
+dataset with no pretraining. Additionally, we propose a novel decoding method
+to enhance coherence and meaningfulness in the generated poetry, effectively
+managing the tradeoff between diversity and quality. Furthermore, the results
+of our training approach and the proposed decoding method are evaluated through
+a comprehensive set of automatic and human evaluations, which show their superior
+capability to generate coherent and meaningful poetry compared to other
+decoding methods and an existing Persian large language model (LLM).
+
+
+
+
+
+
+ + ♻ ☆ The SVHN Dataset Is Deceptive for Probabilistic Generative Models Due to + a Distribution Mismatch NeurIPS 2023 + + +
+ The Street View House Numbers (SVHN) dataset is a popular benchmark dataset
+in deep learning. Originally designed for digit classification tasks, the SVHN
+dataset has been widely used as a benchmark for various other tasks including
+generative modeling. However, with this work, we aim to warn the community
+about an issue of the SVHN dataset as a benchmark for generative modeling
+tasks: we discover that the official training and test sets of
+the SVHN dataset are not drawn from the same distribution. We empirically show
+that this distribution mismatch has little impact on the classification task
+(which may explain why this issue has not been detected before), but it
+severely affects the evaluation of probabilistic generative models, such as
+Variational Autoencoders and diffusion models. As a workaround, we propose to
+mix and re-split the official training and test sets when SVHN is used for tasks
+other than classification. We publish a new split and the indices we used to
+create it at https://jzenn.github.io/svhn-remix/.
+
+
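+ The proposed workaround amounts to pooling the official splits and drawing a
+ fresh random split, roughly as below; the 90/10 ratio and seed are assumptions
+ here, whereas the exact indices of the published re-split are available at the
+ project page linked above.
+
+ import torch
+ from torch.utils.data import ConcatDataset, random_split
+ from torchvision.datasets import SVHN
+
+ train = SVHN(root="data", split="train", download=True)
+ test = SVHN(root="data", split="test", download=True)
+ pooled = ConcatDataset([train, test])
+
+ n_test = len(pooled) // 10
+ new_train, new_test = random_split(
+     pooled, [len(pooled) - n_test, n_test],
+     generator=torch.Generator().manual_seed(0))
+ print(len(new_train), len(new_test))
+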
+
+ comment: Accepted at NeurIPS 2023 Workshop on Distribution Shifts; 4 pages + + appendix; proposed data set at https://jzenn.github.io/svhn-remix/ +
+
+
+
+
+ + ♻ ☆ A Kernel-Based Neural Network Test for High-dimensional Sequencing Data + Analysis + + +
+ The recent development of artificial intelligence (AI) technology, especially +the advance of deep neural network (DNN) technology, has revolutionized many +fields. While DNN plays a central role in modern AI technology, it has been +rarely used in sequencing data analysis due to challenges brought by +high-dimensional sequencing data (e.g., overfitting). Moreover, due to the +complexity of neural networks and their unknown limiting distributions, +building association tests on neural networks for genetic association analysis +remains a great challenge. To address these challenges and fill the important +gap of using AI in high-dimensional sequencing data analysis, we introduce a +new kernel-based neural network (KNN) test for complex association analysis of +sequencing data. The test is built on our previously developed KNN framework, +which uses random effects to model the overall effects of high-dimensional +genetic data and adopts kernel-based neural network structures to model complex +genotype-phenotype relationships. Based on KNN, a Wald-type test is then +introduced to evaluate the joint association of high-dimensional genetic data +with a disease phenotype of interest, considering non-linear and non-additive +effects (e.g., interaction effects). Through simulations, we demonstrated that +our proposed method attained higher power compared to the sequence kernel +association test (SKAT), especially in the presence of non-linear and +interaction effects. Finally, we apply the methods to the whole genome +sequencing (WGS) dataset from the Alzheimer's Disease Neuroimaging Initiative +(ADNI) study, investigating new genes associated with the hippocampal volume +change over time. + +
+
+ comment: 31 pages, 5 figures and 3 tables
+
+
+
+
+
+ + ♻ ☆ AlpacaFarm: A Simulation Framework for Methods that Learn from Human + Feedback + + +
+ Large language models (LLMs) such as ChatGPT have seen widespread adoption +due to their ability to follow user instructions well. Developing these LLMs +involves a complex yet poorly understood workflow requiring training with human +feedback. Replicating and understanding this instruction-following process +faces three major challenges: the high cost of data collection, the lack of +trustworthy evaluation, and the absence of reference method implementations. We +address these challenges with AlpacaFarm, a simulator that enables research and +development for learning from feedback at a low cost. First, we design LLM +prompts to simulate human feedback that are 45x cheaper than crowdworkers and +display high agreement with humans. Second, we propose an automatic evaluation +and validate it against human instructions obtained on real-world interactions. +Third, we contribute reference implementations for several methods (PPO, DPO, +best-of-n, expert iteration, and more) that learn from pairwise feedback. +Finally, as an end-to-end validation of AlpacaFarm, we train and evaluate +eleven models on 10k pairs of real human feedback and show that rankings of +models trained in AlpacaFarm match rankings of models trained on human data. As +a demonstration of the research possible in AlpacaFarm, we find that methods +that use a reward model can substantially improve over supervised fine-tuning +and that our reference PPO implementation leads to a +10% improvement in +win-rate against Davinci003. We release all components of AlpacaFarm at +https://github.com/tatsu-lab/alpaca_farm. + +
+
+
+
+
+ + ♻ ☆ Foundational propositions of hesitant fuzzy sets and parameter + reductions of hesitant fuzzy information systems + + +
+ Hesitant fuzzy sets are widely used in instances of uncertainty and
+hesitation. The inclusion relationship is an important and foundational
+definition for sets. The hesitant fuzzy set, as a kind of set, needs an explicit
+definition of the inclusion relationship. Based on the discrete form of the
+hesitant fuzzy membership degree, several kinds of inclusion relationships for
+hesitant fuzzy sets are proposed. Then, some foundational propositions of hesitant
+fuzzy sets and the families of hesitant fuzzy sets are presented. Finally, some
+foundational propositions of hesitant fuzzy information systems with respect to
+parameter reductions are put forward, and an example and an algorithm are given
+to illustrate the processes of parameter reductions.
+
+
+
+ comment: 22 pages +
+
+
+
+
+ + ♻ ☆ In-Context Learning for Text Classification with Many Labels + + +
+ In-context learning (ICL) using large language models for tasks with many
+labels is challenging due to the limited context window, which makes it
+difficult to fit a sufficient number of examples in the prompt. In this paper,
+we use a pre-trained dense retrieval model to bypass this limitation, giving
+the model only a partial view of the full label space for each inference call.
+Testing with recent open-source LLMs (OPT, LLaMA), we set new state-of-the-art
+performance in few-shot settings for three common intent classification
+datasets, with no fine-tuning. We also surpass fine-tuned performance on
+fine-grained sentiment classification in certain cases. We analyze the
+performance across the number of in-context examples and different model scales,
+showing that larger models are necessary to effectively and consistently make
+use of larger context lengths for ICL. By running several ablations, we analyze
+the model's use of: a) the similarity of the in-context examples to the current
+input, b) the semantic content of the class names, and c) the correct
+correspondence between examples and labels. We demonstrate that all three are
+needed to varying degrees depending on the domain, contrary to certain recent
+works.
+
+
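+ The retrieval step described above -- exposing only a partial view of the
+ label space per query -- can be sketched as follows. The encoder choice and
+ prompt template are placeholder assumptions, not necessarily those used in
+ the paper.
+
+ import numpy as np
+ from sentence_transformers import SentenceTransformer
+
+ encoder = SentenceTransformer("all-MiniLM-L6-v2")
+ train_texts = ["book a flight", "play some jazz", "what is the weather"]
+ train_labels = ["flight_booking", "play_music", "weather_query"]
+ train_emb = encoder.encode(train_texts, normalize_embeddings=True)
+
+ def build_prompt(query: str, k: int = 2) -> str:
+     q = encoder.encode([query], normalize_embeddings=True)[0]
+     top = np.argsort(train_emb @ q)[::-1][:k]    # cosine similarity on normalized vectors
+     demos = "\n".join(f"Input: {train_texts[i]}\nLabel: {train_labels[i]}" for i in top)
+     return f"{demos}\nInput: {query}\nLabel:"
+
+ print(build_prompt("reserve a plane ticket"))
+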
+
+ comment: 12 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ PEFA: Parameter-Free Adapters for Large-scale Embedding-based Retrieval + Models WSDM 2024 + + +
+ Embedding-based Retrieval Models (ERMs) have emerged as a promising framework
+for large-scale text retrieval problems due to powerful large language models.
+Nevertheless, fine-tuning ERMs to reach state-of-the-art results can be
+expensive due to the extreme scale of data as well as the complexity of
+multi-stage pipelines (e.g., pre-training, fine-tuning, distillation). In this
+work, we propose the PEFA framework, namely ParamEter-Free Adapters, for fast
+tuning of ERMs without any backward pass in the optimization. At the index-building
+stage, PEFA equips the ERM with a non-parametric k-nearest neighbor (kNN)
+component. At the inference stage, PEFA performs a convex combination of two
+scoring functions, one from the ERM and the other from the kNN. Based on the
+neighborhood definition, the PEFA framework induces two realizations, namely
+PEFA-XL (i.e., extra large) using double ANN indices and PEFA-XS (i.e., extra
+small) using a single ANN index. Empirically, PEFA achieves significant
+improvements on two retrieval applications. For document retrieval, regarding
+the Recall@100 metric, PEFA improves not only pre-trained ERMs on Trivia-QA by an
+average of 13.2%, but also fine-tuned ERMs on NQ-320K by an average of 5.5%,
+respectively. For product search, PEFA improves the Recall@100 of the
+fine-tuned ERMs by an average of 5.3% and 14.5%, for PEFA-XS and PEFA-XL,
+respectively. Our code is available at
+https://github.com/amzn/pecos/tree/mainline/examples/pefa-wsdm24.
+
+
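+ The inference-time score fusion is easy to picture: a convex combination of
+ the ERM similarity score and a non-parametric kNN score accumulated from the
+ nearest training queries. The snippet below uses random embeddings as
+ stand-ins; the mixing weight and neighborhood size are tunable assumptions.
+
+ import numpy as np
+
+ rng = np.random.default_rng(0)
+ doc_emb = rng.normal(size=(1000, 64))        # ERM document embeddings
+ train_q = rng.normal(size=(500, 64))         # embeddings of training queries
+ train_doc = rng.integers(0, 1000, size=500)  # relevant doc id for each training query
+
+ def pefa_scores(query_emb, alpha=0.5, k=8):
+     erm = doc_emb @ query_emb                            # parametric ERM scoring
+     sims = train_q @ query_emb
+     knn_ids = np.argsort(sims)[::-1][:k]                 # nearest training queries
+     knn = np.zeros(len(doc_emb))
+     for i in knn_ids:
+         knn[train_doc[i]] += sims[i]                     # vote for their relevant docs
+     return alpha * erm + (1 - alpha) * knn               # convex combination
+
+ top10 = np.argsort(pefa_scores(rng.normal(size=64)))[::-1][:10]
+ print(top10)
+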
+
+ comment: Accept by WSDM 2024 +
+
+
+
+
+ + ♻ ☆ D-Bot: Database Diagnosis System using Large Language Models + + +
+ Database administrators (DBAs) play an important role in managing, maintaining and optimizing database systems. However, it is hard and tedious for DBAs to manage a large number of databases and give timely responses (waiting for hours is intolerable in many online cases). In addition, existing empirical methods only support limited diagnosis scenarios, and updating their diagnosis rules for new database versions is labor-intensive. Recently, large language models (LLMs) have shown great potential in various fields. Thus, we propose D-Bot, an LLM-based database diagnosis system that can automatically acquire knowledge from diagnosis documents and generate reasonable and well-founded diagnosis reports (i.e., identifying the root causes and solutions) within acceptable time (e.g., under 10 minutes compared to hours by a DBA). The techniques in D-Bot include (i) offline knowledge extraction from documents, (ii) automatic prompt generation (e.g., knowledge matching, tool retrieval), (iii) root cause analysis using a tree search algorithm, and (iv) a collaborative mechanism for complex anomalies with multiple root causes. We verify D-Bot on real benchmarks (including 539 anomalies of six typical applications), and the results show that D-Bot can effectively analyze the root causes of unseen anomalies and significantly outperforms traditional methods and vanilla models like GPT-4.
+
+
+
+
+ + ♻ ☆ TraSE: Towards Tackling Authorial Style from a Cognitive Science + Perspective + + +
+ Stylistic analysis of text is a key task in research areas ranging from authorship attribution to forensic analysis and personality profiling. The existing approaches for stylistic analysis are plagued by issues like topic influence, lack of discriminability for a large number of authors, and the requirement for large amounts of diverse data. In this paper, the sources of these issues are identified, along with the necessity for a cognitive perspective on authorial style in addressing them. A novel feature representation, called Trajectory-based Style Estimation (TraSE), is introduced to support this purpose. Authorship attribution experiments with over 27,000 authors and 1.4 million samples in a cross-domain scenario resulted in 90% attribution accuracy, suggesting that the feature representation is immune to such negative influences and an excellent candidate for stylistic analysis. Finally, a qualitative analysis is performed on TraSE using physical human characteristics, like age, to validate its claim on capturing cognitive traits.
+
+ comment: Experimental results in the paper are incorrectly reported due to an + unforeseen glitch in the software prototype. The paper and its findings are + withdrawn +
+
+
+
+
+ + ♻ ☆ SpaCE: The Spatial Confounding Environment + + +
+ Spatial confounding poses a significant challenge in scientific studies involving spatial data, where unobserved spatial variables can influence both treatment and outcome, possibly leading to spurious associations. To address this problem, we introduce SpaCE: The Spatial Confounding Environment, the first toolkit to provide realistic benchmark datasets and tools for systematically evaluating causal inference methods designed to alleviate spatial confounding. Each dataset includes training data, true counterfactuals, a spatial graph with coordinates, and smoothness and confounding scores characterizing the effect of a missing spatial confounder. It also includes realistic semi-synthetic outcomes and counterfactuals, generated using state-of-the-art machine learning ensembles, following best practices for causal inference benchmarks. The datasets cover real treatment and covariates from diverse domains, including climate, health and social sciences. SpaCE facilitates an automated end-to-end pipeline, simplifying data loading, experimental setup, and evaluating machine learning and causal inference models. The SpaCE project provides several dozen datasets of diverse sizes and spatial complexity. It is publicly available as a Python package, encouraging community feedback and contributions.
+
+
+
+
+ + ♻ ☆ ReSync: Riemannian Subgradient-based Robust Rotation Synchronization NeurIPS 2023 + + +
+ This work presents ReSync, a Riemannian subgradient-based algorithm for solving the robust rotation synchronization problem, which arises in various engineering applications. ReSync solves a least-unsquared minimization formulation over the rotation group, which is nonsmooth and nonconvex, and aims at recovering the underlying rotations directly. We provide strong theoretical guarantees for ReSync under the random corruption setting. Specifically, we first show that the initialization procedure of ReSync yields a proper initial point that lies in a local region around the ground-truth rotations. We next establish the weak sharpness property of the aforementioned formulation and then utilize this property to derive the local linear convergence of ReSync to the ground-truth rotations. By combining these guarantees, we conclude that ReSync converges linearly to the ground-truth rotations under appropriate conditions. Experimental results demonstrate the effectiveness of ReSync.
+
+ comment: Accepted for publication in NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Feature-Learning Networks Are Consistent Across Widths At Realistic + Scales NeurIPS 2023 + + +
+ We study the effect of width on the dynamics of feature-learning neural networks across a variety of architectures and datasets. Early in training, wide neural networks trained on online data not only have identical loss curves but also agree in their point-wise test predictions throughout training. For simple tasks such as CIFAR-5m, this holds throughout training for networks of realistic widths. We also show that structural properties of the models, including internal representations, preactivation distributions, edge-of-stability phenomena, and large learning rate effects, are consistent across large widths. This motivates the hypothesis that phenomena seen in realistic models can be captured by infinite-width, feature-learning limits. For harder tasks (such as ImageNet and language modeling), and at later training times, finite-width deviations grow systematically. Two distinct effects cause these deviations across widths. First, the network output has initialization-dependent variance scaling inversely with width, which can be removed by ensembling networks. We observe, however, that ensembles of narrower networks perform worse than a single wide network. We call this the bias of narrower width. We conclude with a spectral perspective on the origin of this finite-width bias.
+
+ comment: 24 pages, 19 figures. NeurIPS 2023. Revised based on reviewer + feedback +
+
+
+
+
+ + ♻ ☆ Algorithm as Experiment: Machine Learning, Market Design, and Policy + Eligibility Rules + + +
+ Algorithms make a growing portion of policy and business decisions. We +develop a treatment-effect estimator using algorithmic decisions as instruments +for a class of stochastic and deterministic algorithms. Our estimator is +consistent and asymptotically normal for well-defined causal effects. A special +case of our setup is multidimensional regression discontinuity designs with +complex boundaries. We apply our estimator to evaluate the Coronavirus Aid, +Relief, and Economic Security Act, which allocated many billions of dollars +worth of relief funding to hospitals via an algorithmic rule. The funding is +shown to have little effect on COVID-19-related hospital activities. Naive +estimates exhibit selection bias. + +
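+ The abstract describes using algorithmic decisions as instruments; the textbook special case of this idea is two-stage least squares with the algorithm's decision as the instrument. The sketch below shows only that special case, not the paper's more general estimator:
+ import numpy as np
+
+ def two_stage_least_squares(y, treatment, instrument, covariates):
+     """Estimate a treatment effect with `instrument` as an instrumental variable."""
+     n = len(y)
+     ones = np.ones((n, 1))
+     # Stage 1: project the treatment onto the instrument and covariates.
+     Z = np.hstack([ones, instrument.reshape(-1, 1), covariates])
+     gamma = np.linalg.lstsq(Z, treatment, rcond=None)[0]
+     treatment_hat = Z @ gamma
+     # Stage 2: regress the outcome on the fitted treatment (plus covariates).
+     X_hat = np.hstack([ones, treatment_hat.reshape(-1, 1), covariates])
+     beta = np.linalg.lstsq(X_hat, y, rcond=None)[0]
+     return beta[1]  # coefficient on the treatment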
+
+
+
+
+ + ♻ ☆ Quantifying Spatial Under-reporting Disparities in Resident + Crowdsourcing + + +
+ Modern city governance relies heavily on crowdsourcing to identify problems such as downed trees and power lines. A major concern is that residents do not report problems at the same rates, with heterogeneous reporting delays directly translating to downstream disparities in how quickly incidents can be addressed. Here we develop a method to identify reporting delays without using external ground-truth data. Our insight is that the rates at which duplicate reports are made about the same incident can be leveraged to disambiguate whether an incident has occurred from how quickly it is reported once it has occurred. We apply our method to over 100,000 resident reports made in New York City and to over 900,000 reports made in Chicago, finding that there are substantial spatial and socioeconomic disparities in how quickly incidents are reported. We further validate our methods using external data and demonstrate how estimating reporting delays leads to practical insights and interventions for a more equitable, efficient government service.
+
+
+
+
+ + ♻ ☆ Model-tuning Via Prompts Makes NLP Models Adversarially Robust EMNLP 2023 + + +
+ In recent years, NLP practitioners have converged on the following practice: (i) import an off-the-shelf pretrained (masked) language model; (ii) append a multilayer perceptron atop the CLS token's hidden representation (with randomly initialized weights); and (iii) fine-tune the entire model on a downstream task (MLP-FT). This procedure has produced massive gains on standard NLP benchmarks, but these models remain brittle, even to mild adversarial perturbations. In this work, we demonstrate surprising gains in adversarial robustness enjoyed by Model-tuning Via Prompts (MVP), an alternative method of adapting to downstream tasks. Rather than appending an MLP head to make output predictions, MVP appends a prompt template to the input and makes predictions via text infilling/completion. Across 5 NLP datasets, 4 adversarial attacks, and 3 different models, MVP improves performance against adversarial substitutions by an average of 8% over standard methods and even outperforms adversarial training-based state-of-the-art defenses by 3.5%. By combining MVP with adversarial training, we achieve further improvements in adversarial robustness while maintaining performance on unperturbed examples. Finally, we conduct ablations to investigate the mechanism underlying these gains. Notably, we find that the main causes of vulnerability of MLP-FT can be attributed to the misalignment between pre-training and fine-tuning tasks, and the randomly initialized MLP parameters.
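+ To make the contrast with MLP-FT concrete, here is a minimal prediction-via-infilling sketch in the spirit of MVP (the actual MVP templates, verbalizers, and models differ; the template and label words below are assumptions):
+ import torch
+ from transformers import AutoTokenizer, AutoModelForMaskedLM
+
+ tok = AutoTokenizer.from_pretrained("roberta-base")
+ mlm = AutoModelForMaskedLM.from_pretrained("roberta-base")
+ verbalizers = {"positive": " great", "negative": " terrible"}  # assumed label words
+
+ def predict(text):
+     prompt = f"{text} It was {tok.mask_token}."
+     inputs = tok(prompt, return_tensors="pt")
+     with torch.no_grad():
+         logits = mlm(**inputs).logits
+     mask_pos = (inputs["input_ids"][0] == tok.mask_token_id).nonzero()[0, 0]
+     scores = {label: logits[0, mask_pos, tok(word, add_special_tokens=False)["input_ids"][0]].item()
+               for label, word in verbalizers.items()}
+     return max(scores, key=scores.get)  # label whose verbalizer the masked LM prefers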
+
+ comment: Accepted to the EMNLP 2023 Conference +
+
+
+
+
+ + ♻ ☆ Divide, Evaluate, and Refine: Evaluating and Improving Text-to-Image + Alignment with Iterative VQA Feedback + + +
+ The field of text-conditioned image generation has made unparalleled progress with the recent advent of latent diffusion models. While remarkable, as the complexity of the given text input increases, state-of-the-art diffusion models may still fail to generate images which accurately convey the semantics of the given prompt. Furthermore, it has been observed that such misalignments are often left undetected by pretrained multi-modal models such as CLIP. To address these problems, in this paper we explore a simple yet effective decompositional approach towards both evaluation and improvement of text-to-image alignment. In particular, we first introduce a Decompositional-Alignment-Score which, given a complex prompt, decomposes it into a set of disjoint assertions. The alignment of each assertion with generated images is then measured using a VQA model. Finally, alignment scores for different assertions are combined a posteriori to give the final text-to-image alignment score. Experimental analysis reveals that the proposed alignment metric shows significantly higher correlation with human ratings than traditional CLIP and BLIP scores. Furthermore, we also find that the assertion-level alignment scores provide useful feedback which can then be used in a simple iterative procedure to gradually increase the expression of different assertions in the final image outputs. Human user studies indicate that the proposed approach surpasses the previous state-of-the-art by 8.7% in overall text-to-image alignment accuracy. The project page for our paper is available at https://1jsingh.github.io/divide-evaluate-and-refine
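+ The evaluation procedure above can be summarized in a few lines; `decompose` and `vqa_yes_probability` below are hypothetical stand-ins for the LLM-based assertion splitter and the VQA model, and simple averaging is an assumed aggregation rule:
+ def decompositional_alignment_score(prompt, image, decompose, vqa_yes_probability):
+     """Split the prompt into assertions, score each with VQA, then aggregate."""
+     assertions = decompose(prompt)  # e.g. ["there is a red car", "it is raining"]
+     scores = [vqa_yes_probability(image, f"Is it true that {a}?") for a in assertions]
+     per_assertion = dict(zip(assertions, scores))  # feedback for iterative refinement
+     overall = sum(scores) / max(len(scores), 1)
+     return overall, per_assertion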
+
+
+
+
+ + ♻ ☆ Learning Robust Output Control Barrier Functions from Safe Expert + Demonstrations + + +
+ This paper addresses learning safe output feedback control laws from partial +observations of expert demonstrations. We assume that a model of the system +dynamics and a state estimator are available along with corresponding error +bounds, e.g., estimated from data in practice. We first propose robust output +control barrier functions (ROCBFs) as a means to guarantee safety, as defined +through controlled forward invariance of a safe set. We then formulate an +optimization problem to learn ROCBFs from expert demonstrations that exhibit +safe system behavior, e.g., data collected from a human operator or an expert +controller. When the parametrization of the ROCBF is linear, then we show that, +under mild assumptions, the optimization problem is convex. Along with the +optimization problem, we provide verifiable conditions in terms of the density +of the data, smoothness of the system model and state estimator, and the size +of the error bounds that guarantee validity of the obtained ROCBF. Towards +obtaining a practical control algorithm, we propose an algorithmic +implementation of our theoretical framework that accounts for assumptions made +in our framework in practice. We empirically validate our algorithm in the +autonomous driving simulator CARLA and demonstrate how to learn safe control +laws from RGB camera images. + +
+
+ comment: Journal paper submission +
+
+
+
+
+ + ♻ ☆ A Latent Diffusion Model for Protein Structure Generation + + +
+ Proteins are complex biomolecules that perform a variety of crucial functions +within living organisms. Designing and generating novel proteins can pave the +way for many future synthetic biology applications, including drug discovery. +However, it remains a challenging computational task due to the large modeling +space of protein structures. In this study, we propose a latent diffusion model +that can reduce the complexity of protein modeling while flexibly capturing the +distribution of natural protein structures in a condensed latent space. +Specifically, we propose an equivariant protein autoencoder that embeds +proteins into a latent space and then uses an equivariant diffusion model to +learn the distribution of the latent protein representations. Experimental +results demonstrate that our method can effectively generate novel protein +backbone structures with high designability and efficiency. The code will be +made publicly available at +https://github.com/divelab/AIRS/tree/main/OpenProt/LatentDiff + +
+
+ comment: Accepted by the Second Learning on Graphs Conference (LoG 2023) +
+
+
+
+
+ + ♻ ☆ From Pixels to UI Actions: Learning to Follow Instructions via Graphical + User Interfaces + + +
+ Much of the previous work towards digital agents for graphical user interfaces (GUIs) has relied on text-based representations (derived from HTML or other structured data sources), which are not always readily available. These input representations have often been coupled with custom, task-specific action spaces. This paper focuses on creating agents that interact with the digital world using the same conceptual interface that humans commonly use -- via pixel-based screenshots and a generic action space corresponding to keyboard and mouse actions. Building upon recent progress in pixel-based pretraining, we show, for the first time, that it is possible for such agents to outperform human crowdworkers on the MiniWob++ benchmark of GUI-based instruction following tasks.
+
+
+
+
+ + ♻ ☆ Towards Grounded Visual Spatial Reasoning in Multi-Modal Vision Language + Models + + +
+ With pre-training of vision-and-language models (VLMs) on large-scale +datasets of image-text pairs, several recent works showed that these +pre-trained models lack fine-grained understanding, such as the ability to +count and recognize verbs, attributes, or relationships. The focus of this work +is to study the ability of these models to understand spatial relations. +Previously, this has been tackled using image-text matching (e.g., Visual +Spatial Reasoning benchmark) or visual question answering (e.g., GQA or VQAv2), +both showing poor performance and a large gap compared to human performance. In +this work, we use explainability tools to understand the causes of poor +performance better and present an alternative fine-grained, compositional +approach for ranking spatial clauses. We combine the evidence from grounding +noun phrases corresponding to objects and their locations to compute the final +rank of the spatial clause. We demonstrate the approach on representative VLMs +(such as LXMERT, GPV, and MDETR) and compare and highlight their abilities to +reason about spatial relationships. + +
+
+
+
+
+
+
+
+ + Multimedia 8 + +
+
+
+ + ☆ OneLLM: One Framework to Align All Modalities with Language + + +
+ Multimodal large language models (MLLMs) have gained significant attention +due to their strong multimodal understanding capability. However, existing +works rely heavily on modality-specific encoders, which usually differ in +architecture and are limited to common modalities. In this paper, we present +OneLLM, an MLLM that aligns eight modalities to language using a unified +framework. We achieve this through a unified multimodal encoder and a +progressive multimodal alignment pipeline. In detail, we first train an image +projection module to connect a vision encoder with LLM. Then, we build a +universal projection module (UPM) by mixing multiple image projection modules +and dynamic routing. Finally, we progressively align more modalities to LLM +with the UPM. To fully leverage the potential of OneLLM in following +instructions, we also curated a comprehensive multimodal instruction dataset, +including 2M items from image, audio, video, point cloud, depth/normal map, IMU +and fMRI brain activity. OneLLM is evaluated on 25 diverse benchmarks, +encompassing tasks such as multimodal captioning, question answering and +reasoning, where it delivers excellent performance. Code, data, model and +online demo are available at https://github.com/csuhan/OneLLM + +
+
+ comment: Code: https://github.com/csuhan/OneLLM +
+
+
+
+
+ + ☆ MotionCtrl: A Unified and Flexible Motion Controller for Video + Generation + + +
+ Motions in a video primarily consist of camera motion, induced by camera +movement, and object motion, resulting from object movement. Accurate control +of both camera and object motion is essential for video generation. However, +existing works either mainly focus on one type of motion or do not clearly +distinguish between the two, limiting their control capabilities and diversity. +Therefore, this paper presents MotionCtrl, a unified and flexible motion +controller for video generation designed to effectively and independently +control camera and object motion. The architecture and training strategy of +MotionCtrl are carefully devised, taking into account the inherent properties +of camera motion, object motion, and imperfect training data. Compared to +previous methods, MotionCtrl offers three main advantages: 1) It effectively +and independently controls camera motion and object motion, enabling more +fine-grained motion control and facilitating flexible and diverse combinations +of both types of motion. 2) Its motion conditions are determined by camera +poses and trajectories, which are appearance-free and minimally impact the +appearance or shape of objects in generated videos. 3) It is a relatively +generalizable model that can adapt to a wide array of camera poses and +trajectories once trained. Extensive qualitative and quantitative experiments +have been conducted to demonstrate the superiority of MotionCtrl over existing +methods. + +
+
+ comment: Project Page: https://wzhouxiff.github.io/projects/MotionCtrl/ +
+
+
+
+
+ + ☆ Kandinsky 3.0 Technical Report + + +
+ We present Kandinsky 3.0, a large-scale text-to-image generation model based on latent diffusion, continuing the series of text-to-image Kandinsky models and reflecting our progress toward achieving higher quality and realism of image generation. Compared to previous versions of Kandinsky 2.x, Kandinsky 3.0 leverages a two times larger U-Net backbone, a ten times larger text encoder and removes diffusion mapping. We describe the architecture of the model, the data collection procedure, the training technique, and the production system of user interaction. We focus on the key components that, as identified through a large number of experiments, had the most significant impact on improving the quality of our model compared to the others. In our side-by-side comparisons, Kandinsky 3.0 shows better text understanding and works better on specific domains. Project page: https://ai-forever.github.io/Kandinsky-3
+
+ comment: Project page: https://ai-forever.github.io/Kandinsky-3 +
+
+
+
+
+ + ☆ Detecting Voice Cloning Attacks via Timbre Watermarking NDSS 2024 + + +
+ Nowadays, it is common to release audio content to the public. However, with +the rise of voice cloning technology, attackers have the potential to easily +impersonate a specific person by utilizing his publicly released audio without +any permission. Therefore, it becomes significant to detect any potential +misuse of the released audio content and protect its timbre from being +impersonated. To this end, we introduce a novel concept, "Timbre Watermarking", +which embeds watermark information into the target individual's speech, +eventually defeating the voice cloning attacks. To ensure the watermark is +robust to the voice cloning model's learning process, we design an end-to-end +voice cloning-resistant detection framework. The core idea of our solution is +to embed and extract the watermark in the frequency domain in a temporally +invariant manner. To acquire generalization across different voice cloning +attacks, we modulate their shared process and integrate it into our framework +as a distortion layer. Experiments demonstrate that the proposed timbre +watermarking can defend against different voice cloning attacks, exhibit strong +resistance against various adaptive attacks (e.g., reconstruction-based removal +attacks, watermark overwriting attacks), and achieve practicality in real-world +services such as PaddleSpeech, Voice-Cloning-App, and so-vits-svc. In addition, +ablation studies are also conducted to verify the effectiveness of our design. +Some audio samples are available at +https://timbrewatermarking.github.io/samples. + +
+
+ comment: NDSS 2024 +
+
+
+
+
+ + ♻ ☆ Source-Free Domain Adaptation for RGB-D Semantic Segmentation with + Vision Transformers WACV 2024 + + +
+ With the increasing availability of depth sensors, multimodal frameworks that +combine color information with depth data are gaining interest. However, ground +truth data for semantic segmentation is burdensome to provide, thus making +domain adaptation a significant research area. Yet most domain adaptation +methods are not able to effectively handle multimodal data. Specifically, we +address the challenging source-free domain adaptation setting where the +adaptation is performed without reusing source data. We propose MISFIT: +MultImodal Source-Free Information fusion Transformer, a depth-aware framework +which injects depth data into a segmentation module based on vision +transformers at multiple stages, namely at the input, feature and output +levels. Color and depth style transfer helps early-stage domain alignment while +re-wiring self-attention between modalities creates mixed features, allowing +the extraction of better semantic content. Furthermore, a depth-based entropy +minimization strategy is also proposed to adaptively weight regions at +different distances. Our framework, which is also the first approach using +RGB-D vision transformers for source-free semantic segmentation, shows +noticeable performance improvements with respect to standard strategies. + +
+
+ comment: WACV 2024, 2nd Workshop on Pretraining (WACVW) +
+
+
+
+
+ + ♻ ☆ Divide, Evaluate, and Refine: Evaluating and Improving Text-to-Image + Alignment with Iterative VQA Feedback + + +
+ The field of text-conditioned image generation has made unparalleled progress with the recent advent of latent diffusion models. While remarkable, as the complexity of the given text input increases, state-of-the-art diffusion models may still fail to generate images which accurately convey the semantics of the given prompt. Furthermore, it has been observed that such misalignments are often left undetected by pretrained multi-modal models such as CLIP. To address these problems, in this paper we explore a simple yet effective decompositional approach towards both evaluation and improvement of text-to-image alignment. In particular, we first introduce a Decompositional-Alignment-Score which, given a complex prompt, decomposes it into a set of disjoint assertions. The alignment of each assertion with generated images is then measured using a VQA model. Finally, alignment scores for different assertions are combined a posteriori to give the final text-to-image alignment score. Experimental analysis reveals that the proposed alignment metric shows significantly higher correlation with human ratings than traditional CLIP and BLIP scores. Furthermore, we also find that the assertion-level alignment scores provide useful feedback which can then be used in a simple iterative procedure to gradually increase the expression of different assertions in the final image outputs. Human user studies indicate that the proposed approach surpasses the previous state-of-the-art by 8.7% in overall text-to-image alignment accuracy. The project page for our paper is available at https://1jsingh.github.io/divide-evaluate-and-refine
+
+
+
+
+ + ♻ ☆ Control-A-Video: Controllable Text-to-Video Generation with Diffusion + Models + + +
+ Recent advancements in diffusion models have unlocked unprecedented abilities in visual creation. However, current text-to-video generation models struggle with the trade-off among movement range, action coherence and object consistency. To mitigate this issue, we present a controllable text-to-video (T2V) diffusion model, called Control-A-Video, capable of maintaining consistency while allowing customizable video synthesis. Based on a pre-trained conditional text-to-image (T2I) diffusion model, our model aims to generate videos conditioned on a sequence of control signals, such as edge or depth maps. For the purpose of improving object consistency, Control-A-Video integrates motion priors and content priors into video generation. We propose two motion-adaptive noise initialization strategies, which are based on pixel residual and optical flow, to introduce motion priors from input videos, producing more coherent videos. Moreover, a first-frame conditioned controller is proposed to generate videos from content priors of the first frame, which facilitates the semantic alignment with text and allows longer video generation in an auto-regressive manner. With the proposed architecture and strategies, our model achieves resource-efficient convergence and generates consistent and coherent videos with fine-grained control. Extensive experiments demonstrate its success in various video generative tasks such as video editing and video style transfer, outperforming previous methods in terms of consistency and quality.
+
+
+
+
+ + ♻ ☆ X-Adapter: Adding Universal Compatibility of Plugins for Upgraded + Diffusion Model + + +
+ We introduce X-Adapter, a universal upgrader to enable the pretrained plug-and-play modules (e.g., ControlNet, LoRA) to work directly with the upgraded text-to-image diffusion model (e.g., SDXL) without further retraining. We achieve this goal by training an additional network to control the frozen upgraded model with the new text-image data pairs. In detail, X-Adapter keeps a frozen copy of the old model to preserve the connectors of different plugins. Additionally, X-Adapter adds trainable mapping layers that bridge the decoders from models of different versions for feature remapping. The remapped features will be used as guidance for the upgraded model. To enhance the guidance ability of X-Adapter, we employ a null-text training strategy for the upgraded model. After training, we also introduce a two-stage denoising strategy to align the initial latents of X-Adapter and the upgraded model. Thanks to our strategies, X-Adapter demonstrates universal compatibility with various plugins and also enables plugins of different versions to work together, thereby expanding the functionalities of the diffusion community. To verify the effectiveness of the proposed method, we conduct extensive experiments, and the results show that X-Adapter may facilitate wider application in the upgraded foundational diffusion model.
+
+ comment: Project page: https://showlab.github.io/X-Adapter/ +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 72 + +
+
+
+ + ☆ Describing Differences in Image Sets with Natural Language + + +
+ How do two sets of images differ? Discerning set-level differences is crucial +for understanding model behaviors and analyzing datasets, yet manually sifting +through thousands of images is impractical. To aid in this discovery process, +we explore the task of automatically describing the differences between two +$\textbf{sets}$ of images, which we term Set Difference Captioning. This task +takes in image sets $D_A$ and $D_B$, and outputs a description that is more +often true on $D_A$ than $D_B$. We outline a two-stage approach that first +proposes candidate difference descriptions from image sets and then re-ranks +the candidates by checking how well they can differentiate the two sets. We +introduce VisDiff, which first captions the images and prompts a language model +to propose candidate descriptions, then re-ranks these descriptions using CLIP. +To evaluate VisDiff, we collect VisDiffBench, a dataset with 187 paired image +sets with ground truth difference descriptions. We apply VisDiff to various +domains, such as comparing datasets (e.g., ImageNet vs. ImageNetV2), comparing +classification models (e.g., zero-shot CLIP vs. supervised ResNet), summarizing +model failure modes (supervised ResNet), characterizing differences between +generative models (e.g., StableDiffusionV1 and V2), and discovering what makes +images memorable. Using VisDiff, we are able to find interesting and previously +unknown differences in datasets and models, demonstrating its utility in +revealing nuanced insights. + +
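+ A toy version of the second (re-ranking) stage described above, assuming CLIP embeddings of the candidate descriptions and of both image sets are already computed (this is not the released VisDiff code, and the ranking criterion is simplified):
+ import numpy as np
+
+ def rerank_descriptions(candidates, text_embs, set_a_embs, set_b_embs):
+     """Rank candidates by how much more similar they are to set A than to set B."""
+     def normalize(x):
+         return x / (np.linalg.norm(x, axis=-1, keepdims=True) + 1e-9)
+     t, a, b = normalize(text_embs), normalize(set_a_embs), normalize(set_b_embs)
+     gaps = (a @ t.T).mean(axis=0) - (b @ t.T).mean(axis=0)
+     order = np.argsort(-gaps)
+     return [(candidates[i], float(gaps[i])) for i in order]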
+
+
+
+
+ + ☆ Rank-without-GPT: Building GPT-Independent Listwise Rerankers on + Open-Source Large Language Models + + +
+ Listwise rerankers based on large language models (LLMs) are the zero-shot state-of-the-art. However, current works in this direction all depend on the GPT models, making them a single point of failure in scientific reproducibility. Moreover, it raises the concern that current research findings only hold for GPT models and not for LLMs in general. In this work, we lift this pre-condition and build, for the first time, effective listwise rerankers without any form of dependency on GPT. Our passage retrieval experiments show that our best listwise reranker surpasses the listwise rerankers based on GPT-3.5 by 13% and achieves 97% of the effectiveness of the ones built on GPT-4. Our results also show that the existing training datasets, which were expressly constructed for pointwise ranking, are insufficient for building such listwise rerankers. Instead, high-quality listwise ranking data is required and crucial, calling for further work on building human-annotated listwise data resources.
+
+
+
+
+ + ☆ WhisBERT: Multimodal Text-Audio Language Modeling on 100M Words CoNLL 2023 + + +
+ Training on multiple modalities of input can augment the capabilities of a +language model. Here, we ask whether such a training regime can improve the +quality and efficiency of these systems as well. We focus on text--audio and +introduce Whisbert, which is inspired by the text--image approach of FLAVA +\citep{singh_flava_2022}. In accordance with Babylm \citep{warstadt2023papers} +guidelines, we pretrain Whisbert on a dataset comprising only 100 million words +plus their corresponding speech from the word-aligned version of the People's +Speech dataset \citep{galvez_peoples_2021}. To assess the impact of +multimodality, we compare versions of the model that are trained on text only +and on both audio and text simultaneously. We find that while Whisbert is able +to perform well on multimodal masked modeling and surpasses the Babylm +baselines in most benchmark tasks, it struggles to optimize its complex +objective and outperform its text-only Whisbert baseline. + +
+
+ comment: Published at the BabyLM Challenge, a shared task co-sponsored by CMCL + 2023 and CoNLL 2023, hosted by EMNLP 2023 +
+
+
+
+
+ + ☆ Let the LLMs Talk: Simulating Human-to-Human Conversational QA via + Zero-Shot LLM-to-LLM Interactions WSDM 2024 + + +
+ Conversational question-answering (CQA) systems aim to create interactive +search systems that effectively retrieve information by interacting with users. +To replicate human-to-human conversations, existing work uses human annotators +to play the roles of the questioner (student) and the answerer (teacher). +Despite its effectiveness, challenges exist as human annotation is +time-consuming, inconsistent, and not scalable. To address this issue and +investigate the applicability of large language models (LLMs) in CQA +simulation, we propose a simulation framework that employs zero-shot learner +LLMs for simulating teacher-student interactions. Our framework involves two +LLMs interacting on a specific topic, with the first LLM acting as a student, +generating questions to explore a given search topic. The second LLM plays the +role of a teacher by answering questions and is equipped with additional +information, including a text on the given topic. We implement both the student +and teacher by zero-shot prompting the GPT-4 model. To assess the effectiveness +of LLMs in simulating CQA interactions and understand the disparities between +LLM- and human-generated conversations, we evaluate the simulated data from +various perspectives. We begin by evaluating the teacher's performance through +both automatic and human assessment. Next, we evaluate the performance of the +student, analyzing and comparing the disparities between questions generated by +the LLM and those generated by humans. Furthermore, we conduct extensive +analyses to thoroughly examine the LLM performance by benchmarking +state-of-the-art reading comprehension models on both datasets. Our results +reveal that the teacher LLM generates lengthier answers that tend to be more +accurate and complete. The student LLM generates more diverse questions, +covering more aspects of a given topic. + +
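+ Schematically, the simulation loop pairs a questioning LLM with an answering LLM; `chat` below is a hypothetical wrapper around any chat model (the paper zero-shot prompts GPT-4), and the system prompts are simplified paraphrases rather than the originals:
+ def simulate_conversation(topic, background_text, chat, num_turns=5):
+     """Alternate student questions and teacher answers for num_turns rounds."""
+     student_sys = f"You are a curious student exploring the topic: {topic}. Ask one question at a time."
+     teacher_sys = f"You are a teacher. Answer questions using this background text:\n{background_text}"
+     dialogue = []
+     for _ in range(num_turns):
+         question = chat(system=student_sys, history=dialogue)
+         dialogue.append(("student", question))
+         answer = chat(system=teacher_sys, history=dialogue)
+         dialogue.append(("teacher", answer))
+     return dialogue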
+
+ comment: Accepted at WSDM 2024 +
+
+
+
+
+ + ☆ Concept Drift Adaptation in Text Stream Mining Settings: A Comprehensive + Review + + +
+ Due to the advent and increase in the popularity of the Internet, people have been producing and disseminating textual data in several ways, such as reviews, social media posts, and news articles. As a result, numerous researchers have been working on discovering patterns in textual data, especially because social media posts function as social sensors, indicating people's opinions, interests, etc. However, most tasks regarding natural language processing are addressed using traditional machine learning methods and static datasets. This setting can lead to several problems, such as an outdated dataset, which may not correspond to reality, and an outdated model, whose performance degrades over time. Concept drift, which corresponds to changes in data distributions and patterns, is another aspect that emphasizes these issues. In a text stream scenario, it is even more challenging due to its characteristics, such as the high speed and data arriving sequentially. In addition, models for this type of scenario must adhere to the constraints mentioned above while learning from the stream by storing texts for a limited time and consuming low memory. In this study, we performed a systematic literature review regarding concept drift adaptation in text stream scenarios. Considering well-defined criteria, we selected 40 papers to unravel aspects such as text drift categories, types of text drift detection, model update mechanisms, the addressed stream mining tasks, types of text representations, and text representation update mechanisms. In addition, we discussed drift visualization and simulation and listed real-world datasets used in the selected papers. Therefore, this paper comprehensively reviews concept drift adaptation in text stream mining scenarios.
+
+ comment: 49 pages +
+
+
+
+
+ + ☆ Can a Tabula Recta provide security in the XXI century? + + +
+ In the not so unlikely scenario of total compromise of computers accessible +to a group of users, they might be tempted to resort to human-computable +paper-and-pencil cryptographic methods aided by a classic Tabula Recta, which +helps to perform addition and subtraction directly with letters. But do these +classic algorithms, or some new ones using the same simple tools, have any +chance against computer-aided cryptanalysis? In this paper I discuss how some +human-computable algorithms can indeed afford sufficient security in this +situation, drawing conclusions from computer-based statistical analysis. Three +kinds of algorithms are discussed: those that concentrate entropy from shared +text sources, stream ciphers based on arithmetic of non-binary spaces, and +hash-like algorithms that may be used to generate a password from a challenge +text. + +
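+ For readers unfamiliar with the tool: a Tabula Recta mechanizes letter addition and subtraction modulo 26, which is the basic operation behind the stream-cipher constructions the abstract mentions. A small sketch of that operation (not any specific algorithm from the paper):
+ A = ord('A')
+
+ def add_letters(x, y):
+     """Tabula Recta lookup: the entry in row x, column y."""
+     return chr((ord(x) - A + ord(y) - A) % 26 + A)
+
+ def sub_letters(x, y):
+     """Inverse lookup: the column whose entry in row y is x."""
+     return chr((ord(x) - ord(y)) % 26 + A)
+
+ # Example: combining a text with a running key, Vigenere-style.
+ text, key = "HELLOWORLD", "SECRETKEYS"
+ cipher = "".join(add_letters(t, k) for t, k in zip(text, key))
+ assert "".join(sub_letters(c, k) for c, k in zip(cipher, key)) == text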
+
+ comment: 18 pages, 3 figures +
+
+
+
+
+ + ☆ Clustering Pseudo Language Family in Multilingual Translation Models + with Fisher Information Matrix EMNLP 2023 + + +
+ In multilingual translation research, the comprehension and utilization of language families are of paramount importance. Nevertheless, clustering languages based solely on their ancestral families can yield suboptimal results due to variations in the datasets employed during the model's training phase. To mitigate this challenge, we introduce an innovative method that leverages the Fisher information matrix (FIM) to cluster language families, anchored on the multilingual translation model's characteristics. We hypothesize that language pairs with similar effects on model parameters exhibit a considerable degree of linguistic congruence and should thus be grouped cohesively. This concept has led us to define pseudo language families. We provide an in-depth discussion regarding the inception and application of these pseudo language families. Empirical evaluations reveal that employing these pseudo language families enhances performance over conventional language families in adapting a multilingual translation model to unfamiliar language pairs. The proposed methodology may also be extended to scenarios requiring language similarity measurements. The source code and associated scripts can be accessed at https://github.com/ecoli-hit/PseudoFamily.
+
+ comment: Accepted to EMNLP 2023 +
+
+
+
+
+ + ☆ Leveraging Domain Adaptation and Data Augmentation to Improve Qur'anic + IR in English and Arabic + + +
+ In this work, we approach the problem of Qur'anic information retrieval (IR) in Arabic and English. Using the latest state-of-the-art methods in neural IR, we research what helps to tackle this task more efficiently. Training retrieval models requires a lot of data, which is difficult to obtain for training in-domain. Therefore, we commence with training on a large amount of general domain data and then continue training on in-domain data. To handle the lack of in-domain data, we employed a data augmentation technique, which considerably improved results in MRR@10 and NDCG@5 metrics, setting the state-of-the-art in Qur'anic IR for both English and Arabic. The absence of an Islamic corpus and domain-specific model for the IR task in English motivated us to address this lack of resources and take preliminary steps toward Islamic corpus compilation and domain-specific language model (LM) pre-training, which helped to improve the performance of the retrieval models that use the domain-specific LM as the shared backbone. We examined several language models (LMs) in Arabic to select one that efficiently deals with the Qur'anic IR task. Besides transferring successful experiments from English to Arabic, we conducted additional experiments with the retrieval task in Arabic to mitigate the scarcity of general domain datasets used to train the retrieval models. Handling the Qur'anic IR task in both English and Arabic allowed us to enhance the comparison and share valuable insights across models and languages.
+
+
+
+
+ + ☆ Weakly Supervised Detection of Hallucinations in LLM Activations + + +
+ We propose an auditing method to identify whether a large language model (LLM) encodes patterns such as hallucinations in its internal states, which may propagate to downstream tasks. We introduce a weakly supervised auditing technique using a subset scanning approach to detect anomalous patterns in LLM activations from pre-trained models. Importantly, our method does not need knowledge of the type of patterns a priori. Instead, it relies on a reference dataset devoid of anomalies during testing. Further, our approach enables the identification of pivotal nodes responsible for encoding these patterns, which may offer crucial insights for fine-tuning specific sub-networks for bias mitigation. We introduce two new scanning methods to handle LLM activations for anomalous sentences that may deviate from the expected distribution in either direction. Our results confirm prior findings of BERT's limited internal capacity for encoding hallucinations, while OPT appears capable of encoding hallucination information internally. Importantly, our scanning approach, without prior exposure to false statements, performs comparably to a fully supervised out-of-distribution classifier.
+
+
+
+
+ + ☆ Large Language Models on Graphs: A Comprehensive Survey + + +
+ Large language models (LLMs), such as ChatGPT and LLaMA, are creating +significant advancements in natural language processing, due to their strong +text encoding/decoding ability and newly found emergent capability (e.g., +reasoning). While LLMs are mainly designed to process pure texts, there are +many real-world scenarios where text data are associated with rich structure +information in the form of graphs (e.g., academic networks, and e-commerce +networks) or scenarios where graph data are paired with rich textual +information (e.g., molecules with descriptions). Besides, although LLMs have +shown their pure text-based reasoning ability, it is underexplored whether such +ability can be generalized to graph scenarios (i.e., graph-based reasoning). In +this paper, we provide a systematic review of scenarios and techniques related +to large language models on graphs. We first summarize potential scenarios of +adopting LLMs on graphs into three categories, namely pure graphs, text-rich +graphs, and text-paired graphs. We then discuss detailed techniques for +utilizing LLMs on graphs, including LLM as Predictor, LLM as Encoder, and LLM +as Aligner, and compare the advantages and disadvantages of different schools +of models. Furthermore, we mention the real-world applications of such methods +and summarize open-source codes and benchmark datasets. Finally, we conclude +with potential future research directions in this fast-growing field. The +related source can be found at +https://github.com/PeterGriffinJin/Awesome-Language-Model-on-Graphs. + +
+
+ comment: 26 pages +
+
+
+
+
+ + ☆ Scaling Laws for Adversarial Attacks on Language Model Activations + + +
+ We explore a class of adversarial attacks targeting the activations of +language models. By manipulating a relatively small subset of model +activations, $a$, we demonstrate the ability to control the exact prediction of +a significant number (in some cases up to 1000) of subsequent tokens $t$. We +empirically verify a scaling law where the maximum number of target tokens +$t_\mathrm{max}$ predicted depends linearly on the number of tokens $a$ whose +activations the attacker controls as $t_\mathrm{max} = \kappa a$. We find that +the number of bits of control in the input space needed to control a single bit +in the output space (what we call attack resistance $\chi$) is remarkably +constant between $\approx 16$ and $\approx 25$ over 2 orders of magnitude of +model sizes for different language models. Compared to attacks on tokens, +attacks on activations are predictably much stronger, however, we identify a +surprising regularity where one bit of input steered either via activations or +via tokens is able to exert control over a similar amount of output bits. This +gives support for the hypothesis that adversarial attacks are a consequence of +dimensionality mismatch between the input and output spaces. A practical +implication of the ease of attacking language model activations instead of +tokens is for multi-modal and selected retrieval models, where additional data +sources are added as activations directly, sidestepping the tokenized input. +This opens up a new, broad attack surface. By using language models as a +controllable test-bed to study adversarial attacks, we were able to experiment +with input-output dimensions that are inaccessible in computer vision, +especially where the output dimension dominates. + +
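+ Because the reported law t_max = kappa * a is a line through the origin, kappa can be recovered from (a, t_max) measurements with a one-parameter least-squares fit; the numbers below are placeholders, not the paper's data:
+ import numpy as np
+
+ a_tokens = np.array([1, 2, 4, 8, 16])    # attacked activation tokens (example values)
+ t_max = np.array([9, 19, 41, 78, 160])   # max controlled output tokens (example values)
+
+ kappa = float(np.sum(a_tokens * t_max) / np.sum(a_tokens ** 2))  # least squares through the origin
+ print(f"estimated kappa = {kappa:.2f}")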
+
+ comment: 15 pages, 9 figures +
+
+
+
+
+ + ☆ Compositional Generalization for Data-to-Text Generation + + +
+ Data-to-text generation involves transforming structured data, often represented as predicate-argument tuples, into coherent textual descriptions. Despite recent advances, systems still struggle when confronted with unseen combinations of predicates, producing unfaithful descriptions (e.g., hallucinations or omissions). We refer to this issue as compositional generalization, and it encouraged us to create a benchmark for assessing the performance of different approaches on this specific problem. Furthermore, we propose a novel model that addresses compositional generalization by clustering predicates into groups. Our model generates text in a sentence-by-sentence manner, relying on one cluster of predicates at a time. This approach significantly outperforms T5 baselines across all evaluation metrics. Notably, it achieves a 31% improvement over T5 in terms of a metric focused on maintaining faithfulness to the input.
+
+
+
+
+ + ☆ Towards Measuring Representational Similarity of Large Language Models NeurIPS 2023 + + +
+ Understanding the similarity of the numerous released large language models +(LLMs) has many uses, e.g., simplifying model selection, detecting illegal +model reuse, and advancing our understanding of what makes LLMs perform well. +In this work, we measure the similarity of representations of a set of LLMs +with 7B parameters. Our results suggest that some LLMs are substantially +different from others. We identify challenges of using representational +similarity measures that suggest the need of careful study of similarity scores +to avoid false conclusions. + +
+
+ comment: Extended abstract in UniReps Workshop @ NeurIPS 2023 +
+
+
+
+
+ + ☆ Large Knowledge Model: Perspectives and Challenges + + +
+ Humankind's understanding of the world is fundamentally linked to our perception and cognition, with \emph{human languages} serving as one of the major carriers of \emph{world knowledge}. In this vein, \emph{Large Language Models} (LLMs) like ChatGPT epitomize the pre-training of extensive, sequence-based world knowledge into neural networks, facilitating the processing and manipulation of this knowledge in a parametric space. This article explores large models through the lens of ``knowledge''. We initially investigate the role of symbolic knowledge such as Knowledge Graphs (KGs) in enhancing LLMs, covering aspects like knowledge-augmented language models, structure-inducing pre-training, knowledgeable prompts, structured CoT, knowledge editing, semantic tools for LLMs and knowledgeable AI agents. Subsequently, we examine how LLMs can amplify traditional symbolic knowledge bases, encompassing aspects like using LLM as a KG builder and controller, structured knowledge pretraining, LLM-enhanced symbolic reasoning, and the amalgamation of perception with cognition. Considering the intricate nature of human knowledge, we advocate for the creation of \emph{Large Knowledge Models} (LKM), specifically engineered to manage a diversified spectrum of knowledge structures. This ambitious undertaking could entail several key challenges, such as disentangling knowledge representation from language models, restructuring pre-training with structured knowledge, and building large commonsense models, among others. We finally propose a five-``A'' principle to distinguish the concept of LKM.
+
+ comment: This is an early draft subject to revision in a near future +
+
+
+
+
+ + ☆ Prompt Optimization via Adversarial In-Context Learning + + +
+ We propose a new method, Adversarial In-Context Learning (adv-ICL), to optimize prompts for in-context learning (ICL) by employing one LLM as a generator, another as a discriminator, and a third as a prompt modifier. As in traditional adversarial learning, adv-ICL is implemented as a two-player game between the generator and discriminator, where the generator tries to generate realistic enough output to fool the discriminator. In each round, given an input prefixed by task instructions and several exemplars, the generator produces an output. The discriminator is then tasked with classifying the generator input-output pair as model-generated or real data. Based on the discriminator loss, the prompt modifier proposes possible edits to the generator and discriminator prompts, and the edits that most improve the adversarial loss are selected. We show that adv-ICL results in significant improvements over state-of-the-art prompt optimization techniques for both open and closed-source models on 11 generation and classification tasks including summarization, arithmetic reasoning, machine translation, data-to-text generation, and the MMLU and BIG-Bench Hard benchmarks. In addition, because our method uses pre-trained models and updates only prompts rather than model parameters, it is computationally efficient, easy to extend to any LLM and task, and effective in low-resource settings.
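+ A skeleton of one optimization round as described above, with a hypothetical `llm` callable standing in for the three frozen models; the real adv-ICL edit-proposal and selection steps are richer than this sketch suggests:
+ def adv_icl_round(gen_prompt, disc_prompt, task_inputs, real_pairs, llm, modifier_prompt):
+     """One generator/discriminator round plus a prompt-modifier proposal."""
+     # Generator produces outputs for the task inputs under its current prompt.
+     fake_pairs = [(x, llm(gen_prompt + x)) for x in task_inputs]
+
+     def disc_error(pairs, label):
+         # Discriminator classifies each pair as "real" or "generated".
+         preds = [llm(f"{disc_prompt}\nInput: {x}\nOutput: {y}\nReal or generated?") for x, y in pairs]
+         return sum(p.strip().lower() != label for p in preds) / len(pairs)
+
+     adv_loss = disc_error(fake_pairs, "generated") + disc_error(real_pairs, "real")
+     # Prompt modifier proposes edited prompts; selecting the best edit is omitted here.
+     new_gen_prompt = llm(modifier_prompt + gen_prompt)
+     new_disc_prompt = llm(modifier_prompt + disc_prompt)
+     return new_gen_prompt, new_disc_prompt, adv_loss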
+
+ comment: 21 pages +
+
+
+
+
+ + ☆ Impact of Tokenization on LLaMa Russian Adaptation + + +
+ Latest instruction-tuned large language models (LLM) show great results on +various tasks, however, they often face performance degradation for non-English +input. There is evidence that the reason lies in inefficient tokenization +caused by low language representation in pre-training data which hinders the +comprehension of non-English instructions, limiting the potential of target +language instruction-tuning. In this work we investigate the possibility of +addressing the issue with vocabulary substitution in the context of LLaMa +Russian language adaptation. We explore three variants of vocabulary adaptation +and test their performance on Saiga instruction-tuning and fine-tuning on +Russian Super Glue benchmark. The results of automatic evaluation show that +vocabulary substitution not only improves the model's quality in Russian but +also accelerates fine-tuning (35%) and inference (up to 60%) while reducing +memory consumption. Additional human evaluation of the instruction-tuned models +demonstrates that models with Russian-adapted vocabulary generate answers with +higher user preference than the original Saiga-LLaMa model. + +
+
+
+
+
+ + ☆ Text Intimacy Analysis using Ensembles of Multilingual Transformers + + +
+ Intimacy estimation of a given text has recently gained importance due to the +increase in direct interaction of NLP systems with humans. Intimacy is an +important aspect of natural language and has a substantial impact on our +everyday communication. Thus the level of intimacy can provide us with deeper +insights and richer semantics of conversations. In this paper, we present our +work on the SemEval shared task 9 on predicting the level of intimacy for the +given text. The dataset consists of tweets in ten languages, out of which only +six are available in the training dataset. We conduct several experiments and +show that an ensemble of multilingual models along with a language-specific +monolingual model has the best performance. We also evaluate other data +augmentation methods such as translation and present the results. Lastly, we +study the results thoroughly and present some noteworthy insights into this +problem. + +
+
+
+
+
+ + ☆ Empathy and Distress Detection using Ensembles of Transformer Models WASSA 2023 + + +
+ This paper presents our approach for the WASSA 2023 Empathy, Emotion and +Personality Shared Task. Empathy and distress are human feelings that are +implicitly expressed in natural discourses. Empathy and distress detection are +crucial challenges in Natural Language Processing that can aid our +understanding of conversations. The provided dataset consists of several +long-text examples in the English language, with each example associated with a +numeric score for empathy and distress. We experiment with several BERT-based +models as a part of our approach. We also try various ensemble methods. Our +final submission has a Pearson's r score of 0.346, placing us third in the +empathy and distress detection subtask. + +
+
+ comment: Accepted at the WASSA 2023 workshop at ACL 2023 +
+
+
+
+
+ + ☆ ULMA: Unified Language Model Alignment with Demonstration and Point-wise + Human Preference + + +
+ Language model alignment is a cutting-edge technique in large language model training to align the model output to the user's intent, e.g., being helpful and harmless. A recent alignment framework consists of two steps: supervised fine-tuning with demonstration data and preference learning with human preference data. Previous preference learning methods, such as RLHF and DPO, mainly focus on pair-wise preference data. However, in many real-world scenarios where human feedback is intrinsically point-wise, these methods will suffer from information loss or even fail. To fill this gap, in this paper we first develop a preference learning method called point-wise DPO to tackle point-wise preference data. Further revelation of the connection between supervised fine-tuning and point-wise preference learning enables us to develop a unified framework for both human demonstration and point-wise preference data, which sheds new light on the construction of preference datasets. Extensive experiments on point-wise datasets with binary or continuous labels demonstrate the superior performance and efficiency of our proposed methods. A new dataset with high-quality demonstration samples on harmlessness is constructed and made publicly available.
+
+
+
+
+ + ☆ DemaFormer: Damped Exponential Moving Average Transformer with + Energy-Based Modeling for Temporal Language Grounding EMNLP 2023 + + +
+ Temporal Language Grounding seeks to localize video moments that semantically +correspond to a natural language query. Recent advances employ the attention +mechanism to learn the relations between video moments and the text query. +However, naive attention might not be able to appropriately capture such +relations, resulting in ineffective distributions where target video moments +are difficult to separate from the remaining ones. To resolve the issue, we +propose an energy-based model framework to explicitly learn moment-query +distributions. Moreover, we propose DemaFormer, a novel Transformer-based +architecture that utilizes exponential moving average with a learnable damping +factor to effectively encode moment-query inputs. Comprehensive experiments on +four public temporal language grounding datasets showcase the superiority of +our methods over the state-of-the-art baselines. + +
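+ For intuition only, one plausible reading of a damped exponential moving average is a standard EMA whose running state is additionally shrunk by a damping factor at every step; DemaFormer's actual (learnable) formulation is defined in the paper, so the recurrence below is an assumption:
+ import numpy as np
+
+ def damped_ema(x, alpha=0.3, damping=0.9):
+     """x: (seq_len, dim) array; alpha mixes in the new input, damping shrinks history."""
+     out = np.zeros_like(x, dtype=float)
+     state = np.zeros(x.shape[1])
+     for t in range(x.shape[0]):
+         state = alpha * x[t] + (1 - alpha) * damping * state
+         out[t] = state
+     return out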
+
+ comment: Accepted at EMNLP 2023 (Findings) +
+
+
+
+
+ + ☆ DRAFT: Dense Retrieval Augmented Few-shot Topic classifier Framework + + +
+ With the growing volume of diverse information, the demand for classifying arbitrary topics has become increasingly critical. To address this challenge, we introduce DRAFT, a simple framework designed to train a classifier for few-shot topic classification. DRAFT uses a few examples of a specific topic as queries to construct a customized dataset with a dense retriever model. A multi-query retrieval (MQR) algorithm, which effectively handles multiple queries related to a specific topic, is applied to construct the customized dataset. Subsequently, we fine-tune a classifier on the customized dataset to identify the topic. To demonstrate the efficacy of our proposed approach, we conduct evaluations on both widely used classification benchmark datasets and manually constructed datasets with 291 diverse topics, which simulate the diverse content encountered in real-world applications. DRAFT shows competitive or superior performance compared to baselines that use in-context learning, such as GPT-3 175B and InstructGPT 175B, on few-shot topic classification tasks despite having 177 times fewer parameters, demonstrating its effectiveness.
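+ Illustrative sketch of the multi-query retrieval step described above: embed each topic example, retrieve its nearest passages, and pool the union into the customized training set. The encoder and corpus objects are placeholders, not DRAFT's actual components.

```python
import numpy as np

def multi_query_retrieve(query_texts, encode, corpus_embeddings, corpus_texts, k=100):
    """Sketch of multi-query retrieval (MQR): retrieve the top-k passages for every
    topic example and take the union as the customized dataset.
    `encode` is any dense encoder returning L2-normalized vectors (placeholder)."""
    selected = set()
    for q in query_texts:
        q_emb = encode(q)                                  # shape: (dim,)
        scores = corpus_embeddings @ q_emb                 # cosine similarity via dot product
        top_idx = np.argsort(-scores)[:k]
        selected.update(top_idx.tolist())
    return [corpus_texts[i] for i in sorted(selected)]

# The returned passages would then be labeled as positives for the topic and used to
# fine-tune the few-shot classifier, as the abstract describes.
```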
+
+
+
+
+ + ☆ MKA: A Scalable Medical Knowledge Assisted Mechanism for Generative + Models on Medical Conversation Tasks + + +
+ Using natural language processing (NLP) technologies to develop medical chatbots makes patient diagnosis more convenient and efficient, which is a typical application of healthcare AI. Because of its importance, a large body of research has emerged. Recently, neural generative models have shown impressive ability as the core of chatbots, but they do not scale well when directly applied to medical conversation due to the lack of medical-specific knowledge. To address this limitation, a scalable Medical Knowledge Assisted mechanism, MKA, is proposed in this paper. The mechanism aims to assist general neural generative models in achieving better performance on the medical conversation task. A medical-specific knowledge graph is designed within the mechanism, containing six types of medical-related information: department, drug, check, symptom, disease, and food. In addition, a specific token concatenation policy is defined to effectively inject medical information into the input data. We evaluate our method on two typical medical datasets, MedDG and MedDialog-CN. The evaluation results demonstrate that models combined with our mechanism outperform the original methods on multiple automatic evaluation metrics. Moreover, MKA-Bert-GPT achieves state-of-the-art performance. The source code is publicly available at: https://github.com/LIANGKE23/Knowledge_Assisted_Medical_Dialogue_Generation_Mechanism
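+ The "token concatenation policy" is only named in the abstract; the snippet below is a hypothetical illustration of how retrieved knowledge-graph triples might be concatenated into the generator's input, with invented separator tokens.

```python
def build_input(dialogue_history, knowledge_triples, kb_sep="[KB]", turn_sep="[SEP]"):
    """Sketch of injecting medical KG facts into a generative model's input.
    The separator tokens and triple format are assumptions, not the paper's exact policy."""
    kb_text = " ".join(f"{h} {r} {t}" for h, r, t in knowledge_triples)
    history = f" {turn_sep} ".join(dialogue_history)
    return f"{kb_sep} {kb_text} {turn_sep} {history}"

# Hypothetical example:
# build_input(["I have a headache and fever."],
#             [("fever", "checked_by", "blood routine test")])
```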
+
+
+
+
+ + ☆ MedDM:LLM-executable clinical guidance tree for clinical decision-making + + +
+ There is increasing emphasis on the importance of LLMs participating in clinical diagnostic decision-making. However, current medical LLMs suffer from low specialization: they cannot provide specific medical advice and behave more like medical Q\&A systems. Moreover, there is no suitable clinical guidance tree dataset that can be used directly with an LLM. To address this issue, we first propose the LLM-executable clinical guidance tree (CGT), which can be directly used by large language models, and construct a medical diagnostic decision-making dataset (MedDM) from flowcharts in clinical practice guidelines. We propose an approach to screen flowcharts from the medical literature, followed by their identification and conversion into standardized diagnostic decision trees. We construct a knowledge base of 1,202 decision trees, drawn from 5,000 medical publications and covering 12 hospital departments, including internal medicine, surgery, and psychiatry, and over 500 diseases. Moreover, we propose a method for reasoning over LLM-executable CGTs and a patient-LLM multi-turn dialogue framework.
+
+
+
+
+ + ☆ Let's Think Outside the Box: Exploring Leap-of-Thought in Large Language + Models with Creative Humor Generation + + +
+ Chain-of-Thought (CoT) guides large language models (LLMs) to reason step-by-step and can strengthen their logical reasoning ability. While effective for logical tasks, CoT is not conducive to creative problem-solving, which often requires out-of-the-box thinking and is crucial for innovation. In this paper, we explore the Leap-of-Thought (LoT) abilities within LLMs -- a non-sequential, creative paradigm involving strong associations and knowledge leaps. To this end, we study LLMs on the popular Oogiri game, which requires participants to have good creativity and strong associative thinking to respond unexpectedly and humorously to a given image, text, or both, and thus is suitable for studying LoT. To investigate LLMs' LoT ability in the Oogiri game, we first build a multimodal and multilingual Oogiri-GO dataset containing over 130,000 samples from the game, and observe the insufficient LoT ability or outright failures of most existing LLMs on it. Accordingly, we introduce a Creative Leap-of-Thought (CLoT) paradigm to improve LLMs' LoT ability. CLoT first formulates the Oogiri-GO dataset into LoT-oriented instruction-tuning data to train a pretrained LLM toward certain LoT humor generation and discrimination abilities. CLoT then designs an explorative self-refinement that encourages the LLM to generate more creative LoT data by exploring parallels between seemingly unrelated concepts, and selects high-quality data to train itself for self-refinement. CLoT not only excels at humor generation in the Oogiri game but also boosts creative abilities in various tasks such as the cloud guessing game and the divergent association task. These findings advance our understanding and offer a pathway to improve LLMs' creative capacities for innovative applications across domains. The dataset, code, and models will be released online. https://github.com/sail-sg/CLoT.
+
+ comment: Technical report +
+
+
+
+
+ + ☆ MUFFIN: Curating Multi-Faceted Instructions for Improving + Instruction-Following + + +
+ In the realm of large language models (LLMs), enhancing instruction-following +capability often involves curating expansive training data. This is achieved +through two primary schemes: i) Scaling-Inputs: Amplifying (input, output) +pairs per task instruction, aiming for better instruction adherence. ii) +Scaling Input-Free Tasks: Enlarging tasks, each composed of an (instruction, +output) pair (without requiring a separate input anymore). However, LLMs under +Scaling-Inputs tend to be overly sensitive to inputs, leading to +misinterpretation or non-compliance with instructions. Conversely, Scaling +Input-Free Tasks demands a substantial number of tasks but is less effective in +instruction following when dealing with instances in Scaling-Inputs. This work +introduces MUFFIN, a new scheme of instruction-following dataset curation. +Specifically, we automatically Scale Tasks per Input by diversifying these +tasks with various input facets. Experimental results across four zero-shot +benchmarks, spanning both Scaling-Inputs and Scaling Input-Free Tasks schemes, +reveal that LLMs, at various scales, trained on MUFFIN generally demonstrate +superior instruction-following capabilities compared to those trained on the +two aforementioned schemes. + +
+
+ comment: Website: https://renzelou.github.io/Muffin/ +
+
+
+
+
+ + ☆ Visually Grounded Language Learning: a review of language games, + datasets, tasks, and models + + +
+ In recent years, several machine learning models have been proposed. They are +trained with a language modelling objective on large-scale text-only data. With +such pretraining, they can achieve impressive results on many Natural Language +Understanding and Generation tasks. However, many facets of meaning cannot be +learned by ``listening to the radio" only. In the literature, many +Vision+Language (V+L) tasks have been defined with the aim of creating models +that can ground symbols in the visual modality. In this work, we provide a +systematic literature review of several tasks and models proposed in the V+L +field. We rely on Wittgenstein's idea of `language games' to categorise such +tasks into 3 different families: 1) discriminative games, 2) generative games, +and 3) interactive games. Our analysis of the literature provides evidence that +future work should be focusing on interactive games where communication in +Natural Language is important to resolve ambiguities about object referents and +action plans and that physical embodiment is essential to understand the +semantics of situations and events. Overall, these represent key requirements +for developing grounded meanings in neural models. + +
+
+ comment: Preprint for JAIR before copyediting +
+
+
+
+
+ + ☆ Decoding Data Quality via Synthetic Corruptions: Embedding-guided + Pruning of Code Data SP + + +
+ Code datasets, often collected from diverse and uncontrolled sources such as +GitHub, potentially suffer from quality issues, thereby affecting the +performance and training efficiency of Large Language Models (LLMs) optimized +for code generation. Previous studies demonstrated the benefit of using +embedding spaces for data pruning, but they mainly focused on duplicate removal +or increasing variety, and in other modalities, such as images. Our work +focuses on using embeddings to identify and remove "low-quality" code data. +First, we explore features of "low-quality" code in embedding space, through +the use of synthetic corruptions. Armed with this knowledge, we devise novel +pruning metrics that operate in embedding space to identify and remove +low-quality entries in the Stack dataset. We demonstrate the benefits of this +synthetic corruption informed pruning (SCIP) approach on the well-established +HumanEval and MBPP benchmarks, outperforming existing embedding-based methods. +Importantly, we achieve up to a 3% performance improvement over no pruning, +thereby showing the promise of insights from synthetic corruptions for data +pruning. + +
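+ Illustrative sketch of the embedding-space idea described above: locate where synthetically corrupted code lands in embedding space and prune real files that sit too close to it. The distance-to-centroid metric and the threshold are assumptions, not necessarily the paper's exact pruning metric.

```python
import numpy as np

def prune_by_corruption_proximity(clean_embs, corrupted_embs, keep_fraction=0.97):
    """Sketch of synthetic-corruption-informed pruning: score each real code file by its
    distance to the centroid of synthetically corrupted files and drop the closest ones."""
    corrupt_centroid = corrupted_embs.mean(axis=0)
    dist_to_corrupt = np.linalg.norm(clean_embs - corrupt_centroid, axis=1)
    cutoff = np.quantile(dist_to_corrupt, 1.0 - keep_fraction)
    keep_mask = dist_to_corrupt > cutoff   # keep files far from the "low-quality" region
    return keep_mask
```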
+
+ comment: 12 pages, 4 figures, Oral Presentation at 3rd Workshop on Efficient + Natural Language and Speech Processing (ENLSP-III), NeurIPS 2023 +
+
+
+
+
+ + ☆ Efficient Online Data Mixing For Language Model Pre-Training + + +
+ The data used to pretrain large language models has a decisive impact on a +model's downstream performance, which has led to a large body of work on data +selection methods that aim to automatically determine the most suitable data to +use for pretraining. Existing data selection methods suffer from slow and +computationally expensive processes, a problem amplified by the increasing size +of models and of pretraining datasets. Data mixing, on the other hand, reduces +the complexity of data selection by grouping data points together and +determining sampling probabilities across entire groups. However, data mixing +proportions are typically fixed before training and therefore cannot adapt to +changing training dynamics. To address these limitations, we develop an +efficient algorithm for Online Data Mixing (ODM) that combines elements from +both data selection and data mixing. Based on multi-armed bandit algorithms, +our online approach optimizes the data mixing proportions during training. +Remarkably, our method trains a model that reaches the final perplexity of the +next best method with 19\% fewer training iterations, and improves performance +on the 5-shot MMLU benchmark by 1.9% relative accuracy, while adding negligible +wall-clock time during pretraining. + +
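+ As a rough illustration of the bandit view described above, an EXP3-style mixer over data domains could look like the sketch below; the reward definition (e.g., recent loss reduction on a domain) and the hyperparameters are assumptions rather than ODM's actual algorithm.

```python
import numpy as np

class Exp3DataMixer:
    """Sketch of online data mixing with a multi-armed bandit (EXP3-style).
    Each arm is a data domain; sampling probabilities adapt to a reward signal
    such as the recent loss on that domain."""
    def __init__(self, n_domains, lr=0.01):
        self.weights = np.zeros(n_domains)
        self.lr = lr

    def probs(self):
        w = np.exp(self.weights - self.weights.max())
        return w / w.sum()

    def sample_domain(self, rng=np.random):
        return rng.choice(len(self.weights), p=self.probs())

    def update(self, domain, reward):
        # importance-weighted update so rarely sampled domains are not under-credited
        self.weights[domain] += self.lr * reward / self.probs()[domain]
```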
+
+
+
+
+ + ☆ A Comparative Study of AI-Generated (GPT-4) and Human-crafted MCQs in + Programming Education + + +
+ There is a constant need for educators to develop and maintain effective +up-to-date assessments. While there is a growing body of research in computing +education on utilizing large language models (LLMs) in generation and +engagement with coding exercises, the use of LLMs for generating programming +MCQs has not been extensively explored. We analyzed the capability of GPT-4 to +produce multiple-choice questions (MCQs) aligned with specific learning +objectives (LOs) from Python programming classes in higher education. +Specifically, we developed an LLM-powered (GPT-4) system for generation of MCQs +from high-level course context and module-level LOs. We evaluated 651 +LLM-generated and 449 human-crafted MCQs aligned to 246 LOs from 6 Python +courses. We found that GPT-4 was capable of producing MCQs with clear language, +a single correct choice, and high-quality distractors. We also observed that +the generated MCQs appeared to be well-aligned with the LOs. Our findings can +be leveraged by educators wishing to take advantage of the state-of-the-art +generative models to support MCQ authoring efforts. + +
+
+
+
+
+ + ☆ Combining Counting Processes and Classification Improves a Stopping Rule + for Technology Assisted Review EMNLP 2023 + + +
+ Technology Assisted Review (TAR) stopping rules aim to reduce the cost of +manually assessing documents for relevance by minimising the number of +documents that need to be examined to ensure a desired level of recall. This +paper extends an effective stopping rule using information derived from a text +classifier that can be trained without the need for any additional annotation. +Experiments on multiple data sets (CLEF e-Health, TREC Total Recall, TREC Legal +and RCV1) showed that the proposed approach consistently improves performance +and outperforms several alternative methods. + +
+
+ comment: Accepted at EMNLP 2023 Findings +
+
+
+
+
+ + ☆ FlexModel: A Framework for Interpretability of Distributed Large + Language Models NeurIPS 2023 + + +
+ With the growth of large language models, now incorporating billions of +parameters, the hardware prerequisites for their training and deployment have +seen a corresponding increase. Although existing tools facilitate model +parallelization and distributed training, deeper model interactions, crucial +for interpretability and responsible AI techniques, still demand thorough +knowledge of distributed computing. This often hinders contributions from +researchers with machine learning expertise but limited distributed computing +background. Addressing this challenge, we present FlexModel, a software package +providing a streamlined interface for engaging with models distributed across +multi-GPU and multi-node configurations. The library is compatible with +existing model distribution libraries and encapsulates PyTorch models. It +exposes user-registerable HookFunctions to facilitate straightforward +interaction with distributed model internals, bridging the gap between +distributed and single-device model paradigms. Primarily, FlexModel enhances +accessibility by democratizing model interactions and promotes more inclusive +research in the domain of large-scale neural networks. The package is found at +https://github.com/VectorInstitute/flex_model. + +
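+ For readers unfamiliar with the hook mechanism behind the HookFunctions mentioned above, the single-device analogue in plain PyTorch is a forward hook, sketched below; this is standard PyTorch, not FlexModel's own API, which generalizes the pattern to models sharded across many GPUs and nodes.

```python
import torch
import torch.nn as nn

# Single-device illustration of the hook idea: capture (and optionally edit)
# an intermediate activation during the forward pass.
model = nn.Sequential(nn.Linear(16, 32), nn.ReLU(), nn.Linear(32, 4))
captured = {}

def save_activation(module, inputs, output):
    captured["hidden"] = output.detach().clone()
    return output  # returning a modified tensor here would edit the activation in place

handle = model[1].register_forward_hook(save_activation)
_ = model(torch.randn(8, 16))
handle.remove()
print(captured["hidden"].shape)  # torch.Size([8, 32])
```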
+
+ comment: 14 pages, 8 figures. To appear at the Socially Responsible Language + Modelling Research (SoLaR) Workshop, 37th Conference on Neural Information + Processing Systems (NeurIPS 2023) +
+
+
+
+
+ + ☆ Assertion Enhanced Few-Shot Learning: Instructive Technique for Large + Language Models to Generate Educational Explanations + + +
+ Human educators possess an intrinsic ability to anticipate and seek educational explanations from students, which drives them to pose thought-provoking questions when students cannot articulate these explanations independently. We aim to imbue Intelligent Tutoring Systems with this ability using the few-shot learning capability of Large Language Models. Our work proposes a novel prompting technique, Assertion Enhanced Few-Shot Learning, to facilitate the generation of accurate, detail-oriented educational explanations. Our central hypothesis is that, in the educational domain, few-shot demonstrations are a necessary but not sufficient condition for quality explanation generation. We conducted a study involving 12 in-service teachers, comparing our approach to Traditional Few-Shot Learning. The results show that Assertion Enhanced Few-Shot Learning improves explanation accuracy by 15% and yields higher-quality explanations, as evaluated by teachers. We also conduct a qualitative ablation study to isolate the impact of assertions and to provide educator-friendly prompting guidelines for generating explanations in their domain of interest.
+
+
+
+
+ + ☆ Understanding Environmental Posts: Sentiment and Emotion Analysis of + Social Media Data + + +
+ Social media is now the predominant source of information due to the availability of immediate public response. As a result, social media data has become a valuable resource for comprehending public sentiment. Studies have shown that it can amplify ideas and influence public sentiment. This study analyzes the public perception of climate change and the environment over a decade, from 2014 to 2023. Using the Pointwise Mutual Information (PMI) algorithm, we identify sentiment and explore the prevailing emotions expressed within environmental tweets across various social media platforms, namely Twitter, Reddit, and YouTube. Accuracy on a human-annotated dataset was 0.65, higher than the VADER score but lower than that of an expert rater (0.90). Our findings suggest that negative environmental tweets are far more common than positive or neutral ones. Climate change, air quality, emissions, plastic, and recycling are the most discussed topics on all social media platforms, highlighting the scale of global concern. The most common emotions in environmental tweets are fear, trust, and anticipation, demonstrating the wide and complex nature of public reactions. By identifying patterns and trends in opinions related to the environment, we hope to provide insights that can help raise awareness of environmental issues, inform the development of interventions, and adapt further actions to meet environmental challenges.
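+ For reference, PMI-based sentiment scoring typically compares how strongly a word co-occurs with positive versus negative seed contexts; the sketch below follows that common recipe, and the count bookkeeping is a placeholder rather than the paper's exact procedure.

```python
import math

def pmi(word_class_count, word_count, class_count, total):
    """PMI(word, class) = log2( P(word, class) / (P(word) * P(class)) )."""
    p_joint = word_class_count / total
    p_word = word_count / total
    p_class = class_count / total
    return math.log2(p_joint / (p_word * p_class))

def sentiment_score(word, counts):
    """Sketch: a word's polarity is PMI with the positive class minus PMI with the
    negative class. `counts` is a hypothetical dict of corpus co-occurrence counts."""
    return (pmi(counts[(word, "pos")], counts[word], counts["pos"], counts["total"])
            - pmi(counts[(word, "neg")], counts[word], counts["neg"], counts["total"]))
```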
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ RESIN-EDITOR: A Schema-guided Hierarchical Event Graph Visualizer and + Editor + + +
+ In this paper, we present RESIN-EDITOR, an interactive event graph visualizer +and editor designed for analyzing complex events. Our RESIN-EDITOR system +allows users to render and freely edit hierarchical event graphs extracted from +multimedia and multi-document news clusters with guidance from human-curated +event schemas. RESIN-EDITOR's unique features include hierarchical graph +visualization, comprehensive source tracing, and interactive user editing, +which is more powerful and versatile than existing Information Extraction (IE) +visualization tools. In our evaluation of RESIN-EDITOR, we demonstrate ways in +which our tool is effective in understanding complex events and enhancing +system performance. The source code, a video demonstration, and a live website +for RESIN-EDITOR have been made publicly available. + +
+
+ comment: The first two authors contribute equally to this paper +
+
+
+
+
+ + ☆ LLMs for Multi-Modal Knowledge Extraction and Analysis in + Intelligence/Safety-Critical Applications + + +
+ Large Language Models have seen rapid progress in capability in recent years; this progress has been accelerating, and their capabilities, measured by various benchmarks, are beginning to approach those of humans. There is a strong demand to use such models in a wide variety of applications but, due to unresolved vulnerabilities and limitations, great care must be taken before applying them to intelligence and safety-critical applications. This paper reviews recent literature related to LLM assessment and vulnerabilities to synthesize the current research landscape and to help understand what advances are most critical to enable the use of these technologies in intelligence and safety-critical applications. The vulnerabilities are broken down into ten high-level categories and overlaid onto a high-level life cycle of an LLM. Some general categories of mitigations are reviewed.
+
+ comment: initial draft +
+
+
+
+
+ + ☆ Clinical Notes Reveal Physician Fatigue + + +
+ Physicians write notes about patients. In doing so, they reveal much about +themselves. Using data from 129,228 emergency room visits, we train a model to +identify notes written by fatigued physicians -- those who worked 5 or more of +the prior 7 days. In a hold-out set, the model accurately identifies notes +written by these high-workload physicians, and also flags notes written in +other high-fatigue settings: on overnight shifts, and after high patient +volumes. Model predictions also correlate with worse decision-making on at +least one important metric: yield of testing for heart attack is 18% lower with +each standard deviation increase in model-predicted fatigue. Finally, the model +indicates that notes written about Black and Hispanic patients have 12% and 21% +higher predicted fatigue than Whites -- larger than overnight vs. daytime +differences. These results have an important implication for large language +models (LLMs). Our model indicates that fatigued doctors write more predictable +notes. Perhaps unsurprisingly, because word prediction is the core of how LLMs +work, we find that LLM-written notes have 17% higher predicted fatigue than +real physicians' notes. This indicates that LLMs may introduce distortions in +generated text that are not yet fully understood. + +
+
+
+
+
+ + ☆ Visual Program Distillation: Distilling Tools and Programmatic Reasoning + into Vision-Language Models + + +
+ Solving complex visual tasks such as "Who invented the musical instrument on +the right?" involves a composition of skills: understanding space, recognizing +instruments, and also retrieving prior knowledge. Recent work shows promise by +decomposing such tasks using a large language model (LLM) into an executable +program that invokes specialized vision models. However, generated programs are +error-prone: they omit necessary steps, include spurious ones, and are unable +to recover when the specialized models give incorrect outputs. Moreover, they +require loading multiple models, incurring high latency and computation costs. +We propose Visual Program Distillation (VPD), an instruction tuning framework +that produces a vision-language model (VLM) capable of solving complex visual +tasks with a single forward pass. VPD distills the reasoning ability of LLMs by +using them to sample multiple candidate programs, which are then executed and +verified to identify a correct one. It translates each correct program into a +language description of the reasoning steps, which are then distilled into a +VLM. Extensive experiments show that VPD improves the VLM's ability to count, +understand spatial relations, and reason compositionally. Our VPD-trained +PaLI-X outperforms all prior VLMs, achieving state-of-the-art performance +across complex vision tasks, including MMBench, OK-VQA, A-OKVQA, TallyQA, POPE, +and Hateful Memes. An evaluation with human annotators also confirms that VPD +improves model response factuality and consistency. Finally, experiments on +content moderation demonstrate that VPD is also helpful for adaptation to +real-world applications with limited data. + +
+
+
+
+
+ + ☆ Inherent limitations of LLMs regarding spatial information + + +
+ Despite the significant advancements in natural language processing +capabilities demonstrated by large language models such as ChatGPT, their +proficiency in comprehending and processing spatial information, especially +within the domains of 2D and 3D route planning, remains notably underdeveloped. +This paper investigates the inherent limitations of ChatGPT and similar models +in spatial reasoning and navigation-related tasks, an area critical for +applications ranging from autonomous vehicle guidance to assistive technologies +for the visually impaired. In this paper, we introduce a novel evaluation +framework complemented by a baseline dataset, meticulously crafted for this +study. This dataset is structured around three key tasks: plotting spatial +points, planning routes in two-dimensional (2D) spaces, and devising pathways +in three-dimensional (3D) environments. We specifically developed this dataset +to assess the spatial reasoning abilities of ChatGPT. Our evaluation reveals +key insights into the model's capabilities and limitations in spatial +understanding. + +
+
+
+
+
+ + ☆ Training on Synthetic Data Beats Real Data in Multimodal Relation + Extraction + + +
+ The task of multimodal relation extraction has attracted significant research attention, but progress is constrained by the scarcity of available training data. One natural thought is to extend existing datasets with cross-modal generative models. In this paper, we consider a novel problem setting, where only unimodal data, either text or image, are available during training. We aim to train a multimodal classifier from synthetic data that performs well on real multimodal test data. However, training with synthetic data suffers from two obstacles: lack of data diversity and label information loss. To alleviate these issues, we propose Mutual Information-aware Multimodal Iterated Relational dAta GEneration (MI2RAGE), which applies Chained Cross-modal Generation (CCG) to promote diversity in the generated data and exploits a teacher network to select valuable training samples with high mutual information with the ground-truth labels. Compared to direct training on synthetic data, our method achieves a significant improvement of 24.06% F1 with synthetic text and 26.42% F1 with synthetic images. Notably, our best model trained on completely synthetic images outperforms prior state-of-the-art models trained on real multimodal data by a margin of 3.76% in F1. Our codebase will be made available upon acceptance.
+
+
+
+
+ + ☆ Beyond Isolation: Multi-Agent Synergy for Improving Knowledge Graph + Construction + + +
+ Knowledge graph construction (KGC) is a multifaceted undertaking involving +the extraction of entities, relations, and events. Traditionally, large +language models (LLMs) have been viewed as solitary task-solving agents in this +complex landscape. However, this paper challenges this paradigm by introducing +a novel framework, CooperKGC. Departing from the conventional approach, +CooperKGC establishes a collaborative processing network, assembling a KGC +collaboration team capable of concurrently addressing entity, relation, and +event extraction tasks. Our experiments unequivocally demonstrate that +fostering collaboration and information interaction among diverse agents within +CooperKGC yields superior results compared to individual cognitive processes +operating in isolation. Importantly, our findings reveal that the collaboration +facilitated by CooperKGC enhances knowledge selection, correction, and +aggregation capabilities across multiple rounds of interactions. + +
+
+ comment: work in progress; 12 pages +
+
+
+
+
+ + ☆ Protein Language Model-Powered 3D Ligand Binding Site Prediction from + Protein Sequence NeurIPS 2023 + + +
+ Prediction of ligand binding sites of proteins is a fundamental and important +task for understanding the function of proteins and screening potential drugs. +Most existing methods require experimentally determined protein holo-structures +as input. However, such structures can be unavailable on novel or less-studied +proteins. To tackle this limitation, we propose LaMPSite, which only takes +protein sequences and ligand molecular graphs as input for ligand binding site +predictions. The protein sequences are used to retrieve residue-level +embeddings and contact maps from the pre-trained ESM-2 protein language model. +The ligand molecular graphs are fed into a graph neural network to compute +atom-level embeddings. Then we compute and update the protein-ligand +interaction embedding based on the protein residue-level embeddings and ligand +atom-level embeddings, and the geometric constraints in the inferred protein +contact map and ligand distance map. A final pooling on protein-ligand +interaction embedding would indicate which residues belong to the binding +sites. Without any 3D coordinate information of proteins, our proposed model +achieves competitive performance compared to baseline methods that require 3D +protein structures when predicting binding sites. Given that less than 50% of +proteins have reliable structure information in the current stage, LaMPSite +will provide new opportunities for drug discovery. + +
+
+ comment: Accepted by the AI for Science (AI4Science) Workshop and the New + Frontiers of AI for Drug Discovery and Development (AI4D3) Workshop at + NeurIPS 2023 +
+
+
+
+
+ + ☆ GPT vs Human for Scientific Reviews: A Dual Source Review on + Applications of ChatGPT in Science + + +
+ The new polymath Large Language Models (LLMs) can greatly speed up scientific reviews, possibly using more unbiased quantitative metrics, facilitating cross-disciplinary connections, and identifying emerging trends and research gaps by analyzing large volumes of data. However, at the present time, they lack the deep understanding of complex methodologies that is required, have difficulty evaluating innovative claims, and are unable to assess ethical issues and conflicts of interest. Herein, we consider 13 GPT-related papers across different scientific domains, reviewed by a human reviewer and SciSpace, a large language model, with the reviews evaluated by three distinct types of evaluators, namely GPT-3.5, a crowd panel, and GPT-4. We found that 50% of SciSpace's responses to objective questions align with those of a human reviewer, with GPT-4 (informed evaluator) often rating the human reviewer higher in accuracy, and SciSpace higher in structure, clarity, and completeness. In subjective questions, the uninformed evaluators (GPT-3.5 and crowd panel) showed varying preferences between SciSpace and human responses, with the crowd panel showing a preference for the human responses. However, GPT-4 rated them equally in accuracy and structure but favored SciSpace for completeness.
+
+
+
+
+ + ☆ Mismatch Quest: Visual and Textual Feedback for Image-Text Misalignment + + +
+ While existing image-text alignment models reach high quality binary +assessments, they fall short of pinpointing the exact source of misalignment. +In this paper, we present a method to provide detailed textual and visual +explanation of detected misalignments between text-image pairs. We leverage +large language models and visual grounding models to automatically construct a +training set that holds plausible misaligned captions for a given image and +corresponding textual explanations and visual indicators. We also publish a new +human curated test set comprising ground-truth textual and visual misalignment +annotations. Empirical results show that fine-tuning vision language models on +our training set enables them to articulate misalignments and visually indicate +them within images, outperforming strong baselines both on the binary alignment +classification and the explanation generation tasks. Our method code and human +curated test set are available at: https://mismatch-quest.github.io/ + +
+
+
+
+
+ + ♻ ☆ Jellyfish: A Large Language Model for Data Preprocessing + + +
+ In this paper, we present Jellyfish, an open-source LLM as a universal task +solver for DP. Built on the Llama 2 13B model, Jellyfish is instruction-tuned +with the datasets of several typical DP tasks including error detection, data +imputation, schema matching, and entity matching, and delivers generalizability +to other tasks. Remarkably, Jellyfish can operate on a local, single, and +low-priced GPU with its 13 billion parameters, ensuring data security and +enabling further tuning. Its proficiency in understanding natural language +allows users to manually craft instructions for DP tasks. Unlike many existing +methods that heavily rely on prior knowledge, Jellyfish acquires domain +knowledge during its tuning process and integrates optional knowledge injection +during inference. A distinctive feature of Jellyfish is its interpreter, which +elucidates its output decisions. To construct Jellyfish, we develop a series of +pre-tuning and DP-tuning techniques. Jellyfish is equipped with an instance +serializer, which automatically translates raw data into model prompts, and a +knowledge injector, which optionally introduces task- and dataset-specific +knowledge to enhance DP performance. Our evaluation of Jellyfish, using a range +of real datasets, shows its competitiveness compared to state-of-the-art +methods and its strong generalizability to unseen tasks. Jellyfish's +performance rivals that of GPT series models, and its interpreter offers +enhanced reasoning capabilities compared to GPT-3.5. Furthermore, our +evaluation highlights the effectiveness of the techniques employed in +constructing Jellyfish. Our model is available at Hugging Face: +https://huggingface.co/NECOUDBFM/Jellyfish . + +
+
+
+
+
+ + ♻ ☆ ChatGPT's One-year Anniversary: Are Open-Source Large Language Models + Catching up? + + +
+ Upon its release in late 2022, ChatGPT brought a seismic shift to the entire landscape of AI, both in research and commerce. Through instruction-tuning a large language model (LLM) with supervised fine-tuning and reinforcement learning from human feedback, it showed that a model could answer human questions and follow instructions on a broad panel of tasks. Following this success, interest in LLMs has intensified, with new LLMs flourishing at frequent intervals across academia and industry, including many start-ups focused on LLMs. While closed-source LLMs (e.g., OpenAI's GPT, Anthropic's Claude) generally outperform their open-source counterparts, progress on the latter has been rapid, with claims of achieving parity or even better performance on certain tasks. This has crucial implications not only for research but also for business. In this work, on the first anniversary of ChatGPT, we provide an exhaustive overview of this success, surveying all tasks where an open-source LLM has claimed to be on par with or better than ChatGPT.
+
+ comment: version v3 +
+
+
+
+
+ + ♻ ☆ DEPN: Detecting and Editing Privacy Neurons in Pretrained Language + Models EMNLP 2023 + + +
+ Large language models pretrained on a huge amount of data capture rich knowledge and information from the training data. The ability of pretrained language models to memorize and regurgitate data, revealed in previous studies, brings the risk of data leakage. In order to effectively reduce these risks, we propose a framework, DEPN, to Detect and Edit Privacy Neurons in pretrained language models, partially inspired by knowledge neurons and model editing. In DEPN, we introduce a novel method, termed the privacy neuron detector, to locate neurons associated with private information, and then edit these detected privacy neurons by setting their activations to zero. Furthermore, we propose a privacy neuron aggregator to dememorize private information in a batch-processing manner. Experimental results show that our method can significantly and efficiently reduce the exposure of private data leakage without deteriorating the performance of the model. Additionally, we empirically demonstrate the relationship between model memorization and privacy neurons from multiple perspectives, including model size, training time, prompts, and privacy neuron distribution, illustrating the robustness of our approach.
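+ Illustrative sketch of the editing step described above (zeroing the activations of detected privacy neurons via a forward hook); the layer path and neuron indices are placeholders, and the detection step itself is not shown.

```python
import torch

def make_neuron_zeroing_hook(neuron_indices):
    """Sketch of the 'edit' step in DEPN-style privacy-neuron editing: zero the activations
    of previously detected neurons in a given layer. Assumes the hooked module returns a
    plain tensor (e.g., an MLP); which layer and indices to edit come from detection."""
    def hook(module, inputs, output):
        output[..., neuron_indices] = 0.0
        return output
    return hook

# Hypothetical usage on a transformer block (the attribute path depends on the model):
# layer = model.transformer.h[7].mlp
# handle = layer.register_forward_hook(make_neuron_zeroing_hook([13, 271, 904]))
```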
+
+ comment: EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ AlignBench: Benchmarking Chinese Alignment of Large Language Models + + +
+ Alignment has become a critical step for instruction-tuned Large Language +Models (LLMs) to become helpful assistants. However, effective evaluation of +alignment for emerging Chinese LLMs is still significantly lacking, calling for +real-scenario grounded, open-ended, challenging and automatic evaluations +tailored for alignment. To fill in this gap, we introduce AlignBench, a +comprehensive multi-dimensional benchmark for evaluating LLMs' alignment in +Chinese. Equipped with a human-in-the-loop data curation pipeline, our +benchmark employs a rule-calibrated multi-dimensional LLM-as-Judge with +Chain-of-Thought to generate explanations and final ratings as evaluations, +ensuring high reliability and interpretability. Furthermore, we report +AlignBench evaluated by CritiqueLLM, a dedicated Chinese evaluator LLM that +recovers 95% of GPT-4's evaluation ability. We will provide public APIs for +evaluating AlignBench with CritiqueLLM to facilitate the evaluation of LLMs' +Chinese alignment. All evaluation codes, data, and LLM generations are +available at \url{https://github.com/THUDM/AlignBench}. + +
+
+
+
+
+ + ♻ ☆ ScienceBenchmark: A Complex Real-World Benchmark for Evaluating Natural + Language to SQL Systems + + +
+ Natural Language to SQL systems (NL-to-SQL) have recently shown a significant +increase in accuracy for natural language to SQL query translation. This +improvement is due to the emergence of transformer-based language models, and +the popularity of the Spider benchmark - the de-facto standard for evaluating +NL-to-SQL systems. The top NL-to-SQL systems reach accuracies of up to 85\%. +However, Spider mainly contains simple databases with few tables, columns, and +entries, which does not reflect a realistic setting. Moreover, complex +real-world databases with domain-specific content have little to no training +data available in the form of NL/SQL-pairs leading to poor performance of +existing NL-to-SQL systems. + In this paper, we introduce ScienceBenchmark, a new complex NL-to-SQL +benchmark for three real-world, highly domain-specific databases. For this new +benchmark, SQL experts and domain experts created high-quality NL/SQL-pairs for +each domain. To garner more data, we extended the small amount of +human-generated data with synthetic data generated using GPT-3. We show that +our benchmark is highly challenging, as the top performing systems on Spider +achieve a very low performance on our benchmark. Thus, the challenge is +many-fold: creating NL-to-SQL systems for highly complex domains with a small +amount of hand-made training data augmented with synthetic data. To our +knowledge, ScienceBenchmark is the first NL-to-SQL benchmark designed with +complex real-world scientific databases, containing challenging training and +test data carefully validated by domain experts. + +
+
+ comment: 12 pages, 2 figures, 5 tables +
+
+
+
+
+ + ♻ ☆ Real-Time Online Stock Forecasting Utilizing Integrated Quantitative and + Qualitative Analysis + + +
+ The application of machine learning to finance has become a familiar approach, even more so in stock market forecasting. The stock market is highly volatile, and huge amounts of data are generated every minute globally. Extracting effective intelligence from this data is of critical importance. However, combining numerical stock data with qualitative text data can be a challenging task. In this work, we accomplish this and provide an unprecedented, publicly available dataset with technical and fundamental data and sentiment that we gathered from news archives, TV news captions, radio transcripts, tweets, daily financial newspapers, etc. The text data entries used for sentiment extraction total more than 1.4 million. The dataset consists of daily entries from January 2018 to December 2022 for 8 companies representing diverse industrial sectors and for the Dow Jones Industrial Average (DJIA) as a whole. Holistic fundamental and technical data are provided, training-ready for model learning and deployment. The data generated could be used for incremental online learning with real-time data points retrieved daily, since no stagnant data was utilized; all the data was retrieved from APIs or self-designed scripts. Moreover, applying Spearman's rank correlation over real-time data, linking stock returns with sentiment analysis, has produced noteworthy results for the DJIA, achieving accuracy levels surpassing 60\%. The dataset is made available at https://github.com/batking24/Huge-Stock-Dataset
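+ The Spearman's rank correlation mentioned above is straightforward to reproduce with SciPy once daily sentiment scores are aligned with returns; the file and column names below are assumptions about the released dataset's layout, not its documented schema.

```python
import pandas as pd
from scipy.stats import spearmanr

# Assumed layout: one row per trading day with a sentiment score and the next-day return.
df = pd.read_csv("djia_daily.csv", parse_dates=["date"])
rho, pvalue = spearmanr(df["sentiment_score"], df["next_day_return"])
print(f"Spearman rho = {rho:.3f} (p = {pvalue:.3g})")
```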
+
+
+
+
+ + ♻ ☆ Unifying the Perspectives of NLP and Software Engineering: A Survey on + Language Models for Code + + +
+ In this work we systematically review the recent advancements in code +processing with language models, covering 50+ models, 30+ evaluation tasks, +170+ datasets, and 700 related works. We break down code processing models into +general language models represented by the GPT family and specialized models +that are specifically pretrained on code, often with tailored objectives. We +discuss the relations and differences between these models, and highlight the +historical transition of code modeling from statistical models and RNNs to +pretrained Transformers and LLMs, which is exactly the same course that had +been taken by NLP. We also discuss code-specific features such as AST, CFG, and +unit tests, along with their application in training code language models, and +identify key challenges and potential future directions in this domain. We keep +the survey open and updated on GitHub at +https://github.com/codefuse-ai/Awesome-Code-LLM. + +
+
+ comment: Repo is available at https://github.com/codefuse-ai/Awesome-Code-LLM. + 8 figures, 9 tables, and 694 references +
+
+
+
+
+ + ♻ ☆ RS5M: A Large Scale Vision-Language Dataset for Remote Sensing + Vision-Language Foundation Model + + +
+ Pre-trained Vision-Language Models (VLMs) utilizing extensive image-text paired data have demonstrated unprecedented image-text association capabilities, achieving remarkable results across various downstream tasks. A critical challenge is how to make use of existing large-scale pre-trained VLMs, which are trained on common objects, to perform the domain-specific transfer needed for domain-related downstream tasks. In this paper, we propose a new framework that includes the Domain pre-trained Vision-Language Model (DVLM), bridging the gap between the General Vision-Language Model (GVLM) and domain-specific downstream tasks. Moreover, we present an image-text paired dataset in the field of remote sensing (RS), RS5M, which has 5 million RS images with English descriptions. The dataset is obtained by filtering publicly available image-text paired datasets and by captioning label-only RS datasets with a pre-trained VLM. This constitutes the first large-scale RS image-text paired dataset. Additionally, we fine-tuned the CLIP model and tried several Parameter-Efficient Fine-Tuning methods on RS5M to implement the DVLM. Experimental results show that our proposed dataset is highly effective for various tasks, and our model GeoRSCLIP improves upon the baseline or previous state-of-the-art model by $3\%\sim20\%$ in Zero-shot Classification (ZSC), $3\%\sim6\%$ in Remote Sensing Cross-Modal Text-Image Retrieval (RSCTIR) and $4\%\sim5\%$ in Semantic Localization (SeLo) tasks. Dataset and models have been released at: \url{https://github.com/om-ai-lab/RS5M}.
+
+ comment: RS5M dataset v5 +
+
+
+
+
+ + ♻ ☆ Thesis Distillation: Investigating The Impact of Bias in NLP Models on + Hate Speech Detection + + +
+ This paper is a summary of the work done in my PhD thesis. Where I +investigate the impact of bias in NLP models on the task of hate speech +detection from three perspectives: explainability, offensive stereotyping bias, +and fairness. Then, I discuss the main takeaways from my thesis and how they +can benefit the broader NLP community. Finally, I discuss important future +research directions. The findings of my thesis suggest that the bias in NLP +models impacts the task of hate speech detection from all three perspectives. +And that unless we start incorporating social sciences in studying bias in NLP +models, we will not effectively overcome the current limitations of measuring +and mitigating bias in NLP models. + +
+
+
+
+
+ + ♻ ☆ Rethinking Radiology Report Generation via Causal Reasoning and + Counterfactual Augmentation + + +
+ Radiology Report Generation (RRG) draws attention as an interaction between the vision and language fields. Previous works inherited the ideology of vision-to-language generation tasks, aiming to generate paragraphs with high consistency as reports. However, one unique characteristic of RRG, the independence between diseases, was neglected, leading to the injection of disease co-occurrence as a confounder that affects the results through a backdoor path. Unfortunately, the biased RRG data distribution makes this confounder distort the report generation process even further. In this paper, to rethink this issue thoroughly, we reason about its causes and effects from a novel perspective of statistics and causality, where the Joint Vision Coupling and the Conditional Sentence Coherence Coupling are two aspects prone to implicitly decrease the accuracy of reports. Then, a counterfactual augmentation strategy that contains the Counterfactual Sample Synthesis and the Counterfactual Report Reconstruction sub-methods is proposed to break these two aspects of spurious effects. Experimental results and further analyses on two widely used datasets justify our reasoning and the proposed methods.
+
+ comment: 10 pages,5 figures +
+
+
+
+
+ + ♻ ☆ General-Purpose Retrieval-Enhanced Medical Prediction Model Using + Near-Infinite History + + +
+ Developing clinical prediction models (e.g., mortality prediction) based on +electronic health records (EHRs) typically relies on expert opinion for feature +selection and adjusting observation window size. This burdens experts and +creates a bottleneck in the development process. We propose Retrieval-Enhanced +Medical prediction model (REMed) to address such challenges. REMed can +essentially evaluate an unlimited number of clinical events, select the +relevant ones, and make predictions. This approach effectively eliminates the +need for manual feature selection and enables an unrestricted observation +window. We verified these properties through experiments on 27 clinical tasks +and two independent cohorts from publicly available EHR datasets, where REMed +outperformed other contemporary architectures that aim to handle as many events +as possible. Notably, we found that the preferences of REMed align closely with +those of medical experts. We expect our approach to significantly expedite the +development of EHR prediction models by minimizing clinicians' need for manual +involvement. + +
+
+ comment: The source codes corresponding to this paper are available at: + https://github.com/starmpcc/REMed +
+
+
+
+
+ + ♻ ☆ Large Language Models, scientific knowledge and factuality: A systematic + analysis in antibiotic discovery + + +
+ Inferring over and extracting information from Large Language Models (LLMs) trained on a large corpus of scientific literature can potentially drive a new era in biomedical research, reducing the barriers for accessing existing medical evidence. This work examines the potential of LLMs for dialoguing with biomedical background knowledge, using the context of antibiotic discovery. The systematic analysis is applied to ten state-of-the-art models, from models specialised on biomedical scientific corpora to general models such as ChatGPT, GPT-4 and Llama 2, in two prompting-based tasks: chemical compound definition generation and chemical compound-fungus relation determination. The work provides a systematic assessment of the ability of LLMs to encode and express these relations, verifying fluency, prompt-alignment, semantic coherence, factual knowledge and specificity of generated responses. Results show that while recent models have improved in fluency, factual accuracy is still low and models are biased towards over-represented entities. The ability of LLMs to serve as biomedical knowledge bases is questioned, and the need for additional systematic evaluation frameworks is highlighted. The best-performing model, GPT-4, produced factual definitions for 70% of chemical compounds and factual compound-fungus relations in 43.6% of cases, whereas the best open-source model, BioGPT-large, produced factual definitions for 30% of the compounds and factual relations in 30% of cases with its best-performing prompt. The results show that while LLMs are currently not fit for purpose as biomedical factual knowledge bases, there is a promising emerging property in the direction of factuality as models become domain-specialised, scale up in size, and receive more human feedback.
+
+ comment: 28 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ T3D: Towards 3D Medical Image Understanding through Vision-Language + Pre-training + + +
+ Expert annotation of 3D medical image for downstream analysis is +resource-intensive, posing challenges in clinical applications. Visual +self-supervised learning (vSSL), though effective for learning visual +invariance, neglects the incorporation of domain knowledge from medicine. To +incorporate medical knowledge into visual representation learning, +vision-language pre-training (VLP) has shown promising results in 2D image. +However, existing VLP approaches become generally impractical when applied to +high-resolution 3D medical images due to GPU hardware constraints and the +potential loss of critical details caused by downsampling, which is the +intuitive solution to hardware constraints. To address the above limitations, +we introduce T3D, the first VLP framework designed for high-resolution 3D +medical images. T3D incorporates two text-informed pretext tasks: +(\lowerromannumeral{1}) text-informed contrastive learning; +(\lowerromannumeral{2}) text-informed image restoration. These tasks focus on +learning 3D visual representations from high-resolution 3D medical images and +integrating clinical knowledge from radiology reports, without distorting +information through forced alignment of downsampled volumes with detailed +anatomical text. Trained on a newly curated large-scale dataset of 3D medical +images and radiology reports, T3D significantly outperforms current vSSL +methods in tasks like organ and tumor segmentation, as well as disease +classification. This underlines T3D's potential in representation learning for +3D medical image analysis. All data and code will be available upon acceptance. + +
+
+
+
+
+ + ♻ ☆ Investigating the Catastrophic Forgetting in Multimodal Large Language + Models + + +
+ Following the success of GPT-4, there has been a surge in interest in multimodal large language model (MLLM) research. This line of research focuses on developing general-purpose LLMs through fine-tuning pre-trained LLMs and vision models. However, catastrophic forgetting, a notorious phenomenon in which the fine-tuned model fails to retain performance comparable to the pre-trained model, remains an inherent problem in MLLMs. In this paper, we introduce EMT (Evaluating MulTimodality), a framework for evaluating catastrophic forgetting in MLLMs by treating each MLLM as an image classifier. We first apply EMT to evaluate several open-source fine-tuned MLLMs and discover that almost all of them fail to retain the same performance levels as their vision encoders on standard image classification tasks. Moreover, we continue fine-tuning LLaVA, an MLLM, and utilize EMT to assess performance throughout the fine-tuning. Interestingly, our results suggest that early-stage fine-tuning on an image dataset improves performance across other image datasets by enhancing the alignment of text and visual features. However, as fine-tuning proceeds, the MLLMs begin to hallucinate, resulting in a significant loss of generalizability, even when the image encoder remains frozen. Our results suggest that MLLMs have yet to demonstrate performance on par with their vision models on standard image classification tasks, and that the current MLLM fine-tuning procedure still has room for improvement.
+
+
+
+
+ + ♻ ☆ Is one brick enough to break the wall of spoken dialogue state tracking? + + +
+ In Task-Oriented Dialogue (TOD) systems, correctly updating the system's +understanding of the user's needs (a.k.a dialogue state tracking) is key to a +smooth interaction. Traditionally, TOD systems perform this update in three +steps: transcription of the user's utterance, semantic extraction of the key +concepts, and contextualization with the previously identified concepts. Such +cascade approaches suffer from cascading errors and separate optimization. +End-to-End approaches have been proved helpful up to the semantic extraction +step. This paper goes one step further paving the path towards completely +neural spoken dialogue state tracking by comparing three approaches: (1) a +state of the art cascade approach, (2) a locally E2E approach with rule-based +contextualization and (3) a completely neural approach. + +
+
+ comment: Bug found in previous version, ongoing work +
+
+
+
+
+ + ♻ ☆ LongLoRA: Efficient Fine-tuning of Long-Context Large Language Models + + +
+ We present LongLoRA, an efficient fine-tuning approach that extends the context sizes of pre-trained large language models (LLMs) with limited computation cost. Typically, training LLMs with long context sizes is computationally expensive, requiring extensive training hours and GPU resources. For example, training on a context length of 8192 incurs 16x the computational cost in self-attention layers compared to a context length of 2048. In this paper, we speed up the context extension of LLMs in two aspects. On the one hand, although dense global attention is needed during inference, fine-tuning the model can be done effectively and efficiently with sparse local attention. The proposed shifted sparse attention (S$^2$-Attn) effectively enables context extension, leading to non-trivial computation savings with performance similar to fine-tuning with vanilla attention. In particular, it can be implemented with only two lines of code in training, while being optional in inference. On the other hand, we revisit the parameter-efficient fine-tuning regime for context expansion. Notably, we find that LoRA for context extension works well under the premise of trainable embedding and normalization layers. LongLoRA combines this improved LoRA with S$^2$-Attn. LongLoRA demonstrates strong empirical results on various tasks with Llama2 models from 7B/13B to 70B. LongLoRA extends Llama2 7B from a 4k context to 100k, and Llama2 70B to 32k, on a single 8x A100 machine. LongLoRA extends models' context while retaining their original architectures, and is compatible with most existing techniques, like Flash-Attention2. In addition, we further conduct supervised fine-tuning with LongLoRA and our long instruction-following LongAlpaca dataset.
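+ The abstract notes that S$^2$-Attn "can be implemented with only two lines of code in training"; the sketch below is a hedged illustration of that shift-and-group idea (roll half of the attention heads by half a group along the sequence, then compute attention within groups). The tensor layout is an assumption, not the authors' exact code.

```python
import torch

def shifted_sparse_attention_inputs(qkv, group_size):
    """Sketch of the S^2-Attn token shift: half of the heads attend within plain groups,
    the other half within groups shifted by half a group, so information flows between
    neighbouring groups. Assumed layout: qkv is (batch, seq_len, 3, num_heads, head_dim)
    with seq_len divisible by group_size."""
    bsz, seqlen, three, n_heads, head_dim = qkv.shape
    # shift the second half of the heads by half a group along the sequence axis
    qkv[:, :, :, n_heads // 2:] = qkv[:, :, :, n_heads // 2:].roll(-group_size // 2, dims=1)
    # regroup the sequence so attention is computed inside each group independently
    return qkv.reshape(bsz * (seqlen // group_size), group_size, three, n_heads, head_dim)
```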
+
+ comment: Code, models, dataset, and demo are available at + https://github.com/dvlab-research/LongLoRA +
+
+
+
+
+ + ♻ ☆ ViewRefer: Grasp the Multi-view Knowledge for 3D Visual Grounding with + GPT and Prototype Guidance ICCV 2023 + + +
+ Understanding 3D scenes from multi-view inputs has been proven to alleviate +the view discrepancy issue in 3D visual grounding. However, existing methods +normally neglect the view cues embedded in the text modality and fail to weigh +the relative importance of different views. In this paper, we propose +ViewRefer, a multi-view framework for 3D visual grounding exploring how to +grasp the view knowledge from both text and 3D modalities. For the text branch, +ViewRefer leverages the diverse linguistic knowledge of large-scale language +models, e.g., GPT, to expand a single grounding text to multiple +geometry-consistent descriptions. Meanwhile, in the 3D modality, a transformer +fusion module with inter-view attention is introduced to boost the interaction +of objects across views. On top of that, we further present a set of learnable +multi-view prototypes, which memorize scene-agnostic knowledge for different +views, and enhance the framework from two perspectives: a view-guided attention +module for more robust text features, and a view-guided scoring strategy during +the final prediction. With our designed paradigm, ViewRefer achieves superior +performance on three benchmarks and surpasses the second-best by +2.8%, +1.5%, +and +1.35% on Sr3D, Nr3D, and ScanRefer. Code is released at +https://github.com/Ivan-Tang-3D/ViewRefer3D. + +
+
+ comment: Accepted by ICCV 2023. Code is released at + https://github.com/Ivan-Tang-3D/ViewRefer3D +
+
+
+
+
+ + ♻ ☆ GPT-Driver: Learning to Drive with GPT NeurIPS 2023 + + +
+ We present a simple yet effective approach that can transform the OpenAI +GPT-3.5 model into a reliable motion planner for autonomous vehicles. Motion +planning is a core challenge in autonomous driving, aiming to plan a driving +trajectory that is safe and comfortable. Existing motion planners predominantly +leverage heuristic methods to forecast driving trajectories, yet these +approaches demonstrate insufficient generalization capabilities in the face of +novel and unseen driving scenarios. In this paper, we propose a novel approach +to motion planning that capitalizes on the strong reasoning capabilities and +generalization potential inherent to Large Language Models (LLMs). The +fundamental insight of our approach is the reformulation of motion planning as +a language modeling problem, a perspective not previously explored. +Specifically, we represent the planner inputs and outputs as language tokens, +and leverage the LLM to generate driving trajectories through a language +description of coordinate positions. Furthermore, we propose a novel +prompting-reasoning-finetuning strategy to stimulate the numerical reasoning +potential of the LLM. With this strategy, the LLM can describe highly precise +trajectory coordinates and also its internal decision-making process in natural +language. We evaluate our approach on the large-scale nuScenes dataset, and +extensive experiments substantiate the effectiveness, generalization ability, +and interpretability of our GPT-based motion planner. Code is now available at +https://github.com/PointsCoder/GPT-Driver. + +
+
+ comment: NeurIPS 2023 Foundation Models for Decision Making Workshop +
+
+
+
+
+ + ♻ ☆ Language Agent Tree Search Unifies Reasoning Acting and Planning in + Language Models + + +
+ While large language models (LLMs) have demonstrated impressive performance +on a range of decision-making tasks, they rely on simple acting processes and +fall short of broad deployment as autonomous agents. We introduce LATS +(Language Agent Tree Search), a general framework that synergizes the +capabilities of LLMs in planning, acting, and reasoning. Drawing inspiration +from Monte Carlo tree search in model-based reinforcement learning, LATS +employs LLMs as agents, value functions, and optimizers, repurposing their +latent strengths for enhanced decision-making. What is crucial in this method +is the use of an environment for external feedback, which offers a more +deliberate and adaptive problem-solving mechanism that moves beyond the +limitations of existing techniques. Our experimental evaluation across diverse +domains, such as programming, HotPotQA, and WebShop, illustrates the +applicability of LATS for both reasoning and acting. In particular, LATS +achieves 94.4% for programming on HumanEval with GPT-4 and an average score of +75.9 for web browsing on WebShop with GPT-3.5, demonstrating the effectiveness +and generality of our method. + +
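As a rough sketch of the Monte Carlo tree search loop described above, the snippet below shows only the selection and backpropagation steps, with the LLM assumed to supply action proposals and value estimates elsewhere; the `Node` structure and UCT constant are illustrative assumptions, not the authors' implementation.

```python
import math
from dataclasses import dataclass, field
from typing import List, Optional

@dataclass
class Node:
    state: str                      # e.g. the partial reasoning/acting trace
    parent: Optional["Node"] = None
    children: List["Node"] = field(default_factory=list)
    visits: int = 0
    value_sum: float = 0.0

    def uct(self, c: float = 1.0) -> float:
        # Standard UCT: average value plus an exploration bonus.
        if self.visits == 0:
            return float("inf")
        exploit = self.value_sum / self.visits
        explore = c * math.sqrt(math.log(self.parent.visits + 1) / self.visits)
        return exploit + explore

def select(root: Node) -> Node:
    # Descend the tree, always following the child with the highest UCT score.
    node = root
    while node.children:
        node = max(node.children, key=Node.uct)
    return node

def backpropagate(node: Node, reward: float) -> None:
    # Push the environment- or LLM-estimated reward back up to the root.
    while node is not None:
        node.visits += 1
        node.value_sum += reward
        node = node.parent
```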
+
+ comment: Website and code can be found at + https://andyz245.github.io/LanguageAgentTreeSearch +
+
+
+
+
+ + ♻ ☆ Competition-Level Problems are Effective LLM Evaluators + + +
+ Large language models (LLMs) have demonstrated impressive reasoning +capabilities, yet recently there has been ongoing debate about these abilities +and the potential data contamination problem. This paper aims to evaluate the +reasoning capacities of LLMs, specifically in solving recent competition-level +programming problems in Codeforces, which are expert-crafted and unique, +requiring deep understanding and robust reasoning skills. We first provide a +comprehensive evaluation of GPT-4's perceived zero-shot performance on this +task, considering various aspects such as problems' release time, difficulties, +and types of errors encountered. Surprisingly, the perceived performance of +GPT-4 has experienced a cliff-like decline on problems released after September +2021, consistently across all difficulties and types of problems, which points +to potential data contamination, as well as the challenges for any existing LLM +to solve unseen complex reasoning problems. We further explore various +approaches such as fine-tuning, Chain-of-Thought prompting and problem +description simplification; unfortunately, none of them is able to consistently +mitigate the challenges. Through our work, we emphasize the importance of this +excellent data source for assessing the genuine reasoning capabilities of LLMs, +and foster the development of LLMs with stronger reasoning abilities and better +generalization in the future. + 
+
+ comment: In progress +
+
+
+
+
+ + ♻ ☆ Comparing Styles across Languages EMNLP 2023 + + +
+ Understanding how styles differ across languages is advantageous for training +both humans and computers to generate culturally appropriate text. We introduce +an explanation framework to extract stylistic differences from multilingual LMs +and compare styles across languages. Our framework (1) generates comprehensive +style lexica in any language and (2) consolidates feature importances from LMs +into comparable lexical categories. We apply this framework to compare +politeness, creating the first holistic multilingual politeness dataset and +exploring how politeness varies across four languages. Our approach enables an +effective evaluation of how distinct linguistic categories contribute to +stylistic variations and provides interpretable insights into how people +communicate differently around the world. + +
+
+ comment: Accepted to EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ VideoDubber: Machine Translation with Speech-Aware Length Control for + Video Dubbing AAAI 2023 + + +
+ Video dubbing aims to translate the original speech in a film or television +program into speech in a target language, which can be achieved with a cascaded +system consisting of speech recognition, machine translation and speech +synthesis. To ensure that the translated speech is well aligned with the +corresponding video, the length/duration of the translated speech should be as +close as possible to that of the original speech, which requires strict length +control. Previous works usually control the number of words or characters +generated by the machine translation model to be similar to the source +sentence, without considering the isochronicity of speech, as the speech +duration of words/characters in different languages varies. In this paper, we +propose a machine translation system tailored for the task of video dubbing, +which directly considers the speech duration of each token in translation, to +match the length of source and target speech. Specifically, we control the +speech length of the generated sentence by guiding the prediction of each word +with duration information, including the speech duration of the word itself as +well as how much duration is left for the remaining words. We design +experiments on four language directions (German -> English, Spanish -> English, +Chinese <-> English), and the results show that the proposed method achieves +better length control on the generated speech than baseline methods. To make up +for the lack of real-world datasets, we also construct a real-world test set +collected from films to provide comprehensive evaluations on the video dubbing +task. + 
+
+ comment: AAAI 2023 camera version +
+
+
+
+
+ + ♻ ☆ Inverse Reinforcement Learning for Text Summarization EMNLP 2023 + + +
+ We introduce inverse reinforcement learning (IRL) as an effective paradigm +for training abstractive summarization models, imitating human summarization +behaviors. Our IRL model estimates the reward function using a suite of +important sub-rewards for summarization and concurrently optimizes the policy +network. Experimental results across datasets in different domains +(CNN/DailyMail and WikiHow) and various model sizes (BART-base and BART-large) +demonstrate the superiority of our proposed IRL model for summarization over +MLE and RL baselines. The resulting summaries exhibit greater similarity to +human-crafted gold references, outperforming MLE and RL baselines on metrics +such as ROUGE, coverage, novelty, compression ratio, factuality, and human +evaluations. + +
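One way to picture the IRL setup described above is a reward model that learns a weighted combination of pre-computed summarization sub-rewards (coverage, novelty, compression, etc.) so that gold summaries score higher than model samples. The sketch below is an illustrative toy version; the actual sub-rewards, weighting scheme and objective in the paper may differ.

```python
import torch
import torch.nn as nn

class LinearRewardModel(nn.Module):
    """Toy IRL-style reward: a learnable weighted sum of sub-rewards.

    Each sub-reward (e.g. coverage, novelty, compression) is assumed to be
    pre-computed per candidate summary and passed in as a feature vector.
    """
    def __init__(self, num_sub_rewards: int):
        super().__init__()
        self.weights = nn.Parameter(torch.zeros(num_sub_rewards))

    def forward(self, sub_rewards: torch.Tensor) -> torch.Tensor:
        # sub_rewards: (batch, num_sub_rewards) -> scalar reward per summary
        return sub_rewards @ torch.softmax(self.weights, dim=0)

def irl_margin_loss(reward_model, gold_feats, sampled_feats, margin=1.0):
    # IRL objective sketch: gold summaries should out-score sampled ones.
    gold_r = reward_model(gold_feats)
    sampled_r = reward_model(sampled_feats)
    return torch.clamp(margin - (gold_r - sampled_r), min=0).mean()
```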
+
+ comment: 8 pages, 2 figures; accepted to Findings of EMNLP 2023 +

+
+
+
+
+ + ♻ ☆ Continual Learning for Instruction Following from Realtime Feedback NeurIPS 2023 + + +
+ We propose and deploy an approach to continually train an +instruction-following agent from feedback provided by users during +collaborative interactions. During interaction, human users instruct an agent +using natural language, and provide realtime binary feedback as they observe +the agent following their instructions. We design a contextual bandit learning +approach, converting user feedback to immediate reward. We evaluate through +thousands of human-agent interactions, demonstrating 15.4% absolute improvement +in instruction execution accuracy over time. We also show our approach is +robust to several design variations, and that the feedback signal is roughly +equivalent to the learning signal of supervised demonstration data. + +
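A minimal sketch of the contextual-bandit idea described above, converting a single binary user judgment into an immediate REINFORCE-style update, assuming the agent's action log-probability is available; the exact reward mapping and update rule are assumptions, not the authors' formulation.

```python
import torch

def bandit_update(optimizer: torch.optim.Optimizer,
                  log_prob: torch.Tensor,
                  thumbs_up: bool) -> None:
    """One contextual-bandit step from a single binary feedback signal.

    log_prob: the log-probability the policy assigned to the action the user
    just observed (still attached to the computation graph). Positive feedback
    reinforces that action, negative feedback suppresses it.
    """
    reward = 1.0 if thumbs_up else -1.0
    loss = -reward * log_prob        # REINFORCE-style surrogate loss
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
```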
+
+ comment: NeurIPS 2023 Spotlight paper +
+
+
+
+
+ + ♻ ☆ The neural dynamics of auditory word recognition and integration EMNLP 2023 + + +
+ Listeners recognize and integrate words in rapid and noisy everyday speech by +combining expectations about upcoming content with incremental sensory +evidence. We present a computational model of word recognition which formalizes +this perceptual process in Bayesian decision theory. We fit this model to +explain scalp EEG signals recorded as subjects passively listened to a +fictional story, revealing both the dynamics of the online auditory word +recognition process and the neural correlates of the recognition and +integration of words. + The model reveals distinct neural processing of words depending on whether or +not they can be quickly recognized. While all words trigger a neural response +characteristic of probabilistic integration -- voltage modulations predicted by +a word's surprisal in context -- these modulations are amplified for words +which require more than roughly 150 ms of input to be recognized. We observe no +difference in the latency of these neural responses according to words' +recognition times. Our results are consistent with a two-part model of speech +comprehension, combining an eager and rapid process of word recognition with a +temporally independent process of word integration. However, we also developed +alternative models of the scalp EEG signal not incorporating word recognition +dynamics which showed similar performance improvements. We discuss potential +future modeling steps which may help to separate these hypotheses. + +
+
+ comment: Published at EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ Machine Reading Comprehension using Case-based Reasoning + + +
+ We present an accurate and interpretable method for answer extraction in +machine reading comprehension that is reminiscent of case-based reasoning (CBR) +from classical AI. Our method (CBR-MRC) builds upon the hypothesis that +contextualized answers to similar questions share semantic similarities with +each other. Given a test question, CBR-MRC first retrieves a set of similar +cases from a nonparametric memory and then predicts an answer by selecting the +span in the test context that is most similar to the contextualized +representations of answers in the retrieved cases. The semi-parametric nature +of our approach allows it to attribute a prediction to the specific set of +evidence cases, making it a desirable choice for building reliable and +debuggable QA systems. We show that CBR-MRC provides high accuracy comparable +with large reader models and outperforms baselines by 11.5 and 8.4 EM on +NaturalQuestions and NewsQA, respectively. Further, we demonstrate the ability +of CBR-MRC in identifying not just the correct answer tokens but also the span +with the most relevant supporting evidence. Lastly, we observe that contexts +for certain question types show higher lexical diversity than others and find +that CBR-MRC is robust to these variations while performance using +fully-parametric methods drops. + +
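A hedged sketch of the span-selection step: score every candidate span in the test context by its similarity to the contextualized answer representations retrieved from similar cases, and return the best match. The encoder, span enumeration and choice of cosine similarity here are simplifying assumptions.

```python
import torch
import torch.nn.functional as F

def select_span_by_case_similarity(span_reprs: torch.Tensor,
                                   case_answer_reprs: torch.Tensor) -> int:
    """Pick the candidate span most similar to retrieved case answers.

    span_reprs:        (num_spans, dim) contextual representation per span
    case_answer_reprs: (num_cases, dim) answer representations from cases
    Returns the index of the best-scoring span.
    """
    spans = F.normalize(span_reprs, dim=-1)
    cases = F.normalize(case_answer_reprs, dim=-1)
    # Cosine similarity of every span to every case answer, averaged over cases.
    scores = (spans @ cases.T).mean(dim=1)
    return int(scores.argmax())
```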
+
+ comment: 9 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Unleashing the Creative Mind: Language Model As Hierarchical Policy For + Improved Exploration on Challenging Problem Solving + + +
+ Large Language Models (LLMs) have achieved tremendous progress, yet they +still often struggle with challenging reasoning problems. Current approaches +address this challenge by sampling or searching detailed and low-level +reasoning chains. However, these methods are still limited in their exploration +capabilities, making it challenging for correct solutions to stand out in the +huge solution space. In this work, we unleash LLMs' creative potential for +exploring multiple diverse problem-solving strategies by framing an LLM as a +hierarchical policy via in-context learning. This policy comprises a visionary +leader that proposes multiple diverse high-level problem-solving tactics as +hints, accompanied by a follower that executes detailed problem-solving +processes following each of the high-level instructions. The follower uses each +of the leader's directives as a guide and samples multiple reasoning chains to +tackle the problem, generating a solution group for each leader proposal. +Additionally, we propose an effective and efficient tournament-based approach +to select among these explored solution groups to reach the final answer. Our +approach produces meaningful and inspiring hints, enhances problem-solving +strategy exploration, and improves the final answer accuracy on challenging +problems in the MATH dataset. Code will be released at +https://github.com/lz1oceani/LLM-As-Hierarchical-Policy. + 
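A rough sketch of the leader/follower decomposition and tournament selection described above; the `llm` and `judge` callables and the prompt wording are placeholders for illustration, not the released implementation.

```python
from typing import Callable, List

def hierarchical_solve(problem: str,
                       llm: Callable[[str], str],
                       judge: Callable[[str, str], str],
                       num_hints: int = 3,
                       samples_per_hint: int = 4) -> str:
    """Leader proposes high-level tactics; follower expands each into solutions;
    a pairwise tournament picks the final answer. All prompts are illustrative."""
    # Leader: propose diverse high-level strategies as hints.
    hints = [llm(f"Propose a distinct high-level strategy #{i + 1} for: {problem}")
             for i in range(num_hints)]

    # Follower: sample several detailed solutions per hint (one group per hint).
    candidates: List[str] = []
    for hint in hints:
        for _ in range(samples_per_hint):
            candidates.append(llm(f"Solve the problem following this hint.\n"
                                  f"Problem: {problem}\nHint: {hint}"))

    # Tournament: repeatedly compare pairs and keep the judged winner.
    while len(candidates) > 1:
        a, b = candidates.pop(), candidates.pop()
        candidates.insert(0, judge(a, b))
    return candidates[0]
```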
+
+
+
+
+ + ♻ ☆ Beyond Detection: Unveiling Fairness Vulnerabilities in Abusive Language + Models + + +
+ This work investigates the potential of undermining both fairness and +detection performance in abusive language detection. In a dynamic and complex +digital world, it is crucial to investigate the vulnerabilities of these +detection models to adversarial fairness attacks to improve their fairness +robustness. We propose a simple yet effective framework FABLE that leverages +backdoor attacks as they allow targeted control over the fairness and detection +performance. FABLE explores three types of trigger designs (i.e., rare, +artificial, and natural triggers) and novel sampling strategies. Specifically, +the adversary can inject triggers into samples in the minority group with the +favored outcome (i.e., "non-abusive") and flip their labels to the unfavored +outcome, i.e., "abusive". Experiments on benchmark datasets demonstrate the +effectiveness of FABLE attacking fairness and utility in abusive language +detection. + +
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ Exploring Distributional Shifts in Large Language Models for Code + Analysis + + +
+ We systematically study how three large language models with code +capabilities - CodeT5, Codex, and ChatGPT - generalize to out-of-domain data. +We consider two fundamental applications - code summarization, and code +generation. We split data into domains following its natural boundaries - by an +organization, by a project, and by a module within the software project. We +establish that samples from each new domain present all the models with a +significant challenge of distribution shift. We study how established methods +adapt models to better generalize to new domains. Our experiments show that +while multitask learning alone is a reasonable baseline, combining it with +few-shot finetuning on examples retrieved from training data can achieve very +strong performance. Moreover, this solution can outperform direct finetuning +for very low-data scenarios. Finally, we consider variations of this approach +to create a more broadly applicable method to adapt to multiple domains at +once. We find that for code generation, a model adapted to multiple domains +simultaneously performs on par with those adapted to a single domain + +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 164 + +
+
+
+ + ☆ ReconFusion: 3D Reconstruction with Diffusion Priors + + +
+ 3D reconstruction methods such as Neural Radiance Fields (NeRFs) excel at +rendering photorealistic novel views of complex scenes. However, recovering a +high-quality NeRF typically requires tens to hundreds of input images, +resulting in a time-consuming capture process. We present ReconFusion to +reconstruct real-world scenes using only a few photos. Our approach leverages a +diffusion prior for novel view synthesis, trained on synthetic and multiview +datasets, which regularizes a NeRF-based 3D reconstruction pipeline at novel +camera poses beyond those captured by the set of input images. Our method +synthesizes realistic geometry and texture in underconstrained regions while +preserving the appearance of observed regions. We perform an extensive +evaluation across various real-world datasets, including forward-facing and +360-degree scenes, demonstrating significant performance improvements over +previous few-view NeRF reconstruction approaches. + +
+
+ comment: Project page: https://reconfusion.github.io/ +
+
+
+
+
+ + ☆ GPT4Point: A Unified Framework for Point-Language Understanding and + Generation + + +
+ Multimodal Large Language Models (MLLMs) have excelled in 2D image-text +comprehension and image generation, but their understanding of the 3D world is +notably deficient, limiting progress in 3D language understanding and +generation. To solve this problem, we introduce GPT4Point, a groundbreaking +point-language multimodal model designed specifically for unified 3D object +understanding and generation within the MLLM framework. As a powerful 3D MLLM, +GPT4Point can seamlessly execute a variety of point-text reference tasks such +as point-cloud captioning and Q&A. Additionally, GPT4Point is equipped with +advanced capabilities for controllable 3D generation: it can produce +high-quality results from a low-quality point-text feature while maintaining +the geometric shapes and colors. To support the expansive needs of 3D +object-text pairs, we develop Pyramid-XL, a point-language dataset annotation +engine. It constructs a large-scale database of over 1M objects at varied text +granularity levels from the Objaverse-XL dataset, essential for training +GPT4Point. A comprehensive benchmark has been proposed to evaluate 3D +point-language understanding capabilities. In extensive evaluations, GPT4Point +has demonstrated superior performance in understanding and generation. + 
+
+
+
+
+ + ☆ Imitating Shortest Paths in Simulation Enables Effective Navigation and + Manipulation in the Real World + + +
+ Reinforcement learning (RL) with dense rewards and imitation learning (IL) +with human-generated trajectories are the most widely used approaches for +training modern embodied agents. RL requires extensive reward shaping and +auxiliary losses and is often too slow and ineffective for long-horizon tasks. +While IL with human supervision is effective, collecting human trajectories at +scale is extremely expensive. In this work, we show that imitating +shortest-path planners in simulation produces agents that, given a language +instruction, can proficiently navigate, explore, and manipulate objects in both +simulation and in the real world using only RGB sensors (no depth map or GPS +coordinates). This surprising result is enabled by our end-to-end, +transformer-based, SPOC architecture, powerful visual encoders paired with +extensive image augmentation, and the dramatic scale and diversity of our +training data: millions of frames of shortest-path-expert trajectories +collected inside approximately 200,000 procedurally generated houses containing +40,000 unique 3D assets. Our models, data, training code, and newly proposed +10-task benchmarking suite CHORES will be open-sourced. + +
+
+ comment: First six authors contributed equally. Project page: + https://spoc-robot.github.io/ +
+
+
+
+
+ + ☆ Dexterous Functional Grasping + + +
+ While there have been significant strides in dexterous manipulation, most of +this work is limited to benchmark tasks like in-hand reorientation, which are +of limited utility in the real world. The main benefit of dexterous hands over +two-fingered ones is their ability to pick up tools and other objects +(including thin ones) and grasp them firmly to apply force. However, this task +requires both a complex understanding of functional affordances as well as +precise low-level control. While prior work obtains affordances from human +data, this approach does not scale to low-level control. Similarly, simulation +training cannot give the robot an understanding of real-world semantics. In +this paper, we aim to combine the best of both worlds to accomplish functional +grasping for in-the-wild objects. We use a modular approach. First, affordances +are obtained by matching corresponding regions of different objects, and then a +low-level policy trained in simulation is run to grasp the object. We propose a +novel application of eigengrasps to reduce the search space of RL using a small +amount of human data and find that it leads to more stable and physically +realistic motion. We find that the eigengrasp action space beats baselines in +simulation, outperforms hardcoded grasping in the real world, and matches or +outperforms a trained human teleoperator. Result visualizations and videos are +available at https://dexfunc.github.io/ + 
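Eigengrasps are conventionally obtained by running PCA over a small set of human hand poses and letting the policy act in the low-dimensional coefficient space. The sketch below shows that generic recipe under assumed joint-angle inputs; it is not the paper's code and the actual pipeline may differ.

```python
import numpy as np

def compute_eigengrasps(hand_poses: np.ndarray, k: int = 5):
    """PCA over human grasp poses to get a low-dimensional action basis.

    hand_poses: (num_demos, num_joints) joint-angle vectors from human data.
    Returns (mean_pose, basis) where basis has shape (k, num_joints).
    """
    mean_pose = hand_poses.mean(axis=0)
    centered = hand_poses - mean_pose
    # SVD of the centered data; rows of vt are principal directions.
    _, _, vt = np.linalg.svd(centered, full_matrices=False)
    return mean_pose, vt[:k]

def eigengrasp_to_joints(alpha: np.ndarray, mean_pose: np.ndarray,
                         basis: np.ndarray) -> np.ndarray:
    # The RL policy outputs k coefficients; map them back to full joint angles.
    return mean_pose + alpha @ basis

if __name__ == "__main__":
    demos = np.random.rand(50, 16)             # e.g. 16-DoF hand, 50 demos
    mean_pose, basis = compute_eigengrasps(demos, k=5)
    joints = eigengrasp_to_joints(np.zeros(5), mean_pose, basis)
    print(joints.shape)                        # (16,)
```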
+
+ comment: In CoRL 2023. Website at https://dexfunc.github.io/ +
+
+
+
+
+ + ☆ Describing Differences in Image Sets with Natural Language + + +
+ How do two sets of images differ? Discerning set-level differences is crucial +for understanding model behaviors and analyzing datasets, yet manually sifting +through thousands of images is impractical. To aid in this discovery process, +we explore the task of automatically describing the differences between two +$\textbf{sets}$ of images, which we term Set Difference Captioning. This task +takes in image sets $D_A$ and $D_B$, and outputs a description that is more +often true on $D_A$ than $D_B$. We outline a two-stage approach that first +proposes candidate difference descriptions from image sets and then re-ranks +the candidates by checking how well they can differentiate the two sets. We +introduce VisDiff, which first captions the images and prompts a language model +to propose candidate descriptions, then re-ranks these descriptions using CLIP. +To evaluate VisDiff, we collect VisDiffBench, a dataset with 187 paired image +sets with ground truth difference descriptions. We apply VisDiff to various +domains, such as comparing datasets (e.g., ImageNet vs. ImageNetV2), comparing +classification models (e.g., zero-shot CLIP vs. supervised ResNet), summarizing +model failure modes (supervised ResNet), characterizing differences between +generative models (e.g., StableDiffusionV1 and V2), and discovering what makes +images memorable. Using VisDiff, we are able to find interesting and previously +unknown differences in datasets and models, demonstrating its utility in +revealing nuanced insights. + +
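A minimal sketch of the re-ranking stage: score each candidate difference description by how much more strongly it matches images from set A than from set B under CLIP embeddings. It assumes the image and text embeddings have already been computed; the exact scoring rule in VisDiff may differ.

```python
import torch
import torch.nn.functional as F

def rerank_descriptions(text_embs: torch.Tensor,
                        set_a_embs: torch.Tensor,
                        set_b_embs: torch.Tensor):
    """Re-rank candidate difference descriptions (second-stage sketch).

    text_embs:  (num_candidates, dim) CLIP text embeddings of the candidates
    set_a_embs: (num_a, dim) CLIP image embeddings of set A
    set_b_embs: (num_b, dim) CLIP image embeddings of set B
    A good description should match images in A more often than images in B,
    so each candidate is scored by its mean similarity gap between the sets.
    """
    t = F.normalize(text_embs, dim=-1)
    a = F.normalize(set_a_embs, dim=-1)
    b = F.normalize(set_b_embs, dim=-1)
    gap = (t @ a.T).mean(dim=1) - (t @ b.T).mean(dim=1)   # (num_candidates,)
    order = torch.argsort(gap, descending=True)
    return order, gap
```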
+
+
+
+
+ + ☆ GauHuman: Articulated Gaussian Splatting from Monocular Human Videos + + +
+ We present, GauHuman, a 3D human model with Gaussian Splatting for both fast +training (1 ~ 2 minutes) and real-time rendering (up to 189 FPS), compared with +existing NeRF-based implicit representation modelling frameworks demanding +hours of training and seconds of rendering per frame. Specifically, GauHuman +encodes Gaussian Splatting in the canonical space and transforms 3D Gaussians +from canonical space to posed space with linear blend skinning (LBS), in which +effective pose and LBS refinement modules are designed to learn fine details of +3D humans under negligible computational cost. Moreover, to enable fast +optimization of GauHuman, we initialize and prune 3D Gaussians with 3D human +prior, while splitting/cloning via KL divergence guidance, along with a novel +merge operation for further speeding up. Extensive experiments on ZJU_Mocap and +MonoCap datasets demonstrate that GauHuman achieves state-of-the-art +performance quantitatively and qualitatively with fast training and real-time +rendering speed. Notably, without sacrificing rendering quality, GauHuman can +fast model the 3D human performer with ~13k 3D Gaussians. + +
+
+ comment: project page: https://skhu101.github.io/GauHuman/; code: + https://github.com/skhu101/GauHuman +
+
+
+
+
+ + ☆ Alchemist: Parametric Control of Material Properties with Diffusion + Models + + +
+ We propose a method to control material attributes of objects like roughness, +metallic, albedo, and transparency in real images. Our method capitalizes on +the generative prior of text-to-image models known for photorealism, employing +a scalar value and instructions to alter low-level material properties. +Addressing the lack of datasets with controlled material attributes, we +generated an object-centric synthetic dataset with physically-based materials. +Fine-tuning a modified pre-trained text-to-image model on this synthetic +dataset enables us to edit material properties in real-world images while +preserving all other attributes. We show the potential application of our model +to material edited NeRFs. + +
+
+
+
+
+ + ☆ AmbiGen: Generating Ambigrams from Pre-trained Diffusion Model + + +
+ Ambigrams are calligraphic designs that have different meanings depending on +the viewing orientation. Creating ambigrams is a challenging task even for +skilled artists, as it requires maintaining the meaning under two different +viewpoints at the same time. In this work, we propose to generate ambigrams by +distilling a large-scale vision and language diffusion model, namely DeepFloyd +IF, to optimize the letters' outline for legibility in the two viewing +orientations. Empirically, we demonstrate that our approach outperforms +existing ambigram generation methods. On the 500 most common words in English, +our method achieves more than an 11.6% increase in word accuracy and at least a +41.9% reduction in edit distance. + +
+
+ comment: Project page: https://raymond-yeh.com/AmbiGen/ +
+
+
+
+
+ + ☆ Diffusion-SS3D: Diffusion Model for Semi-supervised 3D Object Detection NeurIPS 2023 + + +
+ Semi-supervised object detection is crucial for 3D scene understanding, +efficiently addressing the limitation of acquiring large-scale 3D bounding box +annotations. Existing methods typically employ a teacher-student framework with +pseudo-labeling to leverage unlabeled point clouds. However, producing reliable +pseudo-labels in a diverse 3D space still remains challenging. In this work, we +propose Diffusion-SS3D, a new perspective of enhancing the quality of +pseudo-labels via the diffusion model for semi-supervised 3D object detection. +Specifically, we include noises to produce corrupted 3D object size and class +label distributions, and then utilize the diffusion model as a denoising +process to obtain bounding box outputs. Moreover, we integrate the diffusion +model into the teacher-student framework, so that the denoised bounding boxes +can be used to improve pseudo-label generation, as well as the entire +semi-supervised learning process. We conduct experiments on the ScanNet and SUN +RGB-D benchmark datasets to demonstrate that our approach achieves +state-of-the-art performance against existing methods. We also present +extensive analysis to understand how our diffusion model design affects +performance in semi-supervised learning. + +
+
+ comment: Accepted in NeurIPS 2023. Code is available at + https://github.com/luluho1208/Diffusion-SS3D +
+
+
+
+
+ + ☆ MVHumanNet: A Large-scale Dataset of Multi-view Daily Dressing Human + Captures + + +
+ In this era, the success of large language models and text-to-image models +can be attributed to the driving force of large-scale datasets. However, in the +realm of 3D vision, while remarkable progress has been made with models trained +on large-scale synthetic and real-captured object data like Objaverse and +MVImgNet, a similar level of progress has not been observed in the domain of +human-centric tasks partially due to the lack of a large-scale human dataset. +Existing datasets of high-fidelity 3D human capture continue to be mid-sized +due to the significant challenges in acquiring large-scale high-quality 3D +human data. To bridge this gap, we present MVHumanNet, a dataset that comprises +multi-view human action sequences of 4,500 human identities. The primary focus +of our work is on collecting human data that features a large number of diverse +identities and everyday clothing using a multi-view human capture system, which +facilitates easily scalable data collection. Our dataset contains 9,000 daily +outfits, 60,000 motion sequences and 645 million frames with extensive +annotations, including human masks, camera parameters, 2D and 3D keypoints, +SMPL/SMPLX parameters, and corresponding textual descriptions. To explore the +potential of MVHumanNet in various 2D and 3D visual tasks, we conducted pilot +studies on view-consistent action recognition, human NeRF reconstruction, +text-driven view-unconstrained human image generation, as well as 2D +view-unconstrained human image and 3D avatar generation. Extensive experiments +demonstrate the performance improvements and effective applications enabled by +the scale provided by MVHumanNet. As the current largest-scale 3D human +dataset, we hope that the release of MVHumanNet data with annotations will +foster further innovations in the domain of 3D human-centric tasks at scale. + +
+
+ comment: Project page: https://x-zhangyang.github.io/MVHumanNet/ +
+
+
+
+
+ + ☆ Classification for everyone : Building geography agnostic models for + fairer recognition + + +
+ In this paper, we analyze different methods to mitigate inherent geographical +biases present in state of the art image classification models. We first +quantitatively present this bias in two datasets - The Dollar Street Dataset +and ImageNet, using images with location information. We then present different +methods which can be employed to reduce this bias. Finally, we analyze the +effectiveness of the different techniques on making these models more robust to +geographical locations of the images. + +
+
+
+
+
+ + ☆ Choroidalyzer: An open-source, end-to-end pipeline for choroidal + analysis in optical coherence tomography + + +
+ Purpose: To develop Choroidalyzer, an open-source, end-to-end pipeline for +segmenting the choroid region, vessels, and fovea, and deriving choroidal +thickness, area, and vascular index. + Methods: We used 5,600 OCT B-scans (233 subjects, 6 systemic disease cohorts, +3 device types, 2 manufacturers). To generate region and vessel ground-truths, +we used state-of-the-art automatic methods following manual correction of +inaccurate segmentations, with foveal positions manually annotated. We trained +a U-Net deep-learning model to detect the region, vessels, and fovea to +calculate choroid thickness, area, and vascular index in a fovea-centred region +of interest. We analysed segmentation agreement (AUC, Dice) and choroid metrics +agreement (Pearson, Spearman, mean absolute error (MAE)) in internal and +external test sets. We compared Choroidalyzer to two manual graders on a small +subset of external test images and examined cases of high error. + Results: Choroidalyzer took 0.299 seconds per image on a standard laptop and +achieved excellent region (Dice: internal 0.9789, external 0.9749), very good +vessel segmentation performance (Dice: internal 0.8817, external 0.8703) and +excellent fovea location prediction (MAE: internal 3.9 pixels, external 3.4 +pixels). For thickness, area, and vascular index, Pearson correlations were +0.9754, 0.9815, and 0.8285 (internal) / 0.9831, 0.9779, 0.7948 (external), +respectively (all p<0.0001). Choroidalyzer's agreement with graders was +comparable to the inter-grader agreement across all metrics. + Conclusions: Choroidalyzer is an open-source, end-to-end pipeline that +accurately segments the choroid and reliably extracts thickness, area, and +vascular index. Especially choroidal vessel segmentation is a difficult and +subjective task, and fully-automatic methods like Choroidalyzer could provide +objectivity and standardisation. + +
+
+
+
+
+ + ☆ LLaVA-Grounding: Grounded Visual Chat with Large Multimodal Models + + +
+ With the recent significant advancements in large multi-modal models (LMMs), +the importance of their grounding capability in visual chat is increasingly +recognized. Despite recent efforts to enable LMMs to support grounding, their +capabilities for grounding and chat are usually separate, and their chat +performance drops dramatically when asked to ground. The problem is the lack of +a dataset for grounded visual chat (GVC). Existing grounding datasets only +contain short captions. To address this issue, we have created GVC data that +allows for the combination of grounding and chat capabilities. To better +evaluate the GVC capabilities, we have introduced a benchmark called +Grounding-Bench. Additionally, we have proposed a model design that can support +GVC and various types of visual prompts by connecting segmentation models with +language models. Experimental results demonstrate that our model outperforms +other LMMs on Grounding-Bench. Furthermore, our model achieves competitive +performance on classic grounding benchmarks like RefCOCO/+/g and Flickr30K +Entities. Our code will be released at +https://github.com/UX-Decoder/LLaVA-Grounding . + +
+
+
+
+
+ + ☆ Fast CT anatomic localization algorithm + + +
+ Automatically determining the position of every slice in a CT scan is a basic +yet powerful capability allowing fast retrieval of regions of interest for +visual inspection and automated analysis. Unlike conventional localization +approaches which work at the slice level, we directly localize only a fraction +of the slices and then fit a linear model which maps slice index to its +estimated axial anatomical position based on those slices. The model is then +used to assign an axial position to every slice of the scan. This approach +proves to be computationally efficient, with a typical processing time of less +than a second per scan (regardless of its size), accurate, with a typical +median localization error of 1 cm, and robust to different noise sources, +imaging protocols, metal-induced artifacts, anatomical deformations, etc. +Another key element of our approach is the introduction of a mapping confidence +score. This score acts as a fail-safe mechanism which allows rejection of +unreliable localization results in rare cases of anomalous scans. Our algorithm +sets new state-of-the-art results in terms of localization accuracy. It also +offers a decrease of two orders of magnitude in processing time with respect to +all published processing times. It was designed to be invariant to various scan +resolutions, scan protocols, patient orientations, strong artifacts and various +deformations and abnormalities. Additionally, to the best of our knowledge, our +algorithm is the first to support the entire body from head to feet, not being +confined to a specific anatomical region. The algorithm was tested on thousands +of scans and proves to be very reliable and useful as a preprocessing stage for +many applications. + 
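A hedged sketch of the core recipe: localize a small sample of slices with any per-slice localizer, fit a linear index-to-axial-position model, extrapolate to the whole scan, and reject the result when the fit residual is large. The localizer and the confidence mapping are placeholders, not the paper's method.

```python
import numpy as np

def localize_scan(num_slices: int, sample_indices, localize_slice):
    """Fit index -> axial position on a few slices, then extrapolate to all.

    localize_slice(i) is a placeholder for any per-slice anatomical localizer
    returning an estimated axial position in some standard body coordinate.
    """
    idx = np.asarray(sample_indices, dtype=float)
    pos = np.asarray([localize_slice(int(i)) for i in idx], dtype=float)

    # Linear model: axial position ~= slope * slice_index + intercept.
    slope, intercept = np.polyfit(idx, pos, deg=1)
    predicted = slope * idx + intercept

    # Simple confidence score: reject scans whose residuals are too large.
    residual = np.abs(predicted - pos).mean()
    confidence = float(np.exp(-residual))      # in (0, 1]; illustrative mapping

    all_positions = slope * np.arange(num_slices) + intercept
    return all_positions, confidence
```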
+
+
+
+
+ + ☆ Drag-A-Video: Non-rigid Video Editing with Point-based Interaction + + +
+ Video editing is a challenging task that requires manipulating videos on both +the spatial and temporal dimensions. Existing methods for video editing mainly +focus on changing the appearance or style of the objects in the video, while +keeping their structures unchanged. However, there is no existing method that +allows users to interactively ``drag'' any points of instances on the first +frame to precisely reach the target points with other frames consistently +deformed. In this paper, we propose a new diffusion-based method for +interactive point-based video manipulation, called Drag-A-Video. Our method +allows users to click pairs of handle points and target points as well as masks +on the first frame of an input video. Then, our method transforms the inputs +into point sets and propagates these sets across frames. To precisely modify +the contents of the video, we employ a new video-level motion supervision to +update the features of the video and introduce the latent offsets to achieve +this update at multiple denoising timesteps. We propose a temporal-consistent +point tracking module to coordinate the movement of the points in the handle +point sets. We demonstrate the effectiveness and flexibility of our method on +various videos. The website of our work is available here: +https://drag-a-video.github.io/. + +
+
+
+
+
+ + ☆ WoVoGen: World Volume-aware Diffusion for Controllable Multi-camera + Driving Scene Generation + + +
+ Generating multi-camera street-view videos is critical for augmenting +autonomous driving datasets, addressing the urgent demand for extensive and +varied data. Due to the limitations in diversity and challenges in handling +lighting conditions, traditional rendering-based methods are increasingly being +supplanted by diffusion-based methods. However, a significant challenge in +diffusion-based methods is ensuring that the generated sensor data preserve +both intra-world consistency and inter-sensor coherence. To address these +challenges, we combine an additional explicit world volume and propose the +World Volume-aware Multi-camera Driving Scene Generator (WoVoGen). This system +is specifically designed to leverage 4D world volume as a foundational element +for video generation. Our model operates in two distinct phases: (i) +envisioning the future 4D temporal world volume based on vehicle control +sequences, and (ii) generating multi-camera videos, informed by this envisioned +4D temporal world volume and sensor interconnectivity. The incorporation of the +4D world volume empowers WoVoGen not only to generate high-quality street-view +videos in response to vehicle control inputs but also to facilitate scene +editing tasks. + +
+
+
+
+
+ + ☆ LivePhoto: Real Image Animation with Text-guided Motion Control + + +
+ Despite the recent progress in text-to-video generation, existing studies +usually overlook the issue that only spatial contents but not temporal motions +in synthesized videos are under the control of text. Towards such a challenge, +this work presents a practical system, named LivePhoto, which allows users to +animate an image of their interest with text descriptions. We first establish a +strong baseline that helps a well-learned text-to-image generator (i.e., Stable +Diffusion) take an image as a further input. We then equip the improved +generator with a motion module for temporal modeling and propose a carefully +designed training pipeline to better link texts and motions. In particular, +considering the facts that (1) text can only describe motions roughly (e.g., +regardless of the moving speed) and (2) text may include both content and +motion descriptions, we introduce a motion intensity estimation module as well +as a text re-weighting module to reduce the ambiguity of text-to-motion +mapping. Empirical evidence suggests that our approach is capable of well +decoding motion-related textual instructions into videos, such as actions, +camera movements, or even conjuring new contents from thin air (e.g., pouring +water into an empty glass). Interestingly, thanks to the proposed intensity +learning mechanism, our system offers users an additional control signal (i.e., +the motion intensity) besides text for video customization. + +
+
+ comment: Project page: https://xavierchen34.github.io/LivePhoto-Page/ +
+
+
+
+
+ + ☆ Split & Merge: Unlocking the Potential of Visual Adapters via Sparse + Training + + +
+ With the rapid growth in the scale of pre-trained foundation models, +parameter-efficient fine-tuning techniques have gained significant attention, +among which Adapter Tuning is the most widely used. Despite achieving +efficiency, Adapter Tuning still underperforms full fine-tuning, and the +performance improves at the cost of an increase in parameters. Recent efforts +address this issue by pruning the original adapters, but it also introduces +training instability and suboptimal performance on certain datasets. Motivated +by this, we propose Mixture of Sparse Adapters, or MoSA, as a novel Adapter +Tuning method to fully unleash the potential of each parameter in the adapter. +We first split the standard adapter into multiple non-overlapping modules, then +stochastically activate modules for sparse training, and finally merge them to +form a complete adapter after tuning. In this way, MoSA can achieve +significantly better performance than standard adapters without any additional +computational or storage overhead. Furthermore, we propose a hierarchical +sparse strategy to better leverage limited training data. Extensive experiments +on a series of 27 visual tasks demonstrate that MoSA consistently outperforms +other Adapter Tuning methods as well as other baselines by a significant +margin. Furthermore, in two challenging scenarios with low-resource and +multi-task settings, MoSA achieves satisfactory results, further demonstrating +the effectiveness of our design. Our code will be released. + +
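A toy sketch of the split / stochastic-activation / merge idea applied to a bottleneck adapter: the hidden units are partitioned into non-overlapping groups, one randomly chosen group is active per training step, and all groups are used (merged) at inference. The grouping rule and module shapes are illustrative assumptions, not the MoSA implementation.

```python
import torch
import torch.nn as nn

class SparseAdapter(nn.Module):
    """Toy sparse adapter: the bottleneck dimension is split into
    non-overlapping groups; during training only one random group is active,
    and at evaluation all groups contribute (the "merged" adapter)."""

    def __init__(self, dim: int, bottleneck: int = 64, num_groups: int = 4):
        super().__init__()
        assert bottleneck % num_groups == 0
        self.down = nn.Linear(dim, bottleneck)
        self.up = nn.Linear(bottleneck, dim)
        self.num_groups = num_groups
        self.group_size = bottleneck // num_groups

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        h = torch.relu(self.down(x))
        if self.training:
            # Stochastically activate a single group of bottleneck units.
            g = torch.randint(self.num_groups, (1,)).item()
            mask = torch.zeros_like(h)
            mask[..., g * self.group_size:(g + 1) * self.group_size] = 1.0
            h = h * mask
        # Residual adapter output; at eval time all groups are merged.
        return x + self.up(h)
```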
+
+ comment: 16 pages, 7 figures. Official code: + https://github.com/Theia-4869/MoSA +
+
+
+
+
+ + ☆ Fine-grained Controllable Video Generation via Object Appearance and + Context + + +
+ Text-to-video generation has shown promising results. However, by taking only +natural languages as input, users often face difficulties in providing detailed +information to precisely control the model's output. In this work, we propose +fine-grained controllable video generation (FACTOR) to achieve detailed +control. Specifically, FACTOR aims to control objects' appearances and context, +including their location and category, in conjunction with the text prompt. To +achieve detailed control, we propose a unified framework to jointly inject +control signals into the existing text-to-video model. Our model consists of a +joint encoder and adaptive cross-attention layers. By optimizing the encoder +and the inserted layer, we adapt the model to generate videos that are aligned +with both text prompts and fine-grained control. Compared to existing methods +relying on dense control signals such as edge maps, we provide a more intuitive +and user-friendly interface to allow object-level fine-grained control. Our +method achieves controllability of object appearances without finetuning, which +reduces the per-subject optimization efforts for the users. Extensive +experiments on standard benchmark datasets and user-provided inputs validate +that our model obtains a 70% improvement in controllability metrics over +competitive baselines. + +
+
+ comment: Project page: https://hhsinping.github.io/factor +
+
+
+
+
+ + ☆ Multimodal Prompt Perceiver: Empower Adaptiveness, Generalizability and + Fidelity for All-in-One Image Restoration + + +
+ Despite substantial progress, all-in-one image restoration (IR) grapples with +persistent challenges in handling intricate real-world degradations. This paper +introduces MPerceiver: a novel multimodal prompt learning approach that +harnesses Stable Diffusion (SD) priors to enhance adaptiveness, +generalizability and fidelity for all-in-one image restoration. Specifically, +we develop a dual-branch module to master two types of SD prompts: textual for +holistic representation and visual for multiscale detail representation. Both +prompts are dynamically adjusted by degradation predictions from the CLIP image +encoder, enabling adaptive responses to diverse unknown degradations. Moreover, +a plug-in detail refinement module improves restoration fidelity via direct +encoder-to-decoder information transformation. To assess our method, MPerceiver +is trained on 9 tasks for all-in-one IR and outperforms state-of-the-art +task-specific methods across most tasks. Post multitask pre-training, +MPerceiver attains a generalized representation in low-level vision, exhibiting +remarkable zero-shot and few-shot capabilities in unseen tasks. Extensive +experiments on 16 IR tasks and 26 benchmarks underscore the superiority of +MPerceiver in terms of adaptiveness, generalizability and fidelity. + +
+
+ comment: 13 pages, 8 figures, 9 tables +
+
+
+
+
+ + ☆ MIND: Multi-Task Incremental Network Distillation + + +
+ The recent surge in pervasive devices generating dynamic data streams has +underscored the necessity for learning systems to adapt to data distributional +shifts continually. To tackle this challenge, the research community has put +forth a spectrum of methodologies, including the demanding pursuit of +class-incremental learning without replay data. In this study, we present MIND, +a parameter isolation method that aims to significantly enhance the performance +of replay-free solutions and achieve state-of-the-art results on several widely +studied datasets. Our approach introduces two main contributions: two +alternative distillation procedures that significantly improve the efficiency +of MIND, increasing the accumulated knowledge of each sub-network, and the +optimization of the BatchNorm layers across tasks inside the sub-networks. +Overall, MIND outperforms all the state-of-the-art methods for rehearsal-free +Class-Incremental learning (with an increment in classification accuracy of +approx. +6% on CIFAR-100/10 and +10% on TinyImageNet/10), reaching up to approx. ++40% accuracy in Domain-Incremental scenarios. Moreover, we ablated each +contribution to demonstrate its impact on performance improvement. Our results +showcase the superior performance of MIND, indicating its potential for +addressing the challenges posed by Class-Incremental and Domain-Incremental +learning in resource-constrained environments. + 
+
+
+
+
+ + ☆ Unsupervised Video Domain Adaptation with Masked Pre-Training and + Collaborative Self-Training + + +
+ In this work, we tackle the problem of unsupervised domain adaptation (UDA) +for video action recognition. Our approach, which we call UNITE, uses an image +teacher model to adapt a video student model to the target domain. UNITE first +employs self-supervised pre-training to promote discriminative feature learning +on target domain videos using a teacher-guided masked distillation objective. +We then perform self-training on masked target data, using the video student +model and image teacher model together to generate improved pseudolabels for +unlabeled target videos. Our self-training process successfully leverages the +strengths of both models to achieve strong transfer performance across domains. +We evaluate our approach on multiple video domain adaptation benchmarks and +observe significant improvements upon previously reported results. + +
+
+
+
+
+ + ☆ Realistic Scatterer Based Adversarial Attacks on SAR Image Classifiers + + +
+ Adversarial attacks have highlighted the vulnerability of classifiers based +on machine learning for Synthetic Aperture Radar (SAR) Automatic Target +Recognition (ATR) tasks. An adversarial attack perturbs SAR images of on-ground +targets such that the classifiers are misled into making incorrect predictions. +However, many existing attacking techniques rely on arbitrary manipulation of +SAR images while overlooking the feasibility of executing the attacks on +real-world SAR imagery. Instead, adversarial attacks should be able to be +implemented by physical actions, for example, placing additional false objects +as scatterers around the on-ground target to perturb the SAR image and fool the +SAR ATR. + In this paper, we propose the On-Target Scatterer Attack (OTSA), a +scatterer-based physical adversarial attack. To ensure the feasibility of its +physical execution, we enforce a constraint on the positioning of the +scatterers. Specifically, we restrict the scatterers to be placed only on the +target instead of in the shadow regions or the background. To achieve this, we +introduce a positioning score based on Gaussian kernels and formulate an +optimization problem for our OTSA attack. Using a gradient ascent method to +solve the optimization problem, the OTSA can generate a vector of parameters +describing the positions, shapes, sizes and amplitudes of the scatterers to +guide the physical execution of the attack that will mislead SAR image +classifiers. The experimental results show that our attack obtains +significantly higher success rates under the positioning constraint compared +with the existing method. + +
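A minimal sketch, under assumed pixel-coordinate inputs, of a Gaussian-kernel positioning score that is high for scatterers placed on the target region and decays for placements in shadow or background; how this score enters the attack objective is not shown, and the exact kernel used in the paper may differ.

```python
import numpy as np

def positioning_score(scatterer_xy: np.ndarray,
                      target_xy: np.ndarray,
                      sigma: float = 5.0) -> np.ndarray:
    """Soft 'on-target' score for each proposed scatterer position.

    scatterer_xy: (num_scatterers, 2) proposed pixel positions
    target_xy:    (num_target_pixels, 2) pixels belonging to the target region
    Each scatterer is scored via a Gaussian kernel on the distance to its
    closest target pixel, so on-target positions score ~1 and far ones ~0.
    """
    # Pairwise squared distances between scatterers and target pixels.
    d2 = ((scatterer_xy[:, None, :] - target_xy[None, :, :]) ** 2).sum(-1)
    return np.exp(-d2.min(axis=1) / (2.0 * sigma ** 2))
```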
+
+
+
+
+ + ☆ Rare Galaxy Classes Identified In Foundation Model Representations NeurIPS 2023 + + +
+ We identify rare and visually distinctive galaxy populations by searching for +structure within the learned representations of pretrained models. We show that +these representations arrange galaxies by appearance in patterns beyond those +needed to predict the pretraining labels. We design a clustering approach to +isolate specific local patterns, revealing groups of galaxies with rare and +scientifically-interesting morphologies. + +
+
+ comment: Accepted at Machine Learning and the Physical Sciences Workshop, + NeurIPS 2023 +
+
+
+
+
+ + ☆ Deep Learning Segmentation of Spiral Arms and Bars NeurIPS 2023 + + +
+ We present the first deep learning model for segmenting galactic spiral arms +and bars. In a blinded assessment by expert astronomers, our predicted spiral +arm masks are preferred over both current automated methods (99% of +evaluations) and our original volunteer labels (79% of evaluations). Experts +rated our spiral arm masks as `mostly good' to `perfect' in 89% of evaluations. +Bar lengths trivially derived from our predicted bar masks are in excellent +agreement with a dedicated crowdsourcing project. The pixelwise precision of +our masks, previously impossible at scale, will underpin new research into how +spiral arms and bars evolve. + +
+
+ comment: Accepted at Machine Learning and the Physical Sciences Workshop, + NeurIPS 2023 +
+
+
+
+
+ + ☆ HeadGaS: Real-Time Animatable Head Avatars via 3D Gaussian Splatting + + +
+ 3D head animation has seen major quality and runtime improvements over the +last few years, particularly empowered by the advances in differentiable +rendering and neural radiance fields. Real-time rendering is a highly desirable +goal for real-world applications. We propose HeadGaS, the first model to use 3D +Gaussian Splats (3DGS) for 3D head reconstruction and animation. In this paper +we introduce a hybrid model that extends the explicit representation from 3DGS +with a base of learnable latent features, which can be linearly blended with +low-dimensional parameters from parametric head models to obtain +expression-dependent final color and opacity values. We demonstrate that +HeadGaS delivers state-of-the-art results in real-time inference frame rates, +which surpasses baselines by up to ~2dB, while accelerating rendering speed by +over x10. + +
+
+
+
+
+ + ☆ BenchLMM: Benchmarking Cross-style Visual Capability of Large Multimodal + Models + + +
+ Large Multimodal Models (LMMs) such as GPT-4V and LLaVA have shown remarkable +capabilities in visual reasoning with common image styles. However, their +robustness against diverse style shifts, crucial for practical applications, +remains largely unexplored. In this paper, we propose a new benchmark, +BenchLMM, to assess the robustness of LMMs against three different styles: +artistic image style, imaging sensor style, and application style, where each +style has five sub-styles. Utilizing BenchLMM, we comprehensively evaluate +state-of-the-art LMMs and reveal: 1) LMMs generally suffer performance +degradation when working with other styles; 2) an LMM performing better than +another model in the common style does not guarantee its superior performance +in other styles; 3) LMMs' reasoning capability can be enhanced by prompting +LMMs to predict the style first, based on which we propose a versatile and +training-free method for improving LMMs; 4) an intelligent LMM is expected to +interpret the causes of its errors when facing stylistic variations. We hope +that our benchmark and analysis can shed new light on developing more +intelligent and versatile LMMs. + 
+
+ comment: Code is available at https://github.com/AIFEG/BenchGPT +
+
+
+
+
+ + ☆ Towards More Practical Group Activity Detection: A New Benchmark and + Model + + +
+ Group activity detection (GAD) is the task of identifying members of each +group and classifying the activity of the group at the same time in a video. +While GAD has been studied recently, there is still much room for improvement +in both dataset and methodology due to their limited capability to address +practical GAD scenarios. To resolve these issues, we first present a new +dataset, dubbed Caf\'e. Unlike existing datasets, Caf\'e is constructed +primarily for GAD and presents more practical evaluation scenarios and metrics, +as well as being large-scale and providing rich annotations. Along with the +dataset, we propose a new GAD model that deals with an unknown number of groups +and latent group members efficiently and effectively. We evaluated our model on +three datasets including Caf\'e, where it outperformed previous work in terms +of both accuracy and inference speed. Both our dataset and code base will be +open to the public to promote future research on GAD. + +
+
+ comment: Project page: https://cvlab.postech.ac.kr/research/CAFE +
+
+
+
+
+ + ☆ A Dynamic Network for Efficient Point Cloud Registration + + +
+ For the point cloud registration task, a significant challenge arises from +non-overlapping points that consume extensive computational resources while +negatively affecting registration accuracy. In this paper, we introduce a +dynamic approach, widely utilized to improve network efficiency in computer +vision tasks, to the point cloud registration task. We employ an iterative +registration process on point cloud data multiple times to identify regions +where matching points cluster, ultimately enabling us to remove noisy points. +Specifically, we begin with deep global sampling to perform coarse global +registration. Subsequently, we employ the proposed refined node proposal module +to further narrow down the registration region and perform local registration. +Furthermore, we utilize a spatial consistency-based classifier to evaluate the +results of each registration stage. The model terminates once it reaches +sufficient confidence, avoiding unnecessary computations. Extended experiments +demonstrate that our model significantly reduces time consumption compared to +other methods with similar results, achieving a speed improvement of over 41% +on indoor dataset (3DMatch) and 33% on outdoor datasets (KITTI) while +maintaining competitive registration recall requirements. + +
+
+
+
+
+ + ☆ Are Vision Transformers More Data Hungry Than Newborn Visual Systems? NeurIPS 2023 + + +
+ Vision transformers (ViTs) are top performing models on many computer vision +benchmarks and can accurately predict human behavior on object recognition +tasks. However, researchers question the value of using ViTs as models of +biological learning because ViTs are thought to be more data hungry than +brains, with ViTs requiring more training data to reach similar levels of +performance. To test this assumption, we directly compared the learning +abilities of ViTs and animals, by performing parallel controlled rearing +experiments on ViTs and newborn chicks. We first raised chicks in impoverished +visual environments containing a single object, then simulated the training +data available in those environments by building virtual animal chambers in a +video game engine. We recorded the first-person images acquired by agents +moving through the virtual chambers and used those images to train self +supervised ViTs that leverage time as a teaching signal, akin to biological +visual systems. When ViTs were trained through the eyes of newborn chicks, the +ViTs solved the same view invariant object recognition tasks as the chicks. +Thus, ViTs were not more data hungry than newborn visual systems: both learned +view invariant object representations in impoverished visual environments. The +flexible and generic attention based learning mechanism in ViTs combined with +the embodied data streams available to newborn animals appears sufficient to +drive the development of animal-like object recognition. + +
+
+ comment: Accepted in Thirty-seventh Conference on Neural Information + Processing Systems (NeurIPS 2023) +
+
+
+
+
+ + ☆ RotaTR: Detection Transformer for Dense and Rotated Object + + +
+ Detecting objects in dense and rotated scenes is a challenging task. +Recent works on this topic are mostly based on Faster RCNN or Retinanet. As +they are highly dependent on the pre-set dense anchors and the NMS operation, +the approach is indirect and suboptimal. The end-to-end DETR-based detectors +have achieved great success in horizontal object detection and many other areas +such as segmentation, tracking, and action recognition. However, the DETR-based +detectors perform poorly on dense rotated target tasks and perform worse than +most modern CNN-based detectors. In this paper, we find the most significant +reason for the poor performance is that the original attention cannot +accurately focus on the oriented targets. Accordingly, we propose Rotated +object detection TRansformer (RotaTR) as an extension of DETR to oriented +detection. Specifically, we design Rotation Sensitive deformable (RSDeform) +attention to enhance the DETR's ability to detect oriented targets. It is used +to build the feature alignment module and rotation-sensitive decoder for our +model. We test RotaTR on four challenging oriented-object benchmarks. It shows a great +advantage in detecting dense and oriented objects compared to the original +DETR. It also achieves competitive results when compared to the +state-of-the-art. + 
+
+
+
+
+ + ☆ Deterministic Guidance Diffusion Model for Probabilistic Weather + Forecasting + + +
+ Weather forecasting requires not only accuracy but also the ability to +perform probabilistic prediction. However, deterministic weather forecasting +methods do not support probabilistic predictions, and conversely, probabilistic +models tend to be less accurate. To address these challenges, in this paper, we +introduce the Deterministic Guidance +Diffusion Model (DGDM) for probabilistic +weather forecasting, integrating benefits of both deterministic and +probabilistic approaches. During the forward process, both the deterministic +and probabilistic models are trained end-to-end. In the reverse process, +weather forecasting leverages the predicted result from the deterministic +model, using it as an intermediate starting point for the probabilistic model. By +fusing deterministic models with probabilistic models in this manner, DGDM is +capable of providing accurate forecasts while also offering probabilistic +predictions. To evaluate DGDM, we assess it on the global weather forecasting +dataset (WeatherBench) and the common video frame prediction benchmark (Moving +MNIST). We also introduce and evaluate the Pacific Northwest Windstorm +(PNW)-Typhoon weather satellite dataset to verify the effectiveness of DGDM in +high-resolution regional forecasting. As a result of our experiments, DGDM +achieves state-of-the-art results not only in global forecasting but also in +regional forecasting. The code is available at: +https://github.com/DongGeun-Yoon/DGDM. + 
+
+ comment: 16 pages +
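+
+ A minimal PyTorch sketch of the sampling idea described above: the deterministic forecast is noised to an intermediate step and the reverse diffusion starts from there. The noise schedule, `det_model`, and `eps_model` below are assumed placeholders, not the released DGDM code.
+
+   import torch
+
+   T = 1000
+   betas = torch.linspace(1e-4, 0.02, T)
+   alphas_cumprod = torch.cumprod(1.0 - betas, dim=0)
+
+   @torch.no_grad()
+   def sample_from_deterministic_guess(det_model, eps_model, past_frames, t_mid=400):
+       x_det = det_model(past_frames)                        # deterministic forecast
+       a_mid = alphas_cumprod[t_mid]
+       x = a_mid.sqrt() * x_det + (1 - a_mid).sqrt() * torch.randn_like(x_det)
+       for t in reversed(range(t_mid)):                      # denoise from t_mid, not from T
+           a_t = alphas_cumprod[t]
+           a_prev = alphas_cumprod[t - 1] if t > 0 else torch.tensor(1.0)
+           eps = eps_model(x, torch.full((x.shape[0],), t), past_frames)
+           x0_hat = (x - (1 - a_t).sqrt() * eps) / a_t.sqrt()
+           x = a_prev.sqrt() * x0_hat + (1 - a_prev).sqrt() * eps   # deterministic DDIM-style step
+       return x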
+
+
+
+
+ + ☆ BIVDiff: A Training-Free Framework for General-Purpose Video Synthesis + via Bridging Image and Video Diffusion Models + + +
+ Diffusion models have made tremendous progress in text-driven image and video +generation. Now text-to-image foundation models are widely applied to various +downstream image synthesis tasks, such as controllable image generation and +image editing, while downstream video synthesis tasks are less explored for +several reasons. First, it requires huge memory and compute overhead to train a +video generation foundation model. Even with video foundation models, +additional costly training is still required for downstream video synthesis +tasks. Second, although some works extend image diffusion models into videos in +a training-free manner, temporal consistency is not well preserved. Finally, +these adaptation methods are specifically designed for one task and fail to +generalize to different downstream video synthesis tasks. To mitigate these +issues, we propose a training-free general-purpose video synthesis framework, +coined as BIVDiff, via bridging specific image diffusion models and general +text-to-video foundation diffusion models. Specifically, we first use an image +diffusion model (like ControlNet, Instruct Pix2Pix) for frame-wise video +generation, then perform Mixed Inversion on the generated video, and finally +input the inverted latents into the video diffusion model for temporal +smoothing. Decoupling image and video models enables flexible image model +selection for different purposes, which endows the framework with strong task +generalization and high efficiency. To validate the effectiveness and general +use of BIVDiff, we perform a wide range of video generation tasks, including +controllable video generation, video editing, video inpainting, and outpainting. +Our project page is available at https://bivdiff.github.io. + 
+
+
+
+
+ + ☆ PMMTalk: Speech-Driven 3D Facial Animation from Complementary Pseudo + Multi-modal Features + + +
+ Speech-driven 3D facial animation has improved a lot recently while most +related works only utilize acoustic modality and neglect the influence of +visual and textual cues, leading to unsatisfactory results in terms of +precision and coherence. We argue that visual and textual cues are not trivial +information. Therefore, we present a novel framework, namely PMMTalk, using +complementary Pseudo Multi-Modal features for improving the accuracy of facial +animation. The framework entails three modules: PMMTalk encoder, cross-modal +alignment module, and PMMTalk decoder. Specifically, the PMMTalk encoder +employs the off-the-shelf talking head generation architecture and speech +recognition technology to extract visual and textual information from speech, +respectively. Subsequently, the cross-modal alignment module aligns the +audio-image-text features at temporal and semantic levels. Then PMMTalk decoder +is employed to predict lip-syncing facial blendshape coefficients. Contrary to +prior methods, PMMTalk only requires an additional random reference face image +but yields more accurate results. Additionally, it is artist-friendly as it +seamlessly integrates into standard animation production workflows by +introducing facial blendshape coefficients. Finally, given the scarcity of 3D +talking face datasets, we introduce a large-scale 3D Chinese Audio-Visual +Facial Animation (3D-CAVFA) dataset. Extensive experiments and user studies +show that our approach outperforms the state of the art. We recommend watching +the supplementary video. + +
+
+
+
+
+ + ☆ Generating Fine-Grained Human Motions Using ChatGPT-Refined Descriptions + + +
+ Recently, significant progress has been made in text-based motion generation, +enabling the generation of diverse and high-quality human motions that conform +to textual descriptions. However, it remains challenging to generate +fine-grained or stylized motions due to the lack of datasets annotated with +detailed textual descriptions. By adopting a divide-and-conquer strategy, we +propose a new framework named Fine-Grained Human Motion Diffusion Model +(FG-MDM) for human motion generation. Specifically, we first parse previous +vague textual annotations into fine-grained descriptions of different body parts +by leveraging a large language model (GPT-3.5). We then use these fine-grained +descriptions to guide a transformer-based diffusion model. FG-MDM can generate +fine-grained and stylized motions even outside of the distribution of the +training data. Our experimental results demonstrate the superiority of FG-MDM +over previous methods, especially its strong generalization capability. We will +release our fine-grained textual annotations for HumanML3D and KIT. + 
+
+ comment: Project Page: https://sx0207.github.io/fg-mdm/ +
+
+
+
+
+ + ☆ Learning Cortical Anomaly through Masked Encoding for Unsupervised + Heterogeneity Mapping + + +
+ The detection of heterogeneous mental disorders based on brain readouts +remains challenging due to the complexity of symptoms and the absence of +reliable biomarkers. This paper introduces CAM (Cortical Anomaly Detection +through Masked Image Modeling), a novel self-supervised framework designed for +the unsupervised detection of complex brain disorders using cortical surface +features. We employ this framework for the detection of individuals on the +psychotic spectrum and demonstrate its capabilities compared to state-of-the-art +methods, achieving an AUC of 0.696 for Schizoaffective and 0.769 for +Schizophreniform, without the need for any labels. Furthermore, the analysis of +atypical cortical regions, including Pars Triangularis and several frontal areas +often implicated in schizophrenia, provides further confidence in our approach. +Altogether, we demonstrate a scalable approach for anomaly detection of complex +brain disorders based on cortical abnormalities. + 
+
+
+
+
+ + ☆ C3: High-performance and low-complexity neural compression from a single + image or video + + +
+ Most neural compression models are trained on large datasets of images or +videos in order to generalize to unseen data. Such generalization typically +requires large and expressive architectures with a high decoding complexity. +Here we introduce C3, a neural compression method with strong rate-distortion +(RD) performance that instead overfits a small model to each image or video +separately. The resulting decoding complexity of C3 can be an order of +magnitude lower than neural baselines with similar RD performance. C3 builds on +COOL-CHIC (Ladune et al.) and makes several simple and effective improvements +for images. We further develop new methodology to apply C3 to videos. On the +CLIC2020 image benchmark, we match the RD performance of VTM, the reference +implementation of the H.266 codec, with less than 3k MACs/pixel for decoding. +On the UVG video benchmark, we match the RD performance of the Video +Compression Transformer (Mentzer et al.), a well-established neural video +codec, with less than 5k MACs/pixel for decoding. + +
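+
+ A rough sketch of the per-image overfitting loop implied above, with a rate-distortion objective; `decoder`, `latents`, and the rate proxy are illustrative stand-ins rather than the C3 model.
+
+   import torch
+   import torch.nn.functional as F
+
+   def overfit_single_image(image, decoder, latents, lam=1e-2, steps=5000, lr=1e-2):
+       # latents: a leaf tensor with requires_grad=True, jointly optimized with the tiny decoder
+       opt = torch.optim.Adam(list(decoder.parameters()) + [latents], lr=lr)
+       for _ in range(steps):
+           recon = decoder(latents)
+           distortion = F.mse_loss(recon, image)
+           rate = latents.abs().mean()          # crude stand-in for the entropy-coded bitrate
+           loss = distortion + lam * rate       # rate-distortion trade-off
+           opt.zero_grad(); loss.backward(); opt.step()
+       return decoder, latents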
+
+
+
+
+ + ☆ C-NERF: Representing Scene Changes as Directional Consistency + Difference-based NeRF + + +
+ In this work, we aim to detect the changes caused by object variations in a +scene represented by the neural radiance fields (NeRFs). Given an arbitrary +view and two sets of scene images captured at different timestamps, we can +predict the scene changes in that view, which has significant potential +applications in scene monitoring and measuring. We conducted preliminary +studies and found that such an exciting task cannot be easily achieved by +utilizing existing NeRFs and 2D change detection methods with many false or +missing detections. The main reason is that the 2D change detection is based on +the pixel appearance difference between spatial-aligned image pairs and +neglects the stereo information in the NeRF. To address the limitations, we +propose the C-NERF to represent scene changes as directional consistency +difference-based NeRF, which mainly contains three modules. We first perform +the spatial alignment of two NeRFs captured before and after changes. Then, we +identify the change points based on the direction-consistent constraint; that +is, real change points have similar change representations across view +directions, but fake change points do not. Finally, we design the change map +rendering process based on the built NeRFs and can generate the change map of +an arbitrarily specified view direction. To validate the effectiveness, we +build a new dataset containing ten scenes covering diverse scenarios with +different changing objects. Our approach surpasses state-of-the-art 2D change +detection and NeRF-based methods by a significant margin. + +
+
+
+
+
+ + ☆ R3D-SWIN:Use Shifted Window Attention for Single-View 3D Reconstruction + + +
+ Recently, vision transformers have performed well in various computer vision +tasks, including voxel 3D reconstruction. However, the windows of the vision +transformer are not multi-scale, and there is no connection between the +windows, which limits the accuracy of voxel 3D reconstruction. Therefore, we +propose a shifted windows attention voxel 3D reconstruction network. To the +best of our knowledge, this is the first work to apply shifted window attention +to voxel 3D reconstruction. Experimental results on ShapeNet verify that our method +achieves SOTA accuracy in single-view reconstruction. + 
+
+ comment: 4 pages,3 figures,5 tables +
+
+
+
+
+ + ☆ Unified learning-based lossy and lossless JPEG recompression + + +
+ JPEG is still the most widely used image compression algorithm. Most image +compression algorithms only consider the uncompressed original image, while +ignoring the large number of already existing JPEG images. Recently, JPEG +recompression approaches have been proposed to further reduce the size of JPEG +files. However, those methods only consider JPEG lossless recompression, which +is just a special case of the rate-distortion theorem. In this paper, we +propose a unified lossy and lossless JPEG recompression framework, which +consists of a learned quantization table and Markovian hierarchical variational +autoencoders. Experiments show that our method can achieve arbitrarily low +distortion when the bitrate is close to the upper bound, namely the bitrate of +the lossless compression model. To the best of our knowledge, this is the first +learned method that bridges the gap between lossy and lossless recompression of +JPEG images. + 
+
+
+
+
+ + ☆ MyPortrait: Morphable Prior-Guided Personalized Portrait Generation + + +
+ Generating realistic talking faces is an interesting and long-standing topic +in the field of computer vision. Although significant progress has been made, +it is still challenging to generate high-quality dynamic faces with +personalized details. This is mainly due to the inability of the general model +to represent personalized details and the generalization problem to unseen +controllable parameters. In this work, we propose MyPortrait, a simple, +general, and flexible framework for neural portrait generation. We incorporate +a personalized prior in a monocular video and a morphable prior in 3D face +morphable space for generating personalized details under novel controllable +parameters. Our proposed framework supports both video-driven and audio-driven +face animation given a monocular video of a single person. Depending on +whether the test data is used during training or not, our method provides a +real-time online version and a high-quality offline version. Comprehensive +experiments in various metrics demonstrate the superior performance of our +method over the state-of-the-art methods. The code will be publicly available. + 
+
+
+
+
+ + ☆ Neural Sign Actors: A diffusion model for 3D sign language production + from text + + +
+ Sign Languages (SL) serve as the predominant mode of communication for the +Deaf and Hard of Hearing communities. The advent of deep learning has aided +numerous methods in SL recognition and translation, achieving remarkable +results. However, Sign Language Production (SLP) poses a challenge for the +computer vision community as the motions generated must be realistic and have +precise semantic meanings. Most SLP methods rely on 2D data, thus impeding +their ability to attain a necessary level of realism. In this work, we propose +a diffusion-based SLP model trained on a curated large-scale dataset of 4D +signing avatars and their corresponding text transcripts. The proposed method +can generate dynamic sequences of 3D avatars from an unconstrained domain of +discourse using a diffusion process formed on a novel and anatomically informed +graph neural network defined on the SMPL-X body skeleton. Through a series of +quantitative and qualitative experiments, we show that the proposed method +considerably outperforms previous methods of SLP. We believe that this work +presents an important and necessary step towards realistic neural sign avatars, +bridging the communication gap between Deaf and hearing communities. The code, +method and generated data will be made publicly available. + +
+
+
+
+
+ + ☆ Revisit Human-Scene Interaction via Space Occupancy + + +
+ Human-scene Interaction (HSI) generation is a challenging task and crucial +for various downstream tasks. However, one of the major obstacles is the +limited data scale. High-quality data with simultaneously captured human and 3D +environments is rare, resulting in limited data diversity and complexity. In +this work, we argue that interaction with a scene is essentially interacting +with the space occupancy of the scene from an abstract physical perspective, +leading us to a unified novel view of Human-Occupancy Interaction. By treating +pure motion sequences as records of humans interacting with invisible scene +occupancy, we can aggregate motion-only data into a large-scale paired +human-occupancy interaction database: Motion Occupancy Base (MOB). Thus, the +need for costly paired motion-scene datasets with high-quality scene scans can +be substantially alleviated. With this new unified view of Human-Occupancy +interaction, a single motion controller is proposed to reach the target state +given the surrounding occupancy. Once trained on MOB with complex occupancy +layout, the controller could handle cramped scenes and generalize well to +general scenes with limited complexity. With no GT 3D scenes for training, our +method can generate realistic and stable HSI motions in diverse scenarios, +including both static and dynamic scenes. Our code and data would be made +publicly available at https://foruck.github.io/occu-page/. + +
+
+ comment: The first two authors contributed equally. Yong-Lu Li is the + corresponding author. Project page: https://foruck.github.io/occu-page/ +
+
+
+
+
+ + ☆ Enhancing Vehicle Entrance and Parking Management: Deep Learning + Solutions for Efficiency and Security + + +
+ The auto-management of vehicle entrance and parking in any organization is a +complex challenge encompassing record-keeping, efficiency, and security +concerns. Manual methods for tracking vehicles and finding parking spaces are +slow and a waste of time. To solve the problem of auto management of vehicle +entrance and parking, we have utilized state-of-the-art deep learning models +and automated the process of vehicle entrance and parking into any +organization. To ensure security, our system integrated vehicle detection, +license number plate verification, and face detection and recognition models to +ensure that the person and vehicle are registered with the organization. We +have trained multiple deep-learning models for vehicle detection, license +number plate detection, face detection, and recognition, however, the YOLOv8n +model outperformed all the other models. Furthermore, License plate recognition +is facilitated by Google's Tesseract-OCR Engine. By integrating these +technologies, the system offers efficient vehicle detection, precise +identification, streamlined record keeping, and optimized parking slot +allocation in buildings, thereby enhancing convenience, accuracy, and security. +Future research opportunities lie in fine-tuning system performance for a wide +range of real-world applications. + +
+
+ comment: Accepted for publication in the 25th International Multitopic + Conference (INMIC) IEEE 2023, 6 Pages, 3 figures +
+
+
+
+
+ + ☆ Analyzing and Improving the Training Dynamics of Diffusion Models + + +
+ Diffusion models currently dominate the field of data-driven image synthesis +with their unparalleled scaling to large datasets. In this paper, we identify +and rectify several causes for uneven and ineffective training in the popular +ADM diffusion model architecture, without altering its high-level structure. +Observing uncontrolled magnitude changes and imbalances in both the network +activations and weights over the course of training, we redesign the network +layers to preserve activation, weight, and update magnitudes on expectation. We +find that systematic application of this philosophy eliminates the observed +drifts and imbalances, resulting in considerably better networks at equal +computational complexity. Our modifications improve the previous record FID of +2.41 in ImageNet-512 synthesis to 1.81, achieved using fast deterministic +sampling. + As an independent contribution, we present a method for setting the +exponential moving average (EMA) parameters post-hoc, i.e., after completing +the training run. This allows precise tuning of EMA length without the cost of +performing several training runs, and reveals its surprising interactions with +network architecture, training time, and guidance. + +
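+
+ For context, a standard exponential moving average of network weights looks like the sketch below. The paper's actual contribution is reconstructing EMA profiles of different lengths after training from periodically stored snapshots; this illustrative snippet only shows the conventional online EMA that such profiles generalize.
+
+   import copy
+   import torch
+
+   class WeightEMA:
+       def __init__(self, model, decay=0.9999):
+           self.decay = decay
+           self.shadow = copy.deepcopy(model).eval()   # frozen EMA copy used at inference
+           for p in self.shadow.parameters():
+               p.requires_grad_(False)
+
+       @torch.no_grad()
+       def update(self, model):
+           # shadow <- decay * shadow + (1 - decay) * online weights
+           for s, p in zip(self.shadow.parameters(), model.parameters()):
+               s.mul_(self.decay).add_(p, alpha=1.0 - self.decay)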
+
+
+
+
+ + ☆ UPOCR: Towards Unified Pixel-Level OCR Interface + + +
+ In recent years, the optical character recognition (OCR) field has been +proliferating with plentiful cutting-edge approaches for a wide spectrum of +tasks. However, these approaches are task-specifically designed with divergent +paradigms, architectures, and training strategies, which significantly +increases the complexity of research and maintenance and hinders the fast +deployment in applications. To this end, we propose UPOCR, a +simple-yet-effective generalist model for Unified Pixel-level OCR interface. +Specifically, the UPOCR unifies the paradigm of diverse OCR tasks as +image-to-image transformation and the architecture as a vision Transformer +(ViT)-based encoder-decoder. Learnable task prompts are introduced to push the +general feature representations extracted by the encoder toward task-specific +spaces, endowing the decoder with task awareness. Moreover, the model training +is uniformly aimed at minimizing the discrepancy between the generated and +ground-truth images regardless of the inhomogeneity among tasks. Experiments +are conducted on three pixel-level OCR tasks including text removal, text +segmentation, and tampered text detection. Without bells and whistles, the +experimental results showcase that the proposed method can simultaneously +achieve state-of-the-art performance on three tasks with a unified single +model, which provides valuable strategies and insights for future research on +generalist OCR models. Code will be publicly available. + +
+
+
+
+
+ + ☆ DeepPointMap: Advancing LiDAR SLAM with Unified Neural Descriptors + + +
+ Point clouds have shown significant potential in various domains, including +Simultaneous Localization and Mapping (SLAM). However, existing approaches +either rely on dense point clouds to achieve high localization accuracy or use +generalized descriptors to reduce map size. Unfortunately, these two aspects +seem to conflict with each other. To address this limitation, we propose a +unified architecture, DeepPointMap, achieving excellent performance in both +aspects. We utilize a neural network to extract highly representative and sparse +neural descriptors from point clouds, enabling memory-efficient map +representation and accurate multi-scale localization tasks (e.g., odometry and +loop-closure). Moreover, we showcase the versatility of our framework by +extending it to more challenging multi-agent collaborative SLAM. The promising +results obtained in these scenarios further emphasize the effectiveness and +potential of our approach. + 
+
+
+
+
+ + ☆ Are Synthetic Data Useful for Egocentric Hand-Object Interaction + Detection? An Investigation and the HOI-Synth Domain Adaptation Benchmark + + +
+ In this study, we investigate the effectiveness of synthetic data in +enhancing hand-object interaction detection within the egocentric vision +domain. We introduce a simulator able to generate synthetic images of +hand-object interactions automatically labeled with hand-object contact states, +bounding boxes, and pixel-wise segmentation masks. Through comprehensive +experiments and comparative analyses on three egocentric datasets, VISOR, +EgoHOS, and ENIGMA-51, we demonstrate that the use of synthetic data and domain +adaptation techniques allows for comparable performance to conventional +supervised methods while requiring annotations on only a fraction of the real +data. When tested with in-domain synthetic data generated from 3D models of +real target environments and objects, our best models show consistent +performance improvements with respect to standard fully supervised approaches +based on labeled real data only. Our study also sets a new benchmark of domain +adaptation for egocentric hand-object interaction detection (HOI-Synth) and +provides baseline results to encourage the community to engage in this +challenging task. We release the generated data, code, and the simulator at the +following link: https://iplab.dmi.unict.it/HOI-Synth/. + +
+
+
+
+
+ + ☆ FaceStudio: Put Your Face Everywhere in Seconds + + +
+ This study investigates identity-preserving image synthesis, an intriguing +task in image generation that seeks to maintain a subject's identity while +adding a personalized, stylistic touch. Traditional methods, such as Textual +Inversion and DreamBooth, have made strides in custom image creation, but they +come with significant drawbacks. These include the need for extensive resources +and time for fine-tuning, as well as the requirement for multiple reference +images. To overcome these challenges, our research introduces a novel approach +to identity-preserving synthesis, with a particular focus on human images. Our +model leverages a direct feed-forward mechanism, circumventing the need for +intensive fine-tuning, thereby facilitating quick and efficient image +generation. Central to our innovation is a hybrid guidance framework, which +combines stylized images, facial images, and textual prompts to guide the image +generation process. This unique combination enables our model to produce a +variety of applications, such as artistic portraits and identity-blended +images. Our experimental results, including both qualitative and quantitative +evaluations, demonstrate the superiority of our method over existing baseline +models and previous works, particularly in its remarkable efficiency and +ability to preserve the subject's identity with high fidelity. + +
+
+ comment: Project homepage: https://icoz69.github.io/facestudio/ +
+
+
+
+
+ + ☆ TPA3D: Triplane Attention for Fast Text-to-3D Generation + + +
+ Due to the lack of large-scale text-3D correspondence data, recent text-to-3D +generation works mainly rely on utilizing 2D diffusion models for synthesizing +3D data. Since diffusion-based methods typically require significant +optimization time for both training and inference, the use of GAN-based models +would still be desirable for fast 3D generation. In this work, we propose +Triplane Attention for text-guided 3D generation (TPA3D), an end-to-end +trainable GAN-based deep learning model for fast text-to-3D generation. With +only 3D shape data and their rendered 2D images observed during training, our +TPA3D is designed to retrieve detailed visual descriptions for synthesizing the +corresponding 3D mesh data. This is achieved by the proposed attention +mechanisms on the extracted sentence and word-level text features. In our +experiments, we show that TPA3D generates high-quality 3D textured shapes +aligned with fine-grained descriptions, while impressive computation efficiency +can be observed. + +
+
+
+
+
+ + ☆ Synchronization is All You Need: Exocentric-to-Egocentric Transfer for + Temporal Action Segmentation with Unlabeled Synchronized Video Pairs + + +
+ We consider the problem of transferring a temporal action segmentation system +initially designed for exocentric (fixed) cameras to an egocentric scenario, +where wearable cameras capture video data. The conventional supervised approach +requires the collection and labeling of a new set of egocentric videos to adapt +the model, which is costly and time-consuming. Instead, we propose a novel +methodology which performs the adaptation leveraging existing labeled +exocentric videos and a new set of unlabeled, synchronized +exocentric-egocentric video pairs, for which temporal action segmentation +annotations do not need to be collected. We implement the proposed methodology +with an approach based on knowledge distillation, which we investigate both at +the feature and model level. To evaluate our approach, we introduce a new +benchmark based on the Assembly101 dataset. Results demonstrate the feasibility +and effectiveness of the proposed method against classic unsupervised domain +adaptation and temporal sequence alignment approaches. Remarkably, without +bells and whistles, our best model performs on par with supervised approaches +trained on labeled egocentric data, without ever seeing a single egocentric +label, achieving a +15.99% (28.59% vs 12.60%) improvement in the edit score on +the Assembly101 dataset compared to a baseline model trained solely on +exocentric data. + +
+
+
+
+
+ + ☆ Diffusion Noise Feature: Accurate and Fast Generated Image Detection + + +
+ Generative models have reached an advanced stage where they can produce +remarkably realistic images. However, this remarkable generative capability +also introduces the risk of disseminating false or misleading information. +Notably, existing image detectors for generated images encounter challenges +such as low accuracy and limited generalization. This paper addresses this +issue by seeking a representation with strong generalization capabilities +to enhance the detection of generated images. Our investigation has revealed +that real and generated images display distinct latent Gaussian representations +when subjected to an inverse diffusion process within a pre-trained diffusion +model. Exploiting this disparity, we can amplify subtle artifacts in generated +images. Building upon this insight, we introduce a novel image representation +known as Diffusion Noise Feature (DNF). DNF is an ensemble representation that +estimates the noise generated during the inverse diffusion process. A simple +classifier, e.g., ResNet, trained on DNF achieves high accuracy, robustness, +and generalization capabilities for detecting generated images, even from +previously unseen classes or models. We conducted experiments using a widely +recognized and standard dataset, achieving state-of-the-art detection performance. + 
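+
+ A hedged sketch of a noise-aggregating representation of the kind described above, using a DDIM-style inversion with a generic pretrained noise predictor `eps_model` (an assumed placeholder, not the authors' code); the stacked noise maps would then be fed to a small classifier such as a ResNet.
+
+   import torch
+
+   @torch.no_grad()
+   def diffusion_noise_feature(x0, eps_model, alphas_cumprod, num_steps=10):
+       ts = torch.linspace(0, len(alphas_cumprod) - 1, num_steps).long()
+       x, feats = x0, []
+       for i in range(num_steps - 1):
+           t, t_next = ts[i], ts[i + 1]
+           eps = eps_model(x, torch.full((x.shape[0],), int(t)))
+           feats.append(eps)                                       # collect predicted noise
+           a_t, a_next = alphas_cumprod[t], alphas_cumprod[t_next]
+           x0_hat = (x - (1 - a_t).sqrt() * eps) / a_t.sqrt()
+           x = a_next.sqrt() * x0_hat + (1 - a_next).sqrt() * eps  # deterministic inversion step
+       return torch.cat(feats, dim=1)   # channel-stacked noise estimates as the feature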
+
+
+
+
+ + ☆ DreaMo: Articulated 3D Reconstruction From A Single Casual Video + + +
+ Articulated 3D reconstruction has valuable applications in various domains, +yet it remains costly and demands intensive work from domain experts. Recent +advancements in template-free learning methods show promising results with +monocular videos. Nevertheless, these approaches necessitate a comprehensive +coverage of all viewpoints of the subject in the input video, thus limiting +their applicability to casually captured videos from online sources. In this +work, we study articulated 3D shape reconstruction from a single and casually +captured internet video, where the subject's view coverage is incomplete. We +propose DreaMo that jointly performs shape reconstruction while solving the +challenging low-coverage regions with view-conditioned diffusion prior and +several tailored regularizations. In addition, we introduce a skeleton +generation strategy to create human-interpretable skeletons from the learned +neural bones and skinning weights. We conduct our study on a self-collected +internet video collection characterized by incomplete view coverage. DreaMo +shows promising quality in novel-view rendering, detailed articulated shape +reconstruction, and skeleton generation. Extensive qualitative and quantitative +studies validate the efficacy of each proposed component, and show existing +methods are unable to solve correct geometry due to the incomplete view +coverage. + +
+
+ comment: Project page: https://ttaoretw.github.io/dreamo/ +
+
+
+
+
+ + ☆ Facilitating the Production of Well-tailored Video Summaries for Sharing + on Social Media + + +
+ This paper presents a web-based tool that facilitates the production of +tailored summaries for online sharing on social media. Through an interactive +user interface, it supports a ``one-click'' video summarization process. Based +on the integrated AI models for video summarization and aspect ratio +transformation, it facilitates the generation of multiple summaries of a +full-length video according to the needs of target platforms with regard to the +video's length and aspect ratio. + +
+
+ comment: Accepted for publication, 30th Int. Conf. on MultiMedia Modeling (MMM + 2024), Amsterdam, NL, Jan.-Feb. 2024. This is the "submitted manuscript" + version +
+
+
+
+
+ + ☆ Projection Regret: Reducing Background Bias for Novelty Detection via + Diffusion Models NeurIPS 2023 + + +
+ Novelty detection is a fundamental task of machine learning which aims to +detect abnormal ($\textit{i.e.}$ out-of-distribution (OOD)) samples. Since +diffusion models have recently emerged as the de facto standard generative +framework with surprising generation results, novelty detection via diffusion +models has also gained much attention. Recent methods have mainly utilized the +reconstruction property of in-distribution samples. However, they often suffer +from detecting OOD samples that share similar background information to the +in-distribution data. Based on our observation that diffusion models can +\emph{project} any sample to an in-distribution sample with similar background +information, we propose \emph{Projection Regret (PR)}, an efficient novelty +detection method that mitigates the bias of non-semantic information. To be +specific, PR computes the perceptual distance between the test image and its +diffusion-based projection to detect abnormality. Since the perceptual distance +often fails to capture semantic changes when the background information is +dominant, we cancel out the background bias by comparing it against recursive +projections. Extensive experiments demonstrate that PR outperforms the prior +art of generative-model-based novelty detection methods by a significant +margin. + +
+
+ comment: NeurIPS 2023 +
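+
+ The scoring rule described above reduces to a few lines; `project` (a noise-and-denoise pass through a pretrained diffusion model) and `lpips_dist` are assumed placeholders in this illustrative sketch.
+
+   import torch
+
+   @torch.no_grad()
+   def projection_regret(x, project, lpips_dist):
+       px = project(x)      # diffusion-based projection of the test image
+       ppx = project(px)    # recursive projection, used to cancel background-dominated distance
+       return lpips_dist(x, px) - lpips_dist(px, ppx)   # larger score => more likely OOD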
+
+
+
+
+ + ☆ A Unified Simulation Framework for Visual and Behavioral Fidelity in + Crowd Analysis + + +
+ Simulation is a powerful tool to easily generate annotated data, and a highly +desirable feature, especially in those domains where learning models need large +training datasets. Machine learning and deep learning solutions have proven to +be extremely data-hungry and sometimes, the available real-world data are not +sufficient to effectively model the given task. Despite the initial skepticism +of a portion of the scientific community, the potential of simulation has been +largely confirmed in many application areas, and the recent developments in +terms of rendering and virtualization engines have also shown a good ability +to represent complex scenes. This includes environmental factors, such as +weather conditions and surface reflectance, as well as human-related events, +like human actions and behaviors. We present a human crowd simulator, called +UniCrowd, and its associated validation pipeline. We show how the simulator can +generate annotated data, suitable for computer vision tasks, in particular for +detection and segmentation, as well as related applications such as crowd +counting, human pose estimation, trajectory analysis and prediction, and +anomaly detection. + 
+
+
+
+
+ + ☆ Panoptica -- instance-wise evaluation of 3D semantic and instance + segmentation maps + + +
+ This paper introduces panoptica, a versatile and performance-optimized +package designed for computing instance-wise segmentation quality metrics from +2D and 3D segmentation maps. panoptica addresses the limitations of existing +metrics and provides a modular framework that complements the original +intersection over union-based panoptic quality with other metrics, such as the +distance metric Average Symmetric Surface Distance. The package is open-source, +implemented in Python, and accompanied by comprehensive documentation and +tutorials. panoptica employs a three-step metrics computation process to cover +diverse use cases. The efficacy of panoptica is demonstrated on various +real-world biomedical datasets, where an instance-wise evaluation is +instrumental for an accurate representation of the underlying clinical task. +Overall, we envision panoptica as a valuable tool facilitating in-depth +evaluation of segmentation methods. + +
+
+ comment: 15 pages, 6 figures, 3 tables +
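+
+ As a reference point, the IoU-based panoptic-quality matching that the package builds on and generalizes can be sketched as below (illustrative NumPy over two instance label maps, not the panoptica API).
+
+   import numpy as np
+
+   def panoptic_quality(ref, pred, iou_thresh=0.5):
+       tp, iou_sum, matched = 0, 0.0, set()
+       ref_ids = [i for i in np.unique(ref) if i != 0]
+       pred_ids = [i for i in np.unique(pred) if i != 0]
+       for r in ref_ids:
+           r_mask = ref == r
+           for p in pred_ids:
+               if p in matched:
+                   continue
+               p_mask = pred == p
+               inter = np.logical_and(r_mask, p_mask).sum()
+               union = np.logical_or(r_mask, p_mask).sum()
+               if union and inter / union > iou_thresh:   # IoU > 0.5 gives a unique match
+                   tp += 1
+                   iou_sum += inter / union
+                   matched.add(p)
+                   break
+       fn, fp = len(ref_ids) - tp, len(pred_ids) - tp
+       denom = tp + 0.5 * fp + 0.5 * fn
+       return iou_sum / denom if denom else 0.0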
+
+
+
+
+ + ☆ Accelerating Learnt Video Codecs with Gradient Decay and Layer-wise + Distillation + + +
+ In recent years, end-to-end learnt video codecs have demonstrated their +potential to compete with conventional coding algorithms in terms of compression +efficiency. However, most learning-based video compression models are +associated with high computational complexity and latency, in particular at the +decoder side, which limits their deployment in practical applications. In this +paper, we present a novel model-agnostic pruning scheme based on gradient decay +and adaptive layer-wise distillation. Gradient decay enhances parameter +exploration during sparsification whilst preventing runaway sparsity and is +superior to the standard Straight-Through Estimation. The adaptive layer-wise +distillation regulates the sparse training in various stages based on the +distortion of intermediate features. This stage-wise design efficiently updates +parameters with minimal computational overhead. The proposed approach has been +applied to three popular end-to-end learnt video codecs, FVC, DCVC, and +DCVC-HEM. Results confirm that our method yields up to 65% reduction in MACs +and 2x speed-up with less than 0.3dB drop in BD-PSNR. Supporting code and +supplementary material can be downloaded from: +https://jasminepp.github.io/lightweightdvc/ + 
+
+
+
+
+ + ☆ An Integrated System for Spatio-Temporal Summarization of 360-degrees + Videos + + +
+ In this work, we present an integrated system for spatiotemporal +summarization of 360-degrees videos. The video summary production mainly +involves the detection of salient events and their synopsis into a concise +summary. The analysis relies on state-of-the-art methods for saliency detection +in 360-degrees video (ATSal and SST-Sal) and video summarization (CA-SUM). It +also contains a mechanism that classifies a 360-degrees video based on the use +of static or moving camera during recording and decides which saliency +detection method will be used, as well as a 2D video production component that +is responsible to create a conventional 2D video containing the salient events +in the 360-degrees video. Quantitative evaluations using two datasets for +360-degrees video saliency detection (VR-EyeTracking, Sports-360) show the +accuracy and positive impact of the developed decision mechanism, and justify +our choice to use two different methods for detecting the salient events. A +qualitative analysis using content from these datasets, gives further insights +about the functionality of the decision mechanism, shows the pros and cons of +each used saliency detection method and demonstrates the advanced performance +of the trained summarization method against a more conventional approach. + +
+
+ comment: Accepted for publication, 30th Int. Conf. on MultiMedia Modeling (MMM + 2024), Amsterdam, NL, Jan.-Feb. 2024. This is the "submitted manuscript" + version +
+
+
+
+
+ + ☆ Prompt2NeRF-PIL: Fast NeRF Generation via Pretrained Implicit Latent + + +
+ This paper explores promptable NeRF generation (e.g., text prompt or single +image prompt) for direct conditioning and fast generation of NeRF parameters +for the underlying 3D scenes, thus undoing complex intermediate steps while +providing full 3D generation with conditional control. Unlike previous +diffusion-CLIP-based pipelines that involve tedious per-prompt optimizations, +Prompt2NeRF-PIL is capable of generating a variety of 3D objects with a single +forward pass, leveraging a pre-trained implicit latent space of NeRF +parameters. Furthermore, in zero-shot tasks, our experiments demonstrate that +the NeRFs produced by our method serve as semantically informative +initializations, significantly accelerating the inference process of existing +prompt-to-NeRF methods. Specifically, we will show that our approach speeds up +the text-to-NeRF model DreamFusion and the 3D reconstruction speed of the +image-to-NeRF method Zero-1-to-3 by 3 to 5 times. + +
+
+
+
+
+ + ☆ Think Twice Before Selection: Federated Evidential Active Learning for + Medical Image Analysis with Domain Shifts + + +
+ Federated learning facilitates the collaborative learning of a global model +across multiple distributed medical institutions without centralizing data. +Nevertheless, the expensive cost of annotation on local clients remains an +obstacle to effectively utilizing local data. To mitigate this issue, federated +active learning methods suggest leveraging local and global model predictions +to select a relatively small amount of informative local data for annotation. +However, existing methods mainly focus on all local data sampled from the same +domain, making them unreliable in realistic medical scenarios with domain +shifts among different clients. In this paper, we make the first attempt to +assess the informativeness of local data derived from diverse domains and +propose a novel methodology termed Federated Evidential Active Learning (FEAL) +to calibrate the data evaluation under domain shift. Specifically, we introduce +a Dirichlet prior distribution in both local and global models to treat the +prediction as a distribution over the probability simplex and capture both +aleatoric and epistemic uncertainties by using the Dirichlet-based evidential +model. Then we employ the epistemic uncertainty to calibrate the aleatoric +uncertainty. Afterward, we design a diversity relaxation strategy to reduce +data redundancy and maintain data diversity. Extensive experiments and analyses +are conducted to show the superiority of FEAL over the state-of-the-art active +learning methods and the efficiency of FEAL under the federated active learning +framework. + +
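+
+ A small sketch of the Dirichlet-based uncertainty split mentioned above, following standard evidential deep learning (illustrative only, not the FEAL training code): alpha = evidence + 1, aleatoric uncertainty is the expected entropy, and epistemic uncertainty is the remaining mutual-information term.
+
+   import torch
+
+   def dirichlet_uncertainties(evidence):
+       # evidence: non-negative tensor of shape (batch, num_classes)
+       alpha = evidence + 1.0
+       s = alpha.sum(dim=1, keepdim=True)
+       p = alpha / s                                                  # expected class probabilities
+       total = -(p * p.clamp_min(1e-12).log()).sum(dim=1)             # entropy of the mean
+       aleatoric = -(p * (torch.digamma(alpha + 1) - torch.digamma(s + 1))).sum(dim=1)
+       epistemic = total - aleatoric                                  # mutual information
+       return aleatoric, epistemic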
+
+
+
+
+ + ☆ DemaFormer: Damped Exponential Moving Average Transformer with + Energy-Based Modeling for Temporal Language Grounding EMNLP 2023 + + +
+ Temporal Language Grounding seeks to localize video moments that semantically +correspond to a natural language query. Recent advances employ the attention +mechanism to learn the relations between video moments and the text query. +However, naive attention might not be able to appropriately capture such +relations, resulting in ineffective distributions where target video moments +are difficult to separate from the remaining ones. To resolve the issue, we +propose an energy-based model framework to explicitly learn moment-query +distributions. Moreover, we propose DemaFormer, a novel Transformer-based +architecture that utilizes exponential moving average with a learnable damping +factor to effectively encode moment-query inputs. Comprehensive experiments on +four public temporal language grounding datasets showcase the superiority of +our methods over the state-of-the-art baselines. + +
+
+ comment: Accepted at EMNLP 2023 (Findings) +
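+
+ The damped exponential moving average the abstract refers to can be illustrated with the recurrence below: a plain per-dimension sketch with learnable smoothing alpha and damping delta in (0, 1), not the actual DemaFormer layer.
+
+   import torch
+
+   def damped_ema(x, alpha, delta):
+       # x: (batch, length, dim); alpha, delta: (dim,), e.g. sigmoids of learnable parameters
+       h = torch.zeros_like(x[:, 0])
+       outs = []
+       for t in range(x.shape[1]):
+           h = alpha * x[:, t] + (1.0 - alpha * delta) * h   # h_t = a*x_t + (1 - a*d)*h_{t-1}
+           outs.append(h)
+       return torch.stack(outs, dim=1)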
+
+
+
+
+ + ☆ GeNIe: Generative Hard Negative Images Through Diffusion + + +
+ Data augmentation is crucial in training deep models, preventing them from +overfitting to limited data. Common data augmentation methods are effective, +but recent advancements in generative AI, such as diffusion models for image +generation, enable more sophisticated augmentation techniques that produce data +resembling natural images. We recognize that augmented samples closer to the +ideal decision boundary of a classifier are particularly effective and +efficient in guiding the learning process. We introduce GeNIe which leverages a +diffusion model conditioned on a text prompt to merge contrasting data points +(an image from the source category and a text prompt from the target category) +to generate challenging samples for the target category. Inspired by recent +image editing methods, we limit the number of diffusion iterations and the +amount of noise. This ensures that the generated image retains low-level and +contextual features from the source image, potentially conflicting with the +target category. Our extensive experiments, in few-shot and also long-tail +distribution settings, demonstrate the effectiveness of our novel augmentation +method, especially benefiting categories with a limited number of examples. + +
+
+ comment: Our code is available https://github.com/UCDvision/GeNIe +
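+
+ A hedged sketch of how such a hard negative could be produced with an off-the-shelf image-to-image diffusion pipeline: the source-category image is only partially noised while the prompt names the target category, so low-level and contextual features of the source survive. The model id, strength, and step count below are illustrative choices, not the authors' settings.
+
+   import torch
+   from diffusers import StableDiffusionImg2ImgPipeline
+   from PIL import Image
+
+   pipe = StableDiffusionImg2ImgPipeline.from_pretrained(
+       "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
+   ).to("cuda")
+
+   src = Image.open("source_category_image.jpg").convert("RGB").resize((512, 512))
+   hard_negative = pipe(
+       prompt="a photo of a <target category>",   # hypothetical target-category prompt
+       image=src,
+       strength=0.5,             # limits how much noise is added, i.e. few effective steps
+       num_inference_steps=30,
+       guidance_scale=7.5,
+   ).images[0]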
+
+
+
+
+ + ☆ Machine Vision Therapy: Multimodal Large Language Models Can Enhance + Visual Robustness via Denoising In-Context Learning + + +
+ Although vision models such as Contrastive Language-Image Pre-Training (CLIP) +show impressive generalization performance, their zero-shot robustness is still +limited under Out-of-Distribution (OOD) scenarios without fine-tuning. Instead +of undesirably providing human supervision as commonly done, it is possible to +take advantage of Multi-modal Large Language Models (MLLMs) that hold powerful +visual understanding abilities. However, MLLMs are shown to struggle with +vision problems due to the incompatibility of tasks, thus hindering their +utilization. In this paper, we propose to effectively leverage MLLMs to conduct +Machine Vision Therapy which aims to rectify the noisy predictions from vision +models. By fine-tuning with the denoised labels, the learning model performance +can be boosted in an unsupervised manner. To solve the incompatibility issue, +we propose a novel Denoising In-Context Learning (DICL) strategy to align +vision tasks with MLLMs. Concretely, by estimating a transition matrix that +captures the probability of one class being confused with another, an +instruction containing a correct exemplar and an erroneous one from the most +probable noisy class can be constructed. Such an instruction can help any MLLMs +with ICL ability to detect and rectify incorrect predictions of vision models. +Through extensive experiments on ImageNet, WILDS, DomainBed, and other OOD +datasets, we carefully validate the quantitative and qualitative effectiveness +of our method. Our code is available at +https://github.com/tmllab/Machine_Vision_Therapy. + +
+
+ comment: 37 pages, 19 figures, and 13 tables +
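+
+ The transition-matrix step described above can be sketched as follows; the prompt format and helper functions are hypothetical illustrations, not the released code.
+
+   import numpy as np
+
+   def estimate_transition_matrix(pred_labels, true_labels, num_classes):
+       # T[i, j] estimates the probability that class i is predicted as class j
+       T = np.zeros((num_classes, num_classes))
+       for y_hat, y in zip(pred_labels, true_labels):
+           T[y, y_hat] += 1
+       return T / T.sum(axis=1, keepdims=True).clip(min=1)
+
+   def most_confusable(c, T):
+       row = T[c].copy()
+       row[c] = -1.0               # exclude the class itself
+       return int(row.argmax())
+
+   def build_dicl_instruction(c, T, class_names, exemplar_of):
+       # pair a correct exemplar with one from the most probable confusing (noisy) class
+       noisy = most_confusable(c, T)
+       return [
+           {"image": exemplar_of(c),     "text": f"This is a {class_names[c]}."},
+           {"image": exemplar_of(noisy), "text": f"This is not a {class_names[c]}; it is a {class_names[noisy]}."},
+       ]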
+
+
+
+
+ + ☆ Graph Information Bottleneck for Remote Sensing Segmentation + + +
+ Remote sensing segmentation has a wide range of applications in environmental +protection, and urban change detection, etc. Despite the success of deep +learning-based remote sensing segmentation methods (e.g., CNN and Transformer), +they are not flexible enough to model irregular objects. In addition, existing +graph contrastive learning methods usually adopt the way of maximizing mutual +information to keep the node representations consistent between different graph +views, which may cause the model to learn task-independent redundant +information. To tackle the above problems, this paper treats images as graph +structures and introduces a simple contrastive vision GNN (SC-ViG) architecture +for remote sensing segmentation. Specifically, we construct a node-masked and +edge-masked graph view to obtain an optimal graph structure representation, +which can adaptively learn whether to mask nodes and edges. Furthermore, this +paper innovatively introduces information bottleneck theory into graph +contrastive learning to maximize task-related information while minimizing +task-independent redundant information. Finally, we replace the convolutional +module in UNet with the SC-ViG module to complete the segmentation and +classification tasks of remote sensing images. Extensive experiments on +publicly available real datasets demonstrate that our method outperforms +state-of-the-art remote sensing image segmentation methods. + +
+
+ comment: 13 pages, 6 figures +
+
+
+
+
+ + ☆ Explainable Severity ranking via pairwise n-hidden comparison: a case + study of glaucoma + + +
+ Primary open-angle glaucoma (POAG) is a chronic and progressive optic nerve +condition that results in an acquired loss of optic nerve fibers and potential +blindness. The gradual onset of glaucoma results in patients progressively +losing their vision without being consciously aware of the changes. To diagnose +POAG and determine its severity, patients must undergo a comprehensive dilated +eye examination. In this work, we build a framework to rank, compare, and +interpret the severity of glaucoma using fundus images. We introduce a +siamese-based severity ranking using pairwise n-hidden comparisons. We +additionally propose a novel approach to explain why a specific image is deemed +more severe than others. Our findings indicate that the proposed severity +ranking model surpasses traditional ones in terms of diagnostic accuracy and +delivers improved saliency explanations. + 
+
+ comment: 4 pages +
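+
+ A minimal sketch of a siamese pairwise ranking objective of the kind described above: a shared backbone scores two fundus images and a logistic loss pushes the score of the more severe image higher. The backbone and data handling are placeholders.
+
+   import torch
+   import torch.nn as nn
+
+   class SeverityRanker(nn.Module):
+       def __init__(self, backbone):
+           super().__init__()
+           self.backbone = backbone        # e.g. a CNN returning a feature vector per image
+           self.head = nn.LazyLinear(1)    # scalar severity score
+
+       def forward(self, x):
+           return self.head(self.backbone(x)).squeeze(-1)
+
+   def pairwise_ranking_loss(model, img_more_severe, img_less_severe):
+       s1, s2 = model(img_more_severe), model(img_less_severe)
+       # RankNet-style logistic loss encouraging s1 > s2
+       return nn.functional.softplus(-(s1 - s2)).mean()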
+
+
+
+
+ + ☆ Towards Open-set Gesture Recognition via Feature Activation Enhancement + and Orthogonal Prototype Learning + + +
+ Gesture recognition is a foundational task in human-machine interaction +(HMI). While there has been significant progress in gesture recognition based +on surface electromyography (sEMG), accurate recognition of predefined gestures +only within a closed set is still inadequate in practice. It is essential to +effectively discern and reject unknown gestures of disinterest in a robust +system. Numerous methods based on prototype learning (PL) have been proposed to +tackle this open set recognition (OSR) problem. However, they do not fully +explore the inherent distinctions between known and unknown classes. In this +paper, we propose a more effective PL method leveraging two novel and inherent +distinctions, feature activation level and projection inconsistency. +Specifically, the Feature Activation Enhancement Mechanism (FAEM) widens the +gap in feature activation values between known and unknown classes. +Furthermore, we introduce Orthogonal Prototype Learning (OPL) to construct +multiple perspectives. OPL acts to project a sample from orthogonal directions +to maximize the distinction between its two projections, where unknown samples +will be projected near the clusters of different known classes while known +samples still maintain intra-class similarity. Our proposed method +simultaneously achieves accurate closed-set classification for predefined +gestures and effective rejection for unknown gestures. Extensive experiments +demonstrate its efficacy and superiority in open-set gesture recognition based +on sEMG. + +
+
+
+
+
+ + ☆ Towards Automatic Power Battery Detection: New Challenge, Benchmark + Dataset and Baseline + + +
+ We conduct a comprehensive study on a new task named power battery detection +(PBD), which aims to localize the dense cathode and anode plates endpoints from +X-ray images to evaluate the quality of power batteries. Existing manufacturers +usually rely on human eye observation to complete PBD, which makes it difficult +to balance the accuracy and efficiency of detection. To address this issue and +draw more attention to this meaningful task, we first elaborately collect a +dataset, called X-ray PBD, which has 1,500 diverse X-ray images selected from +thousands of power batteries of 5 manufacturers, with 7 different types of visual +interference. Then, we propose a novel segmentation-based solution for PBD, +termed multi-dimensional collaborative network (MDCNet). With the help of line +and counting predictors, the representation of the point segmentation branch +can be improved at both semantic and detail aspects. Besides, we design an +effective distance-adaptive mask generation strategy, which can alleviate the +visual challenge caused by the inconsistent distribution density of plates to +provide MDCNet with stable supervision. Without any bells and whistles, our +segmentation-based MDCNet consistently outperforms various other corner +detection, crowd counting and general/tiny object detection-based solutions, +making it a strong baseline that can help facilitate future research in PBD. +Finally, we discuss potential difficulties and directions for future research. +The source code and datasets will be publicly available at X-ray PBD +(http://www.gy3000.company/x3000%e5%bc%80%e6%94%be%e5%b9%b3%e5%8f%b0). + 
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Retrieving Conditions from Reference Images for Diffusion Models + + +
+ Recent diffusion-based subject driven generative methods have enabled image +generations with good fidelity for specific objects or human portraits. +However, to achieve better versatility for applications, we argue that not only +are improved datasets and evaluations desired, but also more careful methods to +retrieve only relevant information from conditional images. To +this end, we propose an anime figures dataset RetriBooru-V1, with enhanced +identity and clothing labels. We state new tasks enabled by this dataset, and +introduce a new diversity metric to measure success in completing these tasks, +quantifying the flexibility of image generations. We establish an RAG-inspired +baseline method, designed to retrieve precise conditional information from +reference images. Then, we compare with current methods on existing tasks to +demonstrate the capability of the proposed method. Finally, we provide baseline +experiment results on new tasks, and conduct ablation studies on the possible +structural choices. + 
+
+
+
+
+ + ☆ Towards More Unified In-context Visual Understanding + + +
+ The rapid advancement of large language models (LLMs) has accelerated the +emergence of in-context learning (ICL) as a cutting-edge approach in the +natural language processing domain. Recently, ICL has been employed in visual +understanding tasks, such as semantic segmentation and image captioning, +yielding promising results. However, existing visual ICL frameworks cannot +produce content across multiple modalities, which limits their +potential usage scenarios. To address this issue, we present a new ICL +framework for visual understanding with multi-modal output enabled. First, we +quantize and embed both text and visual prompts into a unified representational +space, structured as interleaved in-context sequences. Then a decoder-only +sparse transformer architecture is employed to perform generative modeling on +them, facilitating in-context learning. Thanks to this design, the model is +capable of handling in-context vision understanding tasks with multimodal +output in a unified pipeline. Experimental results demonstrate that our model +achieves competitive performance compared with specialized models and previous +ICL baselines. Overall, our research takes a further step toward unified +multimodal in-context learning. + 
+
+
+
+
+ + ☆ AV2AV: Direct Audio-Visual Speech to Audio-Visual Speech Translation + with Unified Audio-Visual Speech Representation + + +
+ This paper proposes a novel direct Audio-Visual Speech to Audio-Visual Speech +Translation (AV2AV) framework, where the input and output of the system are +multimodal (i.e., audio and visual speech). With the proposed AV2AV, two key +advantages can be brought: 1) We can perform real-like conversations with +individuals worldwide in a virtual meeting by utilizing our own primary +languages. In contrast to Speech-to-Speech Translation (A2A), which solely +translates between audio modalities, the proposed AV2AV directly translates +between audio-visual speech. This capability enhances the dialogue experience +by presenting synchronized lip movements along with the translated speech. 2) +We can improve the robustness of the spoken language translation system. By +employing the complementary information of audio-visual speech, the system can +effectively translate spoken language even in the presence of acoustic noise, +showcasing robust performance. To mitigate the problem of the absence of a +parallel AV2AV translation dataset, we propose to train our spoken language +translation system with the audio-only dataset of A2A. This is done by learning +unified audio-visual speech representations through self-supervised learning in +advance to train the translation system. Moreover, we propose an AV-Renderer +that can generate raw audio and video in parallel. It is designed with +zero-shot speaker modeling, thus the speaker in source audio-visual speech can +be maintained at the target translated audio-visual speech. The effectiveness +of AV2AV is evaluated with extensive experiments in a many-to-many language +translation setting. The demo page is available on +https://choijeongsoo.github.io/av2av. + +
+
+
+
+
+ + ☆ SAVE: Protagonist Diversification with Structure Agnostic Video Editing + + +
+ Driven by the rapid progress of text-to-image (T2I) generation models, text-to-video (T2V) generation has advanced significantly as well. Accordingly, tasks such as modifying an object or changing the style of a video have become possible. However, previous works usually perform well only on simple, consistent shapes and easily collapse on a difficult target whose body shape differs greatly from the original one. In this paper, we identify the bias problem in existing video editing methods that restricts the range of choices for the new protagonist, and we address this issue using a conventional image-level personalization method. We adopt motion personalization, which isolates the motion from a single source video and then modifies the protagonist accordingly. To deal with the natural discrepancy between image and video, we propose a motion word with an inflated textual embedding to properly represent the motion in a source video. We also regulate the motion word to attend to the proper motion-related areas by introducing a novel pseudo optical flow, efficiently computed from the pre-calculated attention maps. Finally, we decouple the motion from the appearance of the source video with an additional pseudo word. Extensive experiments demonstrate the editing capability of our method, taking a step toward more diverse and extensive video editing.
+
+ comment: Project website: https://ldynx.github.io/SAVE/ +
+
+
+
+
+ + ☆ Inspecting Model Fairness in Ultrasound Segmentation Tasks + + +
+ With the rapid expansion of machine learning and deep learning (DL), +researchers are increasingly employing learning-based algorithms to alleviate +diagnostic challenges across diverse medical tasks and applications. While +advancements in diagnostic precision are notable, some researchers have +identified a concerning trend: their models exhibit biased performance across +subgroups characterized by different sensitive attributes. This bias not only +infringes upon the rights of patients but also has the potential to lead to +life-altering consequences. In this paper, we inspect a series of DL +segmentation models using two ultrasound datasets, aiming to assess the +presence of model unfairness in these specific tasks. Our findings reveal that +even state-of-the-art DL algorithms demonstrate unfair behavior in ultrasound +segmentation tasks. These results serve as a crucial warning, underscoring the +necessity for careful model evaluation before their deployment in real-world +scenarios. Such assessments are imperative to ensure ethical considerations and +mitigate the risk of adverse impacts on patient outcomes. + +
+
+ comment: Submitted to ISBI 2024 +
+
+
+
+
+ + ☆ ReconU-Net: a direct PET image reconstruction using U-Net architecture + with back projection-induced skip connection + + +
+ [Objective] This study aims to introduce a novel back projection-induced +U-Net-shaped architecture, called ReconU-Net, for deep learning-based direct +positron emission tomography (PET) image reconstruction. Additionally, our +objective is to analyze the behavior of direct PET image reconstruction and +gain deeper insights by comparing the proposed ReconU-Net architecture with +other encoder-decoder architectures without skip connections. [Approach] The +proposed ReconU-Net architecture uniquely integrates the physical model of the +back projection operation into the skip connection. This distinctive feature +facilitates the effective transfer of intrinsic spatial information from the +input sinogram to the reconstructed image via an embedded physical model. The +proposed ReconU-Net was trained using Monte Carlo simulation data from the +Brainweb phantom and tested on both simulated and real Hoffman brain phantom +data. [Main results] The proposed ReconU-Net method generated a reconstructed +image with a more accurate structure compared to other deep learning-based +direct reconstruction methods. Further analysis showed that the proposed +ReconU-Net architecture has the ability to transfer features of multiple +resolutions, especially non-abstract high-resolution information, through skip +connections. Despite limited training on simulated data, the proposed +ReconU-Net successfully reconstructed the real Hoffman brain phantom, unlike +other deep learning-based direct reconstruction methods, which failed to +produce a reconstructed image. [Significance] The proposed ReconU-Net can +improve the fidelity of direct PET image reconstruction, even when dealing with +small training datasets, by leveraging the synergistic relationship between +data-driven modeling and the physics model of the imaging process. + +
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ☆ Flexible Communication for Optimal Distributed Learning over + Unpredictable Networks + + +
+ Gradient compression alleviates expensive communication in distributed deep learning by sending fewer values and their corresponding indices, typically via Allgather (AG). Training with a high compression ratio (CR) achieves accuracy close to DenseSGD, but has lower parallel scaling (parallel efficiency) due to the high communication cost. Using lower CRs improves parallel efficiency by lowering the synchronization cost, but also degrades model accuracy (statistical efficiency). Further, the speedup attained with different models and CRs also varies with the network latency, the effective bandwidth, and the collective op used for aggregation. In many cases, collectives like Allreduce (AR) have lower cost than AG to exchange the same amount of data. In this paper, we propose an AR-compatible Topk compressor that is bandwidth-optimal and thus performs better than AG in certain network configurations. We develop a flexible communication strategy that switches between AG and AR based on which collective is optimal in the current setting, and we model the Pareto relationship between parallel and statistical efficiency as a multi-objective optimization (MOO) problem to dynamically adjust the CR and accelerate training while still converging to high accuracy.
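As a rough illustration of the AG/AR switching idea above, the sketch below compresses a gradient with Top-k and picks the cheaper collective under a toy alpha-beta cost model; the cost model, constants, and function names are illustrative assumptions, not the paper's actual strategy or MOO controller.

```python
import torch
import torch.distributed as dist

def topk_compress(grad: torch.Tensor, cr: float) -> torch.Tensor:
    """Keep the top cr-fraction of entries by magnitude; zero out the rest."""
    k = max(1, int(grad.numel() * cr))
    flat = grad.flatten()
    idx = flat.abs().topk(k).indices
    sparse = torch.zeros_like(flat)
    sparse[idx] = flat[idx]
    return sparse.view_as(grad)

def toy_cost(collective: str, numel: int, cr: float, world: int,
             latency: float, bandwidth: float) -> float:
    """Illustrative alpha-beta cost model for the two collectives."""
    if collective == "allgather":          # each rank contributes k values + k indices
        volume = 2 * int(numel * cr) * world
    else:                                  # ring allreduce on the dense masked tensor
        volume = 2 * numel * (world - 1) / world
    return latency + volume / bandwidth

def aggregate(grad, cr, world, latency, bandwidth):
    """Compress, then use whichever collective the cost model says is cheaper."""
    dense = topk_compress(grad, cr)
    if toy_cost("allreduce", grad.numel(), cr, world, latency, bandwidth) < \
       toy_cost("allgather", grad.numel(), cr, world, latency, bandwidth):
        dist.all_reduce(dense)             # AR path
        return dense / world
    gathered = [torch.zeros_like(dense) for _ in range(world)]
    dist.all_gather(gathered, dense)       # AG path, then reduce locally
    return torch.stack(gathered).mean(dim=0)
```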
+
+ comment: 2023 IEEE International Conference on Big Data (BigData) +
+
+
+
+
+ + ☆ EtC: Temporal Boundary Expand then Clarify for Weakly Supervised Video + Grounding with Multimodal Large Language Model + + +
+ Early weakly supervised video grounding (WSVG) methods often struggle with +incomplete boundary detection due to the absence of temporal boundary +annotations. To bridge the gap between video-level and boundary-level +annotation, explicit-supervision methods, i.e., generating pseudo-temporal +boundaries for training, have achieved great success. However, data +augmentations in these methods might disrupt critical temporal information, +yielding poor pseudo boundaries. In this paper, we propose a new perspective +that maintains the integrity of the original temporal content while introducing +more valuable information for expanding the incomplete boundaries. To this end, +we propose EtC (Expand then Clarify), first use the additional information to +expand the initial incomplete pseudo boundaries, and subsequently refine these +expanded ones to achieve precise boundaries. Motivated by video continuity, +i.e., visual similarity across adjacent frames, we use powerful multimodal +large language models (MLLMs) to annotate each frame within initial pseudo +boundaries, yielding more comprehensive descriptions for expanded boundaries. +To further clarify the noise of expanded boundaries, we combine mutual learning +with a tailored proposal-level contrastive objective to use a learnable +approach to harmonize a balance between incomplete yet clean (initial) and +comprehensive yet noisy (expanded) boundaries for more precise ones. +Experiments demonstrate the superiority of our method on two challenging WSVG +datasets. + +
+
+
+
+
+ + ☆ Learning to Holistically Detect Bridges from Large-Size VHR Remote + Sensing Imagery + + +
+ Bridge detection in remote sensing images (RSIs) plays a crucial role in various applications, but it poses unique challenges compared to the detection of other objects. In RSIs, bridges exhibit considerable variations in terms of their spatial scales and aspect ratios. Therefore, to ensure the visibility and integrity of bridges, it is essential to perform holistic bridge detection in large-size very-high-resolution (VHR) RSIs. However, the lack of datasets with large-size VHR RSIs limits the performance of deep learning algorithms on bridge detection. Due to the limitation of GPU memory in tackling large-size images, deep learning-based object detection methods commonly adopt the cropping strategy, which inevitably results in label fragmentation and discontinuous prediction. To ameliorate the scarcity of datasets, this paper proposes a large-scale dataset named GLH-Bridge comprising 6,000 VHR RSIs sampled from diverse geographic locations across the globe. These images encompass a wide range of sizes, varying from 2,048*2,048 to 16,384*16,384 pixels, and collectively feature 59,737 bridges. Furthermore, we present an efficient network for holistic bridge detection (HBD-Net) in large-size RSIs. The HBD-Net presents a separate detector-based feature fusion (SDFF) architecture and is optimized via a shape-sensitive sample re-weighting (SSRW) strategy. Based on the proposed GLH-Bridge dataset, we establish a bridge detection benchmark including the OBB and HBB tasks, and validate the effectiveness of the proposed HBD-Net. Additionally, cross-dataset generalization experiments on two publicly available datasets illustrate the strong generalization capability of the GLH-Bridge dataset.
+
+ comment: 16 pages, 11 figures, 6 tables; due to the limitation "The abstract + field cannot be longer than 1,920 characters", the abstract appearing here is + slightly shorter than that in the PDF file +
+
+
+
+
+ + ☆ Differentiable Point-based Inverse Rendering + + +
+ We present differentiable point-based inverse rendering, DPIR, an +analysis-by-synthesis method that processes images captured under diverse +illuminations to estimate shape and spatially-varying BRDF. To this end, we +adopt point-based rendering, eliminating the need for multiple samplings per +ray, typical of volumetric rendering, thus significantly enhancing the speed of +inverse rendering. To realize this idea, we devise a hybrid point-volumetric +representation for geometry and a regularized basis-BRDF representation for +reflectance. The hybrid geometric representation enables fast rendering through +point-based splatting while retaining the geometric details and stability +inherent to SDF-based representations. The regularized basis-BRDF mitigates the +ill-posedness of inverse rendering stemming from limited light-view angular +samples. We also propose an efficient shadow detection method using point-based +shadow map rendering. Our extensive evaluations demonstrate that DPIR +outperforms prior works in terms of reconstruction accuracy, computational +efficiency, and memory footprint. Furthermore, our explicit point-based +representation and rendering enables intuitive geometry and reflectance +editing. The code will be publicly available. + +
+
+
+
+
+ + ☆ Generator Born from Classifier + + +
+ In this paper, we make a bold attempt toward an ambitious task: given a +pre-trained classifier, we aim to reconstruct an image generator, without +relying on any data samples. From a black-box perspective, this challenge seems +intractable, since it inevitably involves identifying the inverse function for +a classifier, which is, by nature, an information extraction process. As such, +we resort to leveraging the knowledge encapsulated within the parameters of the +neural network. Grounded on the theory of Maximum-Margin Bias of gradient +descent, we propose a novel learning paradigm, in which the generator is +trained to ensure that the convergence conditions of the network parameters are +satisfied over the generated distribution of the samples. Empirical validation +from various image generation tasks substantiates the efficacy of our strategy. + +
+
+
+
+
+ + ☆ Learning Energy-based Model via Dual-MCMC Teaching + + +
+ This paper studies the fundamental learning problem of the energy-based model +(EBM). Learning the EBM can be achieved using the maximum likelihood estimation +(MLE), which typically involves the Markov Chain Monte Carlo (MCMC) sampling, +such as the Langevin dynamics. However, the noise-initialized Langevin dynamics +can be challenging in practice and hard to mix. This motivates the exploration +of joint training with the generator model where the generator model serves as +a complementary model to bypass MCMC sampling. However, such a method can be +less accurate than the MCMC and result in biased EBM learning. While the +generator can also serve as an initializer model for better MCMC sampling, its +learning can be biased since it only matches the EBM and has no access to +empirical training examples. Such biased generator learning may limit the +potential of learning the EBM. To address this issue, we present a joint +learning framework that interweaves the maximum likelihood learning algorithm +for both the EBM and the complementary generator model. In particular, the +generator model is learned by MLE to match both the EBM and the empirical data +distribution, making it a more informative initializer for MCMC sampling of +EBM. Learning generator with observed examples typically requires inference of +the generator posterior. To ensure accurate and efficient inference, we adopt +the MCMC posterior sampling and introduce a complementary inference model to +initialize such latent MCMC sampling. We show that three separate models can be +seamlessly integrated into our joint framework through two (dual-) MCMC +teaching, enabling effective and efficient EBM learning. + +
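A minimal sketch of generator-initialized MCMC in the spirit of the description above: Langevin dynamics on the EBM's energy, started from generator samples instead of noise. The `ebm` and `generator` callables, step count, and step size are placeholders, and the joint MLE training loop and the inference model are omitted.

```python
import torch

def langevin_from_generator(ebm, generator, z_dim, n_samples,
                            steps=30, step_size=0.01, device="cpu"):
    """Short-run Langevin sampling of the EBM, initialized by the generator.

    `ebm(x)` is assumed to return one scalar energy per sample and
    `generator(z)` to map latent codes to images (illustrative names).
    """
    z = torch.randn(n_samples, z_dim, device=device)
    x = generator(z).detach()                      # informative initialization
    for _ in range(steps):
        x = x.clone().requires_grad_(True)
        energy = ebm(x).sum()
        grad = torch.autograd.grad(energy, x)[0]
        # Langevin update: descend the energy plus injected Gaussian noise.
        x = (x - 0.5 * step_size ** 2 * grad
             + step_size * torch.randn_like(x)).detach()
    return x
```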
+
+
+
+
+ + ☆ SAM-Assisted Remote Sensing Imagery Semantic Segmentation with Object + and Boundary Constraints + + +
+ Semantic segmentation of remote sensing imagery plays a pivotal role in +extracting precise information for diverse down-stream applications. Recent +development of the Segment Anything Model (SAM), an advanced general-purpose +segmentation model, has revolutionized this field, presenting new avenues for +accurate and efficient segmentation. However, SAM is limited to generating +segmentation results without class information. Consequently, the utilization +of such a powerful general vision model for semantic segmentation in remote +sensing images has become a focal point of research. In this paper, we present +a streamlined framework aimed at leveraging the raw output of SAM by exploiting +two novel concepts called SAM-Generated Object (SGO) and SAM-Generated Boundary +(SGB). More specifically, we propose a novel object loss and further introduce +a boundary loss as augmentative components to aid in model optimization in a +general semantic segmentation framework. Taking into account the content +characteristics of SGO, we introduce the concept of object consistency to +leverage segmented regions lacking semantic information. By imposing +constraints on the consistency of predicted values within objects, the object +loss aims to enhance semantic segmentation performance. Furthermore, the +boundary loss capitalizes on the distinctive features of SGB by directing the +model's attention to the boundary information of the object. Experimental +results on two well-known datasets, namely ISPRS Vaihingen and LoveDA Urban, +demonstrate the effectiveness of our proposed method. The source code for this +work will be accessible at https://github.com/sstary/SSRS. + +
+
+ comment: 10 pages, 4 figures +
+
+
+
+
+ + ☆ Let's Think Outside the Box: Exploring Leap-of-Thought in Large Language + Models with Creative Humor Generation + + +
+ Chain-of-Thought (CoT) guides large language models (LLMs) to reason +step-by-step, and can motivate their logical reasoning ability. While effective +for logical tasks, CoT is not conducive to creative problem-solving which often +requires out-of-box thoughts and is crucial for innovation advancements. In +this paper, we explore the Leap-of-Thought (LoT) abilities within LLMs -- a +non-sequential, creative paradigm involving strong associations and knowledge +leaps. To this end, we study LLMs on the popular Oogiri game which needs +participants to have good creativity and strong associative thinking for +responding unexpectedly and humorously to the given image, text, or both, and +thus is suitable for LoT study. Then to investigate LLMs' LoT ability in the +Oogiri game, we first build a multimodal and multilingual Oogiri-GO dataset +which contains over 130,000 samples from the Oogiri game, and observe the +insufficient LoT ability or failures of most existing LLMs on the Oogiri game. +Accordingly, we introduce a creative Leap-of-Thought (CLoT) paradigm to improve +LLM's LoT ability. CLoT first formulates the Oogiri-GO dataset into +LoT-oriented instruction tuning data to train pretrained LLM for achieving +certain LoT humor generation and discrimination abilities. Then CLoT designs an +explorative self-refinement that encourages the LLM to generate more creative +LoT data via exploring parallels between seemingly unrelated concepts and +selects high-quality data to train itself for self-refinement. CLoT not only +excels in humor generation in the Oogiri game but also boosts creative +abilities in various tasks like cloud guessing game and divergent association +task. These findings advance our understanding and offer a pathway to improve +LLMs' creative capacities for innovative applications across domains. The +dataset, code, and models will be released online. +https://github.com/sail-sg/CLoT. + +
+
+ comment: Technical report +
+
+
+
+
+ + ☆ GDN: A Stacking Network Used for Skin Cancer Diagnosis SP + + +
+ Skin cancer, the primary type of cancer that can be identified by visual recognition, requires an automatic identification system that can accurately classify different types of lesions. This paper presents the GoogLe-Dense Network (GDN), an image-classification model that identifies two types of skin cancer, Basal Cell Carcinoma and Melanoma. GDN stacks different networks to enhance model performance. Specifically, GDN consists of two sequential levels. The first level performs the basic classification tasks, accomplished by GoogLeNet and DenseNet, which are trained in parallel to enhance efficiency. To avoid low accuracy and long training time, the second level takes the outputs of GoogLeNet and DenseNet as the input to a logistic regression model. We compare our method with four baseline networks, including ResNet, VGGNet, DenseNet, and GoogLeNet, on the dataset, where GoogLeNet and DenseNet significantly outperform ResNet and VGGNet. For the second level, different stacking methods such as perceptron, logistic regression, SVM, decision trees, and K-nearest neighbors are studied, among which logistic regression shows the best prediction results. The results show that GDN, compared to a single network structure, achieves higher accuracy in skin cancer detection.
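The two-level stacking described above can be sketched as follows: the class-probability outputs of the two first-level networks are concatenated and fit with a logistic regression meta-learner (variable names and the use of scikit-learn are assumptions for illustration).

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

def fit_stacker(probs_googlenet: np.ndarray, probs_densenet: np.ndarray,
                labels: np.ndarray) -> LogisticRegression:
    """Second level: logistic regression on the concatenated first-level outputs."""
    meta_features = np.concatenate([probs_googlenet, probs_densenet], axis=1)
    meta = LogisticRegression(max_iter=1000)
    meta.fit(meta_features, labels)
    return meta

# Inference: meta.predict(np.concatenate([p_googlenet_test, p_densenet_test], axis=1))
```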
+
+ comment: Published at ICSPS 2021 +
+
+
+
+
+ + ☆ FINER: Flexible spectral-bias tuning in Implicit NEural Representation + by Variable-periodic Activation Functions + + +
+ Implicit Neural Representation (INR), which utilizes a neural network to map +coordinate inputs to corresponding attributes, is causing a revolution in the +field of signal processing. However, current INR techniques suffer from a +restricted capability to tune their supported frequency set, resulting in +imperfect performance when representing complex signals with multiple +frequencies. We have identified that this frequency-related problem can be +greatly alleviated by introducing variable-periodic activation functions, for +which we propose FINER. By initializing the bias of the neural network within +different ranges, sub-functions with various frequencies in the +variable-periodic function are selected for activation. Consequently, the +supported frequency set of FINER can be flexibly tuned, leading to improved +performance in signal representation. We demonstrate the capabilities of FINER +in the contexts of 2D image fitting, 3D signed distance field representation, +and 5D neural radiance fields optimization, and we show that it outperforms +existing INRs. + +
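A minimal sketch of an INR layer with a variable-periodic activation in the spirit of the description above; the particular form sin((|z|+1)·z) and the wide bias initialization are assumptions for illustration and may differ from the paper's exact formulation.

```python
import torch
import torch.nn as nn

class VariablePeriodicLayer(nn.Module):
    """Linear layer followed by a variable-periodic activation.

    The bias is initialized over a wide range (controlled by bias_range) so that
    different neurons land on sub-functions with different local frequencies.
    """
    def __init__(self, in_dim: int, out_dim: int, omega: float = 30.0,
                 bias_range: float = 1.0):
        super().__init__()
        self.omega = omega
        self.linear = nn.Linear(in_dim, out_dim)
        nn.init.uniform_(self.linear.bias, -bias_range, bias_range)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        z = self.omega * self.linear(x)
        return torch.sin((z.abs() + 1.0) * z)   # local frequency grows with |z|
```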
+
+ comment: 10 pages, 9 figures +
+
+
+
+
+ + ☆ Lenna: Language Enhanced Reasoning Detection Assistant + + +
+ With the fast-paced development of multimodal large language models (MLLMs), +we can now converse with AI systems in natural languages to understand images. +However, the reasoning power and world knowledge embedded in the large language +models have been much less investigated and exploited for image perception +tasks. In this paper, we propose Lenna, a language-enhanced reasoning detection +assistant, which utilizes the robust multimodal feature representation of +MLLMs, while preserving location information for detection. This is achieved by +incorporating an additional token in the MLLM vocabulary that is free of +explicit semantic context but serves as a prompt for the detector to identify +the corresponding position. To evaluate the reasoning capability of Lenna, we +construct a ReasonDet dataset to measure its performance on reasoning-based +detection. Remarkably, Lenna demonstrates outstanding performance on ReasonDet +and comes with significantly low training costs. It also incurs minimal +transferring overhead when extended to other tasks. Our code and model will be +available at https://git.io/Lenna. + +
+
+
+
+
+ + ☆ Orthogonal Adaptation for Modular Customization of Diffusion Models + + +
+ Customization techniques for text-to-image models have paved the way for a +wide range of previously unattainable applications, enabling the generation of +specific concepts across diverse contexts and styles. While existing methods +facilitate high-fidelity customization for individual concepts or a limited, +pre-defined set of them, they fall short of achieving scalability, where a +single model can seamlessly render countless concepts. In this paper, we +address a new problem called Modular Customization, with the goal of +efficiently merging customized models that were fine-tuned independently for +individual concepts. This allows the merged model to jointly synthesize +concepts in one image without compromising fidelity or incurring any additional +computational costs. + To address this problem, we introduce Orthogonal Adaptation, a method +designed to encourage the customized models, which do not have access to each +other during fine-tuning, to have orthogonal residual weights. This ensures +that during inference time, the customized models can be summed with minimal +interference. + Our proposed method is both simple and versatile, applicable to nearly all +optimizable weights in the model architecture. Through an extensive set of +quantitative and qualitative evaluations, our method consistently outperforms +relevant baselines in terms of efficiency and identity preservation, +demonstrating a significant leap toward scalable customization of diffusion +models. + +
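One loose way to picture the orthogonal-residual idea is a LoRA-style adapter whose frozen down-projection rows are taken from a shared orthogonal basis, with each concept assigned a disjoint slice; summing such residuals at merge time then causes little interference. This is a sketch under stated assumptions, not the paper's exact construction.

```python
import torch
import torch.nn as nn

class OrthogonalAdapter(nn.Module):
    """LoRA-style residual with a frozen, concept-specific orthogonal down-projection."""
    def __init__(self, base_linear: nn.Linear, basis: torch.Tensor,
                 start: int, rank: int):
        super().__init__()
        self.base = base_linear
        for p in self.base.parameters():
            p.requires_grad_(False)
        # Rows start:start+rank of a shared orthogonal matrix (kept frozen).
        self.register_buffer("A", basis[start:start + rank].clone())
        self.B = nn.Parameter(torch.zeros(base_linear.out_features, rank))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.base(x) + x @ self.A.t() @ self.B.t()

# Shared orthogonal basis, e.g.: basis, _ = torch.linalg.qr(torch.randn(in_dim, in_dim))
# Merging: sum the residual weights B_i @ A_i of independently fine-tuned concepts.
```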
+
+ comment: Project page: https://ryanpo.com/ortha/ +
+
+
+
+
+ + ☆ FreestyleRet: Retrieving Images from Style-Diversified Queries + + +
+ Image retrieval aims to retrieve corresponding images based on a given query. In application scenarios, users intend to express their retrieval intent through various query styles. However, current retrieval tasks predominantly focus on text-query retrieval, leading to limited retrieval query options and potential ambiguity or bias in user intention. In this paper, we propose the Style-Diversified Query-Based Image Retrieval task, which enables retrieval based on various query styles. To facilitate this novel setting, we propose the first Diverse-Style Retrieval dataset, encompassing diverse query styles including text, sketch, low-resolution, and art. We also propose a lightweight style-diversified retrieval framework. For the various query-style inputs, we apply the Gram matrix to extract the query's textural features and cluster them into a style space with style-specific bases. Then we employ a style-init prompt tuning module to enable the visual encoder to comprehend the texture and style information of the query. Experiments demonstrate that our model, employing the style-init prompt tuning strategy, outperforms existing retrieval models on the style-diversified retrieval task. Moreover, style-diversified queries (sketch+text, art+text, etc.) can be retrieved simultaneously with our model. The auxiliary information from the other queries enhances the retrieval performance for the respective query.
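The Gram-matrix texture descriptor mentioned above can be computed as in the sketch below; normalizing by the number of spatial positions is a common convention assumed here.

```python
import torch

def gram_matrix(features: torch.Tensor) -> torch.Tensor:
    """Texture/style descriptor of a feature map.

    features: (batch, channels, height, width) -> (batch, channels, channels).
    """
    b, c, h, w = features.shape
    f = features.view(b, c, h * w)
    return f @ f.transpose(1, 2) / (h * w)
```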
+
+ comment: 16 pages, 7 figures +
+
+
+
+
+ + ☆ Towards Granularity-adjusted Pixel-level Semantic Annotation + + +
+ Recent advancements in computer vision predominantly rely on learning-based +systems, leveraging annotations as the driving force to develop specialized +models. However, annotating pixel-level information, particularly in semantic +segmentation, presents a challenging and labor-intensive task, prompting the +need for autonomous processes. In this work, we propose GranSAM which +distinguishes itself by providing semantic segmentation at the user-defined +granularity level on unlabeled data without the need for any manual +supervision, offering a unique contribution in the realm of semantic mask +annotation method. Specifically, we propose an approach to enable the Segment +Anything Model (SAM) with semantic recognition capability to generate +pixel-level annotations for images without any manual supervision. For this, we +accumulate semantic information from synthetic images generated by the Stable +Diffusion model or web crawled images and employ this data to learn a mapping +function between SAM mask embeddings and object class labels. As a result, SAM, +enabled with granularity-adjusted mask recognition, can be used for pixel-level +semantic annotation purposes. We conducted experiments on the PASCAL VOC 2012 +and COCO-80 datasets and observed a +17.95% and +5.17% increase in mIoU, +respectively, compared to existing state-of-the-art methods when evaluated +under our problem setting. + +
+
+
+
+
+ + ☆ MGTR: Multi-Granular Transformer for Motion Prediction with LiDAR + + +
+ Motion prediction has been an essential component of autonomous driving +systems since it handles highly uncertain and complex scenarios involving +moving agents of different types. In this paper, we propose a Multi-Granular +TRansformer (MGTR) framework, an encoder-decoder network that exploits context +features in different granularities for different kinds of traffic agents. To +further enhance MGTR's capabilities, we leverage LiDAR point cloud data by +incorporating LiDAR semantic features from an off-the-shelf LiDAR feature +extractor. We evaluate MGTR on Waymo Open Dataset motion prediction benchmark +and show that the proposed method achieved state-of-the-art performance, +ranking 1st on its leaderboard +(https://waymo.com/open/challenges/2023/motion-prediction/). + +
+
+
+
+
+ + ☆ FERGI: Automatic Annotation of User Preferences for Text-to-Image + Generation from Spontaneous Facial Expression Reaction + + +
+ Researchers have proposed to use data of human preference feedback to +fine-tune text-to-image generative models. However, the scalability of human +feedback collection has been limited by its reliance on manual annotation. +Therefore, we develop and test a method to automatically annotate user +preferences from their spontaneous facial expression reaction to the generated +images. We collect a dataset of Facial Expression Reaction to Generated Images +(FERGI) and show that the activations of multiple facial action units (AUs) are +highly correlated with user evaluations of the generated images. Specifically, +AU4 (brow lowerer) is most consistently reflective of negative evaluations of +the generated image. This can be useful in two ways. Firstly, we can +automatically annotate user preferences between image pairs with substantial +difference in AU4 responses to them with an accuracy significantly +outperforming state-of-the-art scoring models. Secondly, directly integrating +the AU4 responses with the scoring models improves their consistency with human +preferences. Additionally, the AU4 response best reflects the user's evaluation +of the image fidelity, making it complementary to the state-of-the-art scoring +models, which are generally better at reflecting image-text alignment. Finally, +this method of automatic annotation with facial expression analysis can be +potentially generalized to other generation tasks. The code is available at +https://github.com/ShuangquanFeng/FERGI, and the dataset is also available at +the same link for research purposes. + +
+
+
+
+
+ + ☆ HybridNeRF: Efficient Neural Rendering via Adaptive Volumetric Surfaces + + +
+ Neural radiance fields provide state-of-the-art view synthesis quality but +tend to be slow to render. One reason is that they make use of volume +rendering, thus requiring many samples (and model queries) per ray at render +time. Although this representation is flexible and easy to optimize, most +real-world objects can be modeled more efficiently with surfaces instead of +volumes, requiring far fewer samples per ray. This observation has spurred +considerable progress in surface representations such as signed distance +functions, but these may struggle to model semi-opaque and thin structures. We +propose a method, HybridNeRF, that leverages the strengths of both +representations by rendering most objects as surfaces while modeling the +(typically) small fraction of challenging regions volumetrically. We evaluate +HybridNeRF against the challenging Eyeful Tower dataset along with other +commonly used view synthesis datasets. When comparing to state-of-the-art +baselines, including recent rasterization-based approaches, we improve error +rates by 15-30% while achieving real-time framerates (at least 36 FPS) for +virtual-reality resolutions (2Kx2K). + +
+
+ comment: Project page: https://haithemturki.com/hybrid-nerf/ +
+
+
+
+
+ + ☆ ViscoNet: Bridging and Harmonizing Visual and Textual Conditioning for + ControlNet + + +
+ This paper introduces ViscoNet, a novel method that enhances text-to-image human generation models with visual prompting. Unlike existing methods that rely on lengthy text descriptions to control the image structure, ViscoNet allows users to specify the visual appearance of the target object with a reference image. ViscoNet disentangles the object's appearance from the image background and injects it into a pre-trained latent diffusion model (LDM) via a ControlNet branch. This way, ViscoNet mitigates the style mode collapse problem and enables precise and flexible visual control. We demonstrate the effectiveness of ViscoNet on human image generation, where it can manipulate visual attributes and artistic styles with text and image prompts. We also show that ViscoNet can learn visual conditioning from small and specific object domains while preserving the generative power of the LDM backbone.
+
+
+
+
+ + ☆ Predicting Bone Degradation Using Vision Transformer and Synthetic + Cellular Microstructures Dataset + + +
+ Bone degradation, especially for astronauts in microgravity conditions, is +crucial for space exploration missions since the lower applied external forces +accelerate the diminution in bone stiffness and strength substantially. +Although existing computational models help us understand this phenomenon and +possibly restrict its effect in the future, they are time-consuming to simulate +the changes in the bones, not just the bone microstructures, of each individual +in detail. In this study, a robust yet fast computational method to predict and +visualize bone degradation has been developed. Our deep-learning method, +TransVNet, can take in different 3D voxelized images and predict their +evolution throughout months utilizing a hybrid 3D-CNN-VisionTransformer +autoencoder architecture. Because of limited available experimental data and +challenges of obtaining new samples, a digital twin dataset of diverse and +initial bone-like microstructures was generated to train our TransVNet on the +evolution of the 3D images through a previously developed degradation model for +microgravity. + +
+
+ comment: 8 pages, 5 figures +
+
+
+
+
+ + ☆ AI-SAM: Automatic and Interactive Segment Anything Model + + +
+ Semantic segmentation is a core task in computer vision. Existing methods are generally divided into two categories: automatic and interactive. Interactive approaches, exemplified by the Segment Anything Model (SAM), have shown promise as pre-trained models. However, current adaptation strategies for these models tend to lean towards either the automatic or the interactive approach. Interactive methods depend on user-provided prompts to operate, while automatic ones bypass the interactive promptability entirely. Addressing these limitations, we introduce a novel paradigm and its first model: the Automatic and Interactive Segment Anything Model (AI-SAM). In this paradigm, we conduct a comprehensive analysis of prompt quality and introduce the pioneering Automatic and Interactive Prompter (AI-Prompter), which automatically generates initial point prompts while accepting additional user inputs. Our experimental results demonstrate AI-SAM's effectiveness in the automatic setting, achieving state-of-the-art performance. Significantly, it offers the flexibility to incorporate additional user prompts, thereby further enhancing its performance. The project page is available at https://github.com/ymp5078/AI-SAM.
+
+ comment: 11 pages, 9 figures +
+
+
+
+
+ + ☆ The Automated Bias Triangle Feature Extraction Framework + + +
+ Bias triangles represent features in stability diagrams of Quantum Dot (QD) +devices, whose occurrence and property analysis are crucial indicators for spin +physics. Nevertheless, challenges associated with quality and availability of +data as well as the subtlety of physical phenomena of interest have hindered an +automatic and bespoke analysis framework, often still relying (in part) on +human labelling and verification. We introduce a feature extraction framework +for bias triangles, built from unsupervised, segmentation-based computer vision +methods, which facilitates the direct identification and quantification of +physical properties of the former. Thereby, the need for human input or large +training datasets to inform supervised learning approaches is circumvented, +while additionally enabling the automation of pixelwise shape and feature +labeling. In particular, we demonstrate that Pauli Spin Blockade (PSB) +detection can be conducted effectively, efficiently and without any training +data as a direct result of this approach. + +
+
+
+
+
+ + ☆ Fully Convolutional Slice-to-Volume Reconstruction for Single-Stack MRI + + +
+ In magnetic resonance imaging (MRI), slice-to-volume reconstruction (SVR) +refers to computational reconstruction of an unknown 3D magnetic resonance +volume from stacks of 2D slices corrupted by motion. While promising, current +SVR methods require multiple slice stacks for accurate 3D reconstruction, +leading to long scans and limiting their use in time-sensitive applications +such as fetal fMRI. Here, we propose a SVR method that overcomes the +shortcomings of previous work and produces state-of-the-art reconstructions in +the presence of extreme inter-slice motion. Inspired by the recent success of +single-view depth estimation methods, we formulate SVR as a single-stack motion +estimation task and train a fully convolutional network to predict a motion +stack for a given slice stack, producing a 3D reconstruction as a byproduct of +the predicted motion. Extensive experiments on the SVR of adult and fetal +brains demonstrate that our fully convolutional method is twice as accurate as +previous SVR methods. Our code is available at github.com/seannz/svr. + +
+
+
+
+
+ + ☆ ScAR: Scaling Adversarial Robustness for LiDAR Object Detection + + +
+ The adversarial robustness of a model is its ability to resist adversarial attacks in the form of small perturbations to input data. Universal adversarial attack methods such as the Fast Gradient Sign Method (FGSM) and Projected Gradient Descent (PGD) are popular for LiDAR object detection, but they are often deficient compared to task-specific adversarial attacks. Additionally, these universal methods typically require unrestricted access to the model's information, which is difficult to obtain in real-world applications. To address these limitations, we present a black-box Scaling Adversarial Robustness (ScAR) method for LiDAR object detection. By analyzing the statistical characteristics of 3D object detection datasets such as KITTI, Waymo, and nuScenes, we have found that the model's prediction is sensitive to the scaling of 3D instances. We propose three black-box scaling adversarial attack methods based on the available information: a model-aware attack, a distribution-aware attack, and a blind attack. We also introduce a strategy for generating scaling adversarial examples to improve the model's robustness against these three scaling adversarial attacks. Comparison with other methods on public datasets under different 3D object detection architectures demonstrates the effectiveness of our proposed method.
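The core perturbation behind a scaling attack, uniformly rescaling an instance's LiDAR points about its box center, can be sketched as below; how the model-aware, distribution-aware, and blind variants pick the scale factor is omitted, and the names are illustrative.

```python
import numpy as np

def scale_instance(points: np.ndarray, box_center: np.ndarray,
                   scale: float) -> np.ndarray:
    """Scale the (N, 3) points of one 3D instance about its box center."""
    return box_center + scale * (points - box_center)

# A blind attack might simply sweep a small set of candidate scales,
# e.g. [0.8, 0.9, 1.1, 1.2], and keep the one that degrades detection most.
```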
+
+
+
+
+ + ☆ LooseControl: Lifting ControlNet for Generalized Depth Conditioning + + +
+ We present LooseControl to allow generalized depth conditioning for +diffusion-based image generation. ControlNet, the SOTA for depth-conditioned +image generation, produces remarkable results but relies on having access to +detailed depth maps for guidance. Creating such exact depth maps, in many +scenarios, is challenging. This paper introduces a generalized version of depth +conditioning that enables many new content-creation workflows. Specifically, we +allow (C1) scene boundary control for loosely specifying scenes with only +boundary conditions, and (C2) 3D box control for specifying layout locations of +the target objects rather than the exact shape and appearance of the objects. +Using LooseControl, along with text guidance, users can create complex +environments (e.g., rooms, street views, etc.) by specifying only scene +boundaries and locations of primary objects. Further, we provide two editing +mechanisms to refine the results: (E1) 3D box editing enables the user to +refine images by changing, adding, or removing boxes while freezing the style +of the image. This yields minimal changes apart from changes induced by the +edited boxes. (E2) Attribute editing proposes possible editing directions to +change one particular aspect of the scene, such as the overall object density +or a particular object. Extensive tests and comparisons with baselines +demonstrate the generality of our method. We believe that LooseControl can +become an important design tool for easily creating complex environments and be +extended to other forms of guidance channels. Code and more information are +available at https://shariqfarooq123.github.io/loose-control/ . + +
+
+
+
+
+ + ☆ DiffusionPCR: Diffusion Models for Robust Multi-Step Point Cloud + Registration + + +
+ Point Cloud Registration (PCR) estimates the relative rigid transformation between two point clouds. We propose formulating PCR as a denoising diffusion probabilistic process, mapping noisy transformations to the ground truth. However, using diffusion models for PCR poses nontrivial challenges, such as adapting a generative model to a discriminative task and leveraging the estimated nonlinear transformation from the previous step. Instead of training a diffusion model to directly map pure noise to the ground truth, we map the predictions of an off-the-shelf PCR model to the ground truth. The predictions of off-the-shelf models are often imperfect, especially in challenging cases where the two point clouds have low overlap, and thus can be seen as noisy versions of the real rigid transformation. In addition, we transform the rotation matrix into a spherical linear space for interpolation between samples in the forward process, and convert rigid transformations into auxiliary information to implicitly exploit last-step estimations in the reverse process. As a result, conditioned on the time step, the denoising model adapts to the increasing accuracy across steps and refines registrations. Our extensive experiments showcase the effectiveness of DiffusionPCR, yielding state-of-the-art registration recall rates (95.3%/81.6%) on 3DMatch and 3DLoMatch. The code will be made public upon publication.
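The spherical-linear interpolation of rotations mentioned above can be sketched with SciPy; interpolating between the identity pose and an off-the-shelf model's prediction, with `alpha` standing in for the diffusion time, is an illustrative simplification of the forward process.

```python
import numpy as np
from scipy.spatial.transform import Rotation, Slerp

def interpolate_pose(R_pred: np.ndarray, t_pred: np.ndarray, alpha: float):
    """Slerp the rotation and linearly interpolate the translation, alpha in [0, 1]."""
    key_rotations = Rotation.from_matrix(np.stack([np.eye(3), R_pred]))
    slerp = Slerp([0.0, 1.0], key_rotations)
    R_alpha = slerp(alpha).as_matrix()
    t_alpha = alpha * t_pred
    return R_alpha, t_alpha
```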
+
+
+
+
+ + ☆ Visual Program Distillation: Distilling Tools and Programmatic Reasoning + into Vision-Language Models + + +
+ Solving complex visual tasks such as "Who invented the musical instrument on +the right?" involves a composition of skills: understanding space, recognizing +instruments, and also retrieving prior knowledge. Recent work shows promise by +decomposing such tasks using a large language model (LLM) into an executable +program that invokes specialized vision models. However, generated programs are +error-prone: they omit necessary steps, include spurious ones, and are unable +to recover when the specialized models give incorrect outputs. Moreover, they +require loading multiple models, incurring high latency and computation costs. +We propose Visual Program Distillation (VPD), an instruction tuning framework +that produces a vision-language model (VLM) capable of solving complex visual +tasks with a single forward pass. VPD distills the reasoning ability of LLMs by +using them to sample multiple candidate programs, which are then executed and +verified to identify a correct one. It translates each correct program into a +language description of the reasoning steps, which are then distilled into a +VLM. Extensive experiments show that VPD improves the VLM's ability to count, +understand spatial relations, and reason compositionally. Our VPD-trained +PaLI-X outperforms all prior VLMs, achieving state-of-the-art performance +across complex vision tasks, including MMBench, OK-VQA, A-OKVQA, TallyQA, POPE, +and Hateful Memes. An evaluation with human annotators also confirms that VPD +improves model response factuality and consistency. Finally, experiments on +content moderation demonstrate that VPD is also helpful for adaptation to +real-world applications with limited data. + +
+
+
+
+
+ + ♻ ☆ Investigation of UAV Detection in Images with Complex Backgrounds and + Rainy Artifacts + + +
+ To detect unmanned aerial vehicles (UAVs) in real-time, computer vision and +deep learning approaches are evolving research areas. Interest in this problem +has grown due to concerns regarding the possible hazards and misuse of +employing UAVs in many applications. These include potential privacy +violations. To address the concerns, vision-based object detection methods have +been developed for UAV detection. However, UAV detection in images with complex +backgrounds and weather artifacts like rain has yet to be reasonably studied. +Hence, for this purpose, we prepared two training datasets. The first dataset +has the sky as its background and is called the Sky Background Dataset (SBD). +The second training dataset has more complex scenes (with diverse backgrounds) +and is named the Complex Background Dataset (CBD). Additionally, two test sets +were prepared: one containing clear images and the other with images with three +rain artifacts, named the Rainy Test Set (RTS). This work also focuses on +benchmarking state-of-the-art object detection models, and to the best of our +knowledge, it is the first to investigate the performance of recent and popular +vision-based object detection methods for UAV detection under challenging +conditions such as complex backgrounds, varying UAV sizes, and low-to-heavy +rainy conditions. The findings presented in the paper shall help provide +insights concerning the performance of the selected models for UAV detection +under challenging conditions and pave the way to develop more robust UAV +detection methods. The codes and datasets are available at: +https://github.com/AdnanMunir294/UAVD-CBRA. + +
+
+ comment: Accepted at the Real-World Surveillance Workshop, IEEE/CVF Winter + Conference on Applications of Computer Vision 2024 +
+
+
+
+
+ + ♻ ☆ Multi-task Image Restoration Guided By Robust DINO Features + + +
+ Multi-task image restoration has gained significant interest due to its inherent versatility and efficiency compared to its single-task counterpart. Despite its potential, performance degradation is observed with an increase in the number of tasks, primarily attributed to the distinct nature of each restoration task. Addressing this challenge, we introduce DINO-IR, a novel multi-task image restoration approach leveraging robust features extracted from DINOv2. Our empirical analysis shows that while the shallow features of DINOv2 capture rich low-level image characteristics, the deep features ensure a robust semantic representation that is insensitive to degradations while preserving high-frequency contour details. Building on these features, we devise specialized components, including a multi-layer semantic fusion module, a DINO-Restore adaptation and fusion module, and a DINO perception contrastive loss, to integrate DINOv2 features into the restoration paradigm. Equipped with the aforementioned components, our DINO-IR performs favorably against existing multi-task image restoration approaches in various tasks by a large margin, indicating the superiority and necessity of reinforcing the robust features for multi-task image restoration.
+
+ comment: Some important information needs to be added
+
+
+
+
+ + ♻ ☆ VideoSwap: Customized Video Subject Swapping with Interactive Semantic + Point Correspondence + + +
+ Current diffusion-based video editing primarily focuses on structure-preserving editing by utilizing various dense correspondences to ensure temporal consistency and motion alignment. However, these approaches are often ineffective when the target edit involves a shape change. To embark on video editing with shape change, we explore customized video subject swapping in this work, where we aim to replace the main subject in a source video with a target subject having a distinct identity and a potentially different shape. In contrast to previous methods that rely on dense correspondences, we introduce the VideoSwap framework, which exploits semantic point correspondences, inspired by our observation that only a small number of semantic points are necessary to align the subject's motion trajectory and modify its shape. We also introduce various user-point interactions (e.g., removing points and dragging points) to handle various semantic point correspondences. Extensive experiments demonstrate state-of-the-art video subject swapping results across a variety of real-world videos.
+
+ comment: Project page at https://videoswap.github.io +
+
+
+
+
+ + ♻ ☆ Grounding Everything: Emerging Localization Properties in + Vision-Language Transformers + + +
+ Vision-language foundation models have shown remarkable performance in +various zero-shot settings such as image retrieval, classification, or +captioning. But so far, those models seem to fall behind when it comes to +zero-shot localization of referential expressions and objects in images. As a +result, they need to be fine-tuned for this task. In this paper, we show that +pretrained vision-language (VL) models allow for zero-shot open-vocabulary +object localization without any fine-tuning. To leverage those capabilities, we +propose a Grounding Everything Module (GEM) that generalizes the idea of +value-value attention introduced by CLIPSurgery to a self-self attention path. +We show that the concept of self-self attention corresponds to clustering, thus +enforcing groups of tokens arising from the same object to be similar while +preserving the alignment with the language space. To further guide the group +formation, we propose a set of regularizations that allows the model to finally +generalize across datasets and backbones. We evaluate the proposed GEM +framework on various benchmark tasks and datasets for semantic segmentation. It +shows that GEM not only outperforms other training-free open-vocabulary +localization methods, but also achieves state-of-the-art results on the +recently proposed OpenImagesV7 large-scale segmentation benchmark. + +
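The self-self (value-value) attention that GEM generalizes can be sketched as a single attention step in which one projection serves as query, key, and value at once; head count and shapes are illustrative, and the proposed regularizations are omitted.

```python
import torch
import torch.nn.functional as F

def value_value_attention(v: torch.Tensor, num_heads: int) -> torch.Tensor:
    """Self-self attention over tokens. v: (batch, tokens, dim)."""
    b, n, d = v.shape
    d_head = d // num_heads
    vh = v.view(b, n, num_heads, d_head).transpose(1, 2)      # (b, heads, n, d_head)
    attn = F.softmax(vh @ vh.transpose(-2, -1) / d_head ** 0.5, dim=-1)
    out = attn @ vh                                            # tokens pulled toward similar tokens
    return out.transpose(1, 2).reshape(b, n, d)
```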
+
+ comment: Code available at https://github.com/WalBouss/GEM +
+
+
+
+
+ + ♻ ☆ One-step Diffusion with Distribution Matching Distillation + + +
+ Diffusion models generate high-quality images but require dozens of forward passes. We introduce Distribution Matching Distillation (DMD), a procedure to transform a diffusion model into a one-step image generator with minimal impact on image quality. We enforce the one-step image generator to match the diffusion model at the distribution level by minimizing an approximate KL divergence whose gradient can be expressed as the difference between two score functions: one of the target distribution and the other of the synthetic distribution produced by our one-step generator. The score functions are parameterized as two diffusion models trained separately on each distribution. Combined with a simple regression loss matching the large-scale structure of the multi-step diffusion outputs, our method outperforms all published few-step diffusion approaches, reaching 2.62 FID on ImageNet 64x64 and 11.49 FID on zero-shot COCO-30k, comparable to Stable Diffusion but orders of magnitude faster. Utilizing FP16 inference, our model generates images at 20 FPS on modern hardware.
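A minimal sketch of the distribution-matching gradient described above: the gradient of the approximate KL with respect to a generated image is the difference of two scores, and a surrogate loss routes exactly that gradient back into the generator. The `score_real`/`score_fake` callables stand in for the two separately trained diffusion models; noise-level sampling and the regression loss are omitted.

```python
import torch

def distribution_matching_step(generator, score_real, score_fake, z, optimizer):
    """One generator update from the difference of two score functions."""
    x = generator(z)
    with torch.no_grad():
        grad_x = score_fake(x) - score_real(x)   # gradient of the approximate KL w.r.t. x
    optimizer.zero_grad()
    surrogate = (grad_x * x).sum()               # backward() hands grad_x to the generator
    surrogate.backward()
    optimizer.step()
```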
+
+ comment: Project page: https://tianweiy.github.io/dmd/ +
+
+
+
+
+ + ♻ ☆ Balance is Essence: Accelerating Sparse Training via Adaptive Gradient + Correction + + +
+ Despite impressive performance, deep neural networks require significant memory and computation costs, prohibiting their application in resource-constrained scenarios. Sparse training is one of the most common techniques to reduce these costs; however, the sparsity constraints add difficulty to the optimization, resulting in longer training time and instability. In this work, we aim to overcome this problem and achieve space-time co-efficiency. To accelerate and stabilize the convergence of sparse training, we analyze the gradient changes and develop an adaptive gradient correction method. Specifically, we approximate the correlation between the current and previous gradients, which is used to balance the two gradients and obtain a corrected gradient. Our method can be used with the most popular sparse training pipelines under both standard and adversarial setups. Theoretically, we prove that our method can accelerate the convergence rate of sparse training. Extensive experiments on multiple datasets, model architectures, and sparsities demonstrate that our method outperforms leading sparse training methods by up to 5.0% in accuracy given the same number of training epochs, and reduces the number of training epochs by up to 52.1% to achieve the same accuracy. Our code is available at: https://github.com/StevenBoys/AGENT.
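The gradient-correction step can be pictured as a correlation-weighted blend of the current and previous gradients; using cosine similarity as the mixing weight is an assumption made for illustration, not the paper's exact estimator.

```python
import torch
import torch.nn.functional as F

def corrected_gradient(g_curr: torch.Tensor, g_prev: torch.Tensor) -> torch.Tensor:
    """Blend the two gradients according to how strongly they agree."""
    cos = F.cosine_similarity(g_curr.flatten(), g_prev.flatten(), dim=0)
    beta = cos.clamp(min=0.0)          # mix in history only when the gradients agree
    return (1.0 - beta) * g_curr + beta * g_prev
```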
+
+
+
+
+ + ♻ ☆ Stable Segment Anything Model + + +
+ The Segment Anything Model (SAM) achieves remarkable promptable segmentation +given high-quality prompts which, however, often require good skills to +specify. To make SAM robust to casual prompts, this paper presents the first +comprehensive analysis on SAM's segmentation stability across a diverse +spectrum of prompt qualities, notably imprecise bounding boxes and insufficient +points. Our key finding reveals that given such low-quality prompts, SAM's mask +decoder tends to activate image features that are biased towards the background +or confined to specific object parts. To mitigate this issue, our key idea +consists of calibrating solely SAM's mask attention by adjusting the sampling +locations and amplitudes of image features, while the original SAM model +architecture and weights remain unchanged. Consequently, our deformable +sampling plugin (DSP) enables SAM to adaptively shift attention to the prompted +target regions in a data-driven manner, facilitated by our effective robust +training strategy (RTS). During inference, dynamic routing plugin (DRP) is +proposed that toggles SAM between the deformable and regular grid sampling +modes, conditioned on the input prompt quality. Thus, our solution, termed +Stable-SAM, offers several advantages: 1) improved SAM's segmentation stability +across a wide range of prompt qualities, while 2) retaining SAM's powerful +promptable segmentation efficiency and generality, with 3) minimal learnable +parameters (0.08 M) and fast adaptation (by 1 training epoch). Extensive +experiments across multiple datasets validate the effectiveness and advantages +of our approach, underscoring Stable-SAM as a more robust solution for +segmenting anything. Codes will be released upon acceptance. +https://github.com/fanq15/Stable-SAM + +
+
+ comment: Smaller file size for the easy access. Codes will be released upon + acceptance. https://github.com/fanq15/Stable-SAM +
+
+
+
+
+ + ♻ ☆ Revitalizing Legacy Video Content: Deinterlacing with Bidirectional + Information Propagation + + +
+ Due to old CRT display technology and limited transmission bandwidth, early +film and TV broadcasts commonly used interlaced scanning. This meant each field +contained only half of the information. Since modern displays require full +frames, this has spurred research into deinterlacing, i.e. restoring the +missing information in legacy video content. In this paper, we present a +deep-learning-based method for deinterlacing animated and live-action content. +Our proposed method supports bidirectional spatio-temporal information +propagation across multiple scales to leverage information in both space and +time. More specifically, we design a Flow-guided Refinement Block (FRB) which +performs feature refinement including alignment, fusion, and rectification. +Additionally, our method can process multiple fields simultaneously, reducing +per-frame processing time, and potentially enabling real-time processing. Our +experimental results demonstrate that our proposed method achieves superior +performance compared to existing methods. + +
+
+
+
+
+ + ♻ ☆ NeuroMixGDP: A Neural Collapse-Inspired Random Mixup for Private Data + Release + + +
+ Privacy-preserving data release algorithms have gained increasing attention +for their ability to protect user privacy while enabling downstream machine +learning tasks. However, the utility of current popular algorithms is not +always satisfactory. Mixup of raw data provides a new way of data augmentation, +which can help improve utility. However, its performance drastically +deteriorates when differential privacy (DP) noise is added. To address this +issue, this paper draws inspiration from the recently observed Neural Collapse +(NC) phenomenon, which states that the last layer features of a neural network +concentrate on the vertices of a simplex as Equiangular Tight Frame (ETF). We +propose a scheme to mixup the Neural Collapse features to exploit the ETF +simplex structure and release noisy mixed features to enhance the utility of +the released data. By using Gaussian Differential Privacy (GDP), we obtain an +asymptotic rate for the optimal mixup degree. To further enhance the utility +and address the label collapse issue when the mixup degree is large, we propose +a Hierarchical sampling method to stratify the mixup samples on a small number +of classes. This method remarkably improves utility when the number of classes +is large. Extensive experiments demonstrate the effectiveness of our proposed +method in protecting against attacks and improving utility. In particular, our +approach shows significantly improved utility compared to directly training +classification networks with DPSGD on CIFAR100 and MiniImagenet datasets, +highlighting the benefits of using privacy-preserving data release. We release +reproducible code in https://github.com/Lidonghao1996/NeuroMixGDP. + +
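A minimal sketch of releasing one noisy mixed sample: average m clipped last-layer features (and their one-hot labels), then add Gaussian noise. The unit-norm clipping and the sigma/m noise scale only illustrate how averaging reduces sensitivity under Gaussian DP; the paper's calibration and the hierarchical sampling step are omitted.

```python
import numpy as np

def release_mixed_sample(features: np.ndarray, onehot_labels: np.ndarray,
                         m: int, sigma: float, rng=None):
    """features: (N, d) last-layer features; onehot_labels: (N, C)."""
    rng = np.random.default_rng() if rng is None else rng
    idx = rng.choice(len(features), size=m, replace=False)
    f = features[idx]
    f = f / np.maximum(np.linalg.norm(f, axis=1, keepdims=True), 1.0)   # clip to unit norm
    x = f.mean(axis=0) + rng.normal(0.0, sigma / m, size=f.shape[1])
    y = onehot_labels[idx].mean(axis=0) + rng.normal(0.0, sigma / m,
                                                     size=onehot_labels.shape[1])
    return x, y
```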
+
+ comment: 28 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Alleviating Exposure Bias in Diffusion Models through Sampling with + Shifted Time Steps + + +
+ Diffusion Probabilistic Models (DPM) have shown remarkable efficacy in the +synthesis of high-quality images. However, their inference process +characteristically requires numerous iterative steps, potentially hundreds, +which could exacerbate the problem of exposure bias due to the training and +inference discrepancy. Previous work has attempted to mitigate this issue by +perturbing inputs during training, which consequently mandates the retraining +of the DPM. In this work, we conduct a systematic study of exposure bias in DPM +and, intriguingly, we find that the exposure bias could be alleviated with a +novel sampling method that we propose, without retraining the model. We +empirically and theoretically show that, during inference, for each backward +time step $t$ and corresponding state $\hat{x}_t$, there might exist another +time step $t_s$ which exhibits superior coupling with $\hat{x}_t$. Based on +this finding, we introduce a sampling method named Time-Shift Sampler. Our +framework can be seamlessly integrated into existing sampling algorithms, such as +DDPM, DDIM and other high-order solvers, incurring only minimal additional +computation. Experimental results show our method brings significant and +consistent improvements in FID scores on different datasets and sampling +methods. For example, integrating the Time-Shift Sampler into F-PNDM yields an +FID of 3.88 on CIFAR-10 with 10 sampling steps, a 44.49\% improvement over F-PNDM, +which is more performant than the vanilla DDIM with 100 +sampling steps. We will release the code upon acceptance. + +
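One way to read the superior-coupling criterion above is as a variance-matching rule: near the scheduled step, pick the time step whose expected marginal variance best matches the current state. The sketch below is a simplified, assumed reading rather than the authors' exact algorithm; `sigma_data_sq` and the search window are placeholder values.

```python
# Illustrative sketch of a variance-matching time shift for diffusion sampling.
# This is a simplified reading of the Time-Shift Sampler idea, not the authors'
# exact algorithm; sigma_data_sq and the search window are placeholder assumptions.
import numpy as np

def shifted_time_step(x_t: np.ndarray, t: int, alphas_cumprod: np.ndarray,
                      sigma_data_sq: float = 0.5, window: int = 10) -> int:
    """Pick the step t_s near t whose scheduled marginal variance
    Var[x_t] = alpha_bar_t * sigma_data^2 + (1 - alpha_bar_t)
    best matches the empirical variance of the current state x_t."""
    emp_var = float(x_t.var())
    lo, hi = max(0, t - window), min(len(alphas_cumprod) - 1, t + window)
    candidates = np.arange(lo, hi + 1)
    sched_var = alphas_cumprod[candidates] * sigma_data_sq + (1.0 - alphas_cumprod[candidates])
    return int(candidates[np.argmin(np.abs(sched_var - emp_var))])

# The chosen t_s would then replace t when conditioning the denoiser at this step.
```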
+
+ comment: added new results & proof +
+
+
+
+
+ + ♻ ☆ GaussianDreamer: Fast Generation from Text to 3D Gaussians by Bridging + 2D and 3D Diffusion Models + + +
+ In recent times, the generation of 3D assets from text prompts has shown +impressive results. Both 2D and 3D diffusion models can help generate decent 3D +objects based on prompts. 3D diffusion models have good 3D consistency, but +their quality and generalization are limited as trainable 3D data is expensive +and hard to obtain. 2D diffusion models enjoy strong abilities of +generalization and fine generation, but 3D consistency is hard to guarantee. +This paper attempts to bridge the power from the two types of diffusion models +via the recent explicit and efficient 3D Gaussian splatting representation. A +fast 3D object generation framework, named as GaussianDreamer, is proposed, +where the 3D diffusion model provides priors for initialization and the 2D +diffusion model enriches the geometry and appearance. Operations of noisy point +growing and color perturbation are introduced to enhance the initialized +Gaussians. Our GaussianDreamer can generate a high-quality 3D instance or 3D +avatar within 15 minutes on one GPU, much faster than previous methods, while +the generated instances can be directly rendered in real time. Demos and code +are available at https://taoranyi.com/gaussiandreamer/. + +
+
+ comment: Project page: https://taoranyi.com/gaussiandreamer/ +
+
+
+
+
+ + ♻ ☆ Leveraging Model Fusion for Improved License Plate Recognition + + +
+ License Plate Recognition (LPR) plays a critical role in various +applications, such as toll collection, parking management, and traffic law +enforcement. Although LPR has witnessed significant advancements through the +development of deep learning, there has been a noticeable lack of studies +exploring the potential improvements in results by fusing the outputs from +multiple recognition models. This research aims to fill this gap by +investigating the combination of up to 12 different models using +straightforward approaches, such as selecting the most confident prediction or +employing majority vote-based strategies. Our experiments encompass a wide +range of datasets, revealing substantial benefits of fusion approaches in both +intra- and cross-dataset setups. Essentially, fusing multiple models reduces +considerably the likelihood of obtaining subpar performance on a particular +dataset/scenario. We also found that combining models based on their speed is +an appealing approach. Specifically, for applications where the recognition +task can tolerate some additional time, though not excessively, an effective +strategy is to combine 4-6 models. These models may not be the most accurate +individually, but their fusion strikes an optimal balance between speed and +accuracy. + +
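The two fusion rules mentioned in the abstract are easy to state in code. The sketch below is illustrative; the input format (one plate string plus a confidence per model) is an assumption.

```python
# Minimal sketch of the two fusion rules described above: pick the most
# confident model, or take a per-plate majority vote. Inputs are illustrative.
from collections import Counter
from typing import List, Tuple

def fuse_most_confident(predictions: List[Tuple[str, float]]) -> str:
    """predictions: list of (plate_string, confidence) from different models."""
    return max(predictions, key=lambda p: p[1])[0]

def fuse_majority_vote(predictions: List[Tuple[str, float]]) -> str:
    """Majority vote over plate strings; confidence breaks ties."""
    counts = Counter(p[0] for p in predictions)
    best_count = max(counts.values())
    tied = [p for p in predictions if counts[p[0]] == best_count]
    return max(tied, key=lambda p: p[1])[0]

# Example: fuse_majority_vote([("ABC1234", 0.91), ("ABC1234", 0.72), ("A8C1234", 0.95)])
# -> "ABC1234"
```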
+
+ comment: Accepted for presentation at the Iberoamerican Congress on Pattern + Recognition (CIARP) 2023 +
+
+
+
+
+ + ♻ ☆ TR3D: Towards Real-Time Indoor 3D Object Detection + + +
+ Recently, sparse 3D convolutions have changed 3D object detection. Performing +on par with the voting-based approaches, 3D CNNs are memory-efficient and scale +to large scenes better. However, there is still room for improvement. With a +conscious, practice-oriented approach to problem-solving, we analyze the +performance of such methods and localize the weaknesses. Applying modifications +that resolve the found issues one by one, we end up with TR3D: a fast +fully-convolutional 3D object detection model trained end-to-end, that achieves +state-of-the-art results on the standard benchmarks, ScanNet v2, SUN RGB-D, and +S3DIS. Moreover, to take advantage of both point cloud and RGB inputs, we +introduce an early fusion of 2D and 3D features. We employ our fusion module to +make conventional 3D object detection methods multimodal and demonstrate an +impressive boost in performance. Our model with early feature fusion, which we +refer to as TR3D+FF, outperforms existing 3D object detection approaches on the +SUN RGB-D dataset. Overall, besides being accurate, both TR3D and TR3D+FF +models are lightweight, memory-efficient, and fast, thereby marking another +milestone on the way toward real-time 3D object detection. Code is available at +https://github.com/SamsungLabs/tr3d . + +
+
+
+
+
+ + ♻ ☆ Self-Evolving Neural Radiance Fields + + +
+ Recently, neural radiance field (NeRF) has shown remarkable performance in +novel view synthesis and 3D reconstruction. However, it still requires abundant +high-quality images, limiting its applicability in real-world scenarios. To +overcome this limitation, recent works have focused on training NeRF only with +sparse viewpoints by giving additional regularizations, often called few-shot +NeRF. We observe that due to the under-constrained nature of the task, solely +using additional regularization is not enough to prevent the model from +overfitting to sparse viewpoints. In this paper, we propose a novel framework, +dubbed Self-Evolving Neural Radiance Fields (SE-NeRF), that applies a +self-training framework to NeRF to address these problems. We formulate +few-shot NeRF into a teacher-student framework to guide the network to learn a +more robust representation of the scene by training the student with additional +pseudo labels generated from the teacher. By distilling ray-level pseudo labels +using distinct distillation schemes for reliable and unreliable rays obtained +with our novel reliability estimation method, we enable NeRF to learn a more +accurate and robust geometry of the 3D scene. We show and evaluate that +applying our self-training framework to existing models improves the quality of +the rendered images and achieves state-of-the-art performance in multiple +settings. + +
+
+ comment: 34 pages, 21 figures Our project page can be found at : + https://ku-cvlab.github.io/SE-NeRF/ +
+
+
+
+
+ + ♻ ☆ RS5M: A Large Scale Vision-Language Dataset for Remote Sensing + Vision-Language Foundation Model + + +
+ Pre-trained Vision-Language Models (VLMs) utilizing extensive image-text +paired data have demonstrated unprecedented image-text association +capabilities, achieving remarkable results across various downstream tasks. A +critical challenge is how to make use of existing large-scale pre-trained VLMs, +which are trained on common objects, to perform the domain-specific transfer +for accomplishing domain-related downstream tasks. In this paper, we propose a new framework that +includes the Domain pre-trained Vision-Language Model (DVLM), bridging the gap +between the General Vision-Language Model (GVLM) and domain-specific downstream +tasks. Moreover, we present an image-text paired dataset in the field of remote +sensing (RS), RS5M, which has 5 million RS images with English descriptions. +The dataset is obtained from filtering publicly available image-text paired +datasets and captioning label-only RS datasets with pre-trained VLM. These +constitute the first large-scale RS image-text paired dataset. Additionally, we +fine-tuned the CLIP model and tried several Parameter-Efficient Fine-Tuning +methods on RS5M to implement the DVLM. Experimental results show that our +proposed dataset is highly effective for various tasks, and our model GeoRSCLIP +improves upon the baseline or previous state-of-the-art model by $3\%\sim20\%$ +in Zero-shot Classification (ZSC), $3\%\sim6\%$ in Remote Sensing Cross-Modal +Text-Image Retrieval (RSCTIR) and $4\%\sim5\%$ in Semantic Localization (SeLo) +tasks. Dataset and models have been released at: +\url{https://github.com/om-ai-lab/RS5M}. + +
+
+ comment: RS5M dataset v5 +
+
+
+
+
+ + ♻ ☆ Revisiting Hidden Representations in Transfer Learning for Medical + Imaging + + +
+ While a key component to the success of deep learning is the availability of +massive amounts of training data, medical image datasets are often limited in +diversity and size. Transfer learning has the potential to bridge the gap +between related yet different domains. For medical applications, however, it +remains unclear whether it is more beneficial to pre-train on natural or +medical images. We aim to shed light on this problem by comparing +initialization on ImageNet and RadImageNet on seven medical classification +tasks. Our work includes a replication study, which yields results contrary to +previously published findings. In our experiments, ResNet50 models pre-trained +on ImageNet tend to outperform those trained on RadImageNet. To gain further +insights, we investigate the learned representations using Canonical +Correlation Analysis (CCA) and compare the predictions of the different models. +Our results indicate that, contrary to intuition, ImageNet and RadImageNet may +converge to distinct intermediate representations, which appear to diverge +further during fine-tuning. Despite these distinct representations, the +predictions of the models remain similar. Our findings show that the similarity +between networks before and after fine-tuning does not correlate with +performance gains, suggesting that the advantages of transfer learning might +not solely originate from the reuse of features in the early layers of a +convolutional neural network. + +
+
+ comment: Published in TMLR +
+
+
+
+
+ + ♻ ☆ TriDeNT: Triple Deep Network Training for Privileged Knowledge + Distillation in Histopathology + + +
+ Computational pathology models rarely utilise data that will not be available +for inference. This means most models cannot learn from highly informative data +such as additional immunohistochemical (IHC) stains and spatial +transcriptomics. We present TriDeNT, a novel self-supervised method for +utilising privileged data that is not available during inference to improve +performance. We demonstrate the efficacy of this method for a range of +different paired data including immunohistochemistry, spatial transcriptomics +and expert nuclei annotations. In all settings, TriDeNT outperforms other +state-of-the-art methods in downstream tasks, with observed improvements of up +to 101%. Furthermore, we provide qualitative and quantitative measurements of +the features learned by these models and how they differ from baselines. +TriDeNT offers a novel method to distil knowledge from scarce or costly data +during training, to create significantly better models for routine inputs. + +
+
+
+
+
+ + ♻ ☆ Domain-wise Invariant Learning for Panoptic Scene Graph Generation + + +
+ Panoptic Scene Graph Generation (PSG) involves the detection of objects and +the prediction of their corresponding relationships (predicates). However, the +presence of biased predicate annotations poses a significant challenge for PSG +models, as it hinders their ability to establish a clear decision boundary +among different predicates. This issue substantially impedes the practical +utility and real-world applicability of PSG models. To address the intrinsic +bias above, we propose a novel framework to infer potentially biased +annotations by measuring the predicate prediction risks within each +subject-object pair (domain), and adaptively transfer the biased annotations to +consistent ones by learning invariant predicate representation embeddings. +Experiments show that our method significantly improves the performance of +benchmark models, achieving a new state-of-the-art performance, and shows great +generalization and effectiveness on PSG dataset. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2307.15567 +
+
+
+
+
+ + ♻ ☆ Contour-based Interactive Segmentation + + +
+ Recent advances in interactive segmentation (IS) allow speeding up and +simplifying image editing and labeling greatly. The majority of modern IS +approaches accept user input in the form of clicks. However, using clicks may +require too many user interactions, especially when selecting small objects, +minor parts of an object, or a group of objects of the same type. In this +paper, we consider such a natural form of user interaction as a loose contour, +and introduce a contour-based IS method. We evaluate the proposed method on the +standard segmentation benchmarks, our novel UserContours dataset, and its +subset UserContours-G containing difficult segmentation cases. Through +experiments, we demonstrate that a single contour provides the same accuracy as +multiple clicks, thus reducing the required amount of user interactions. + +
+
+
+
+
+ + ♻ ☆ HRCTCov19 -- A High-Resolution Chest CT Scan Image Dataset for COVID-19 + Diagnosis and Differentiation + + +
+ Introduction: During the COVID-19 pandemic, computed tomography (CT) was a +popular method for diagnosing COVID-19 patients. HRCT (High-Resolution Computed +Tomography) is a form of computed tomography that uses advanced methods to +improve image resolution. Publicly accessible COVID-19 CT image datasets are +very difficult to come by due to privacy concerns, which impedes the study and +development of AI-powered COVID-19 diagnostic algorithms based on CT images. +Data description: To address this problem, we have introduced HRCTCov19, a new +COVID-19 high-resolution chest CT scan image dataset that includes not only +COVID-19 cases of Ground Glass Opacity (GGO), Crazy Paving, and Air Space +Consolidation but also CT images of COVID-19-negative cases. The HRCTCov19 +dataset, which includes slice-level and patient-level labels, has the +potential to aid COVID-19 research, especially for diagnosis and +differentiation using artificial intelligence algorithms, machine learning, and +deep learning methods. This dataset is accessible through the web at: +http://databiox.com and includes 181,106 chest HRCT images from 395 patients +with four labels: GGO, Crazy Paving, Air Space Consolidation, and Negative. +Keywords: COVID-19, CT scan, Computed Tomography, Chest Image, Dataset, Medical +Imaging + +
+
+ comment: 5 pages, 2 figures and 1 table +
+
+
+
+
+ + ♻ ☆ Rethinking Radiology Report Generation via Causal Reasoning and + Counterfactual Augmentation + + +
+ Radiology Report Generation (RRG) draws attention as an interaction between +the vision and language fields. Previous works inherited the ideology of +vision-to-language generation tasks, aiming to generate paragraphs with high +consistency as reports. However, one unique characteristic of RRG, the +independence between diseases, was neglected, leading to the injection of +disease co-occurrence as a confounder that affects the results through a backdoor +path. Unfortunately, this confounder further confuses the report generation +process because of the biased RRG data distribution. In this paper, to rethink +this issue thoroughly, we reason about its causes and effects from a novel +perspective of statistics and causality, where the Joint Vision Coupling and +the Conditional Sentence Coherence Coupling are two aspects prone to implicitly +decrease the accuracy of reports. Then, a counterfactual augmentation strategy +that contains the Counterfactual Sample Synthesis and the Counterfactual Report +Reconstruction sub-methods is proposed to break these two aspects of spurious +effects. Experimental results and further analyses on two widely used datasets +justify our reasoning and proposed methods. + +
+
+ comment: 10 pages,5 figures +
+
+
+
+
+ + ♻ ☆ Interpretable 2D Vision Models for 3D Medical Images + + +
+ Training Artificial Intelligence (AI) models on 3D images presents unique +challenges compared to the 2D case: Firstly, the demand for computational +resources is significantly higher, and secondly, the availability of large +datasets for pre-training is often limited, impeding training success. This +study proposes a simple approach of adapting 2D networks with an intermediate +feature representation for processing 3D images. Our method employs attention +pooling to learn to assign each slice an importance weight and, by that, obtain +a weighted average of all 2D slices. These weights directly quantify the +contribution of each slice to the final prediction and thus make the model +prediction inspectable. We show on all 3D MedMNIST datasets as benchmark and +two real-world datasets consisting of several hundred high-resolution CT or MRI +scans that our approach performs on par with existing methods. Furthermore, we +compare the in-built interpretability of our approach to HiResCam, a +state-of-the-art retrospective interpretability approach. + +
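Attention pooling over slice features, as described above, can be sketched in a few lines. The module below is illustrative (dimensions and names are assumptions), not the authors' implementation; the returned weights are what makes per-slice contributions inspectable.

```python
# Minimal sketch of attention pooling over 2D slice features (illustrative).
# A shared 2D backbone encodes each slice; learned attention weights produce a
# weighted average that also exposes per-slice importance.
import torch
import torch.nn as nn

class SliceAttentionPooling(nn.Module):
    def __init__(self, feat_dim: int):
        super().__init__()
        self.score = nn.Linear(feat_dim, 1)

    def forward(self, slice_feats: torch.Tensor):
        # slice_feats: (batch, num_slices, feat_dim) from a shared 2D encoder.
        weights = torch.softmax(self.score(slice_feats).squeeze(-1), dim=1)  # (B, S)
        pooled = (weights.unsqueeze(-1) * slice_feats).sum(dim=1)            # (B, D)
        return pooled, weights  # weights expose each slice's contribution

# Usage: pooled, w = SliceAttentionPooling(512)(torch.randn(2, 40, 512))
```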
+
+
+
+
+ + ♻ ☆ Spintronics for image recognition: performance benchmarking via + ultrafast data-driven simulations + + +
+ We present a demonstration of image classification using an echo-state +network (ESN) relying on a single simulated spintronic nanostructure known as +the vortex-based spin-torque oscillator (STVO) delayed in time. We employ an +ultrafast data-driven simulation framework called the data-driven Thiele +equation approach (DD-TEA) to simulate the STVO dynamics. This allows us to +avoid the challenges associated with repeated experimental manipulation of such +a nanostructured system. We showcase the versatility of our solution by +successfully applying it to solve classification challenges with the MNIST, +EMNIST-letters and Fashion MNIST datasets. Through our simulations, we +determine that within a large ESN the results obtained using the STVO dynamics +as an activation function are comparable to the ones obtained with other +conventional nonlinear activation functions like the reLU and the sigmoid. +While achieving state-of-the-art accuracy levels on the MNIST dataset, our +model's performance on EMNIST-letters and Fashion MNIST is lower due to the +relative simplicity of the system architecture and the increased complexity of +the tasks. We expect that the DD-TEA framework will enable the exploration of +deeper architectures, ultimately leading to improved classification accuracy. + +
+
+
+
+
+ + ♻ ☆ Learning Content-enhanced Mask Transformer for Domain Generalized + Urban-Scene Segmentation + + +
+ Domain-generalized urban-scene semantic segmentation (USSS) aims to learn +generalized semantic predictions across diverse urban-scene styles. Unlike +domain gap challenges, USSS is unique in that the semantic categories are often +similar in different urban scenes, while the styles can vary significantly due +to changes in urban landscapes, weather conditions, lighting, and other +factors. Existing approaches typically rely on convolutional neural networks +(CNNs) to learn the content of urban scenes. + In this paper, we propose a Content-enhanced Mask TransFormer (CMFormer) for +domain-generalized USSS. The main idea is to enhance the focus of the +fundamental component, the mask attention mechanism, in Transformer +segmentation models on content information. To achieve this, we introduce a +novel content-enhanced mask attention mechanism. It learns mask queries from +both the image feature and its down-sampled counterpart, as lower-resolution +image features usually contain more robust content information and are less +sensitive to style variations. These features are fused into a Transformer +decoder and integrated into a multi-resolution content-enhanced mask attention +learning scheme. + Extensive experiments conducted on various domain-generalized urban-scene +segmentation datasets demonstrate that the proposed CMFormer significantly +outperforms existing CNN-based methods for domain-generalized semantic +segmentation, achieving improvements of up to 14.00\% in terms of mIoU (mean +intersection over union). The source code is publicly available at +\url{https://github.com/BiQiWHU/CMFormer}. + +
+
+ comment: 18 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ LLaFS: When Large-Language Models Meet Few-Shot Segmentation + + +
+ This paper proposes LLaFS, the first attempt to leverage large language +models (LLMs) in few-shot segmentation. In contrast to the conventional +few-shot segmentation methods that only rely on the limited and biased +information from the annotated support images, LLaFS leverages the vast prior +knowledge gained by LLM as an effective supplement and directly uses the LLM to +segment images in a few-shot manner. To enable the text-based LLM to handle +image-related tasks, we carefully design an input instruction that allows the +LLM to produce segmentation results represented as polygons, and propose a +region-attribute table to simulate the human visual mechanism and provide +multi-modal guidance. We also synthesize pseudo samples and use curriculum +learning for pretraining to augment data and achieve better optimization. LLaFS +achieves state-of-the-art results on multiple datasets, showing the potential +of using LLMs for few-shot computer vision tasks. Code will be available at +https://github.com/lanyunzhu99/LLaFS. + +
+
+
+
+
+ + ♻ ☆ NeuRAD: Neural Rendering for Autonomous Driving + + +
+ Neural radiance fields (NeRFs) have gained popularity in the autonomous +driving (AD) community. Recent methods show NeRFs' potential for closed-loop +simulation, enabling testing of AD systems, and as an advanced training data +augmentation technique. However, existing methods often require long training +times, dense semantic supervision, or lack generalizability. This, in turn, +hinders the application of NeRFs for AD at scale. In this paper, we propose +NeuRAD, a robust novel view synthesis method tailored to dynamic AD data. Our +method features simple network design, extensive sensor modeling for both +camera and lidar -- including rolling shutter, beam divergence and ray dropping +-- and is applicable to multiple datasets out of the box. We verify its +performance on five popular AD datasets, achieving state-of-the-art performance +across the board. To encourage further development, we will openly release the +NeuRAD source code. See https://github.com/georghess/NeuRAD . + +
+
+
+
+
+ + ♻ ☆ Primal-Attention: Self-attention through Asymmetric Kernel SVD in Primal + Representation NeurIPS 2023 + + +
+ Recently, a new line of works has emerged to understand and improve +self-attention in Transformers by treating it as a kernel machine. However, +existing works apply the methods for symmetric kernels to the asymmetric +self-attention, resulting in a nontrivial gap between the analytical +understanding and numerical implementation. In this paper, we provide a new +perspective to represent and optimize self-attention through asymmetric Kernel +Singular Value Decomposition (KSVD), which is also motivated by the low-rank +property of self-attention normally observed in deep layers. Through asymmetric +KSVD, $i$) a primal-dual representation of self-attention is formulated, where +the optimization objective is cast to maximize the projection variances in the +attention outputs; $ii$) a novel attention mechanism, i.e., Primal-Attention, +is proposed via the primal representation of KSVD, avoiding explicit +computation of the kernel matrix in the dual; $iii$) with KKT conditions, we +prove that the stationary solution to the KSVD optimization in Primal-Attention +yields a zero-value objective. In this manner, KSVD optimization can be +implemented by simply minimizing a regularization loss, so that low-rank +property is promoted without extra decomposition. Numerical experiments show +state-of-the-art performance of our Primal-Attention with improved efficiency. +Moreover, we demonstrate that the deployed KSVD optimization regularizes +Primal-Attention with a sharper singular value decay than that of the canonical +self-attention, further verifying the great potential of our method. To the +best of our knowledge, this is the first work that provides a primal-dual +representation for the asymmetric kernel in self-attention and successfully +applies it to modeling and optimization. + +
+
+ comment: NeurIPS 2023. We provide a primal-dual representation for the + asymmetric self-attention in transformer that allows to avoid explicit + computation of the kernel matrix +
+
+
+
+
+ + ♻ ☆ Spiking Denoising Diffusion Probabilistic Models WACV 2024 + + +
+ Spiking neural networks (SNNs) have ultra-low energy consumption and high +biological plausibility due to their binary and bio-driven nature compared with +artificial neural networks (ANNs). While previous research has primarily +focused on enhancing the performance of SNNs in classification tasks, the +generative potential of SNNs remains relatively unexplored. In our paper, we +put forward Spiking Denoising Diffusion Probabilistic Models (SDDPM), a new +class of SNN-based generative models that achieve high sample quality. To fully +exploit the energy efficiency of SNNs, we propose a purely Spiking U-Net +architecture, which achieves comparable performance to its ANN counterpart +using only 4 time steps, resulting in significantly reduced energy consumption. +Extensive experimental results reveal that our approach achieves +state-of-the-art on the generative tasks and substantially outperforms other +SNN-based generative models, achieving up to 12x and 6x improvement on the +CIFAR-10 and the CelebA datasets, respectively. Moreover, we propose a +threshold-guided strategy that can further improve the performances by 2.69% in +a training-free manner. The SDDPM symbolizes a significant advancement in the +field of SNN generation, injecting new perspectives and potential avenues of +exploration. Our code is available at https://github.com/AndyCao1125/SDDPM. + +
+
+ comment: Accepted by WACV 2024 +
+
+
+
+
+ + ♻ ☆ Efficient Deep Learning Models for Privacy-preserving People Counting on + Low-resolution Infrared Arrays + + +
+ Ultra-low-resolution Infrared (IR) array sensors offer a low-cost, +energy-efficient, and privacy-preserving solution for people counting, with +applications such as occupancy monitoring. Previous work has shown that Deep +Learning (DL) can yield superior performance on this task. However, the +literature was missing an extensive comparative analysis of various efficient +DL architectures for IR array-based people counting, that considers not only +their accuracy, but also the cost of deploying them on memory- and +energy-constrained Internet of Things (IoT) edge nodes. In this work, we +address this need by comparing 6 different DL architectures on a novel dataset +composed of IR images collected from a commercial 8x8 array, which we made +openly available. With a wide architectural exploration of each model type, we +obtain a rich set of Pareto-optimal solutions, spanning cross-validated +balanced accuracy scores in the 55.70-82.70% range. When deployed on a +commercial Microcontroller (MCU) by STMicroelectronics, the STM32L4A6ZG, these +models occupy 0.41-9.28kB of memory, and require 1.10-7.74ms per inference, +while consuming 17.18-120.43 $\mu$J of energy. Our models are significantly +more accurate than a previous deterministic method (up to +39.9%), while being +up to 3.53x faster and more energy efficient. Further, our models' accuracy is +comparable to state-of-the-art DL solutions on similar resolution sensors, +despite a much lower complexity. All our models enable continuous, real-time +inference on a MCU-based IoT node, with years of autonomous operation without +battery recharging. + +
+
+ comment: This article has been accepted for publication in IEEE Internet of + Things Journal; Fixed typos +
+
+
+
+
+ + ♻ ☆ DINO-Mix: Enhancing Visual Place Recognition with Foundational Vision + Model and Feature Mixing + + +
+ Utilizing visual place recognition (VPR) technology to ascertain the +geographical location of publicly available images is a pressing issue for +real-world VPR applications. Although most current VPR methods achieve +favorable results under ideal conditions, their performance in complex +environments, characterized by lighting variations, seasonal changes, and +occlusions caused by moving objects, is generally unsatisfactory. In this +study, we utilize the DINOv2 model as the backbone network for trimming and +fine-tuning to extract robust image features. We propose a novel VPR +architecture called DINO-Mix, which combines a foundational vision model with +feature aggregation. This architecture relies on the powerful image feature +extraction capabilities of foundational vision models. We employ an +MLP-Mixer-based mix module to aggregate image features, resulting in globally +robust and generalizable descriptors that enable high-precision VPR. We +experimentally demonstrate that the proposed DINO-Mix architecture +significantly outperforms current state-of-the-art (SOTA) methods. In test sets +having lighting variations, seasonal changes, and occlusions (Tokyo24/7, +Nordland, SF-XL-Testv1), our proposed DINO-Mix architecture achieved Top-1 +accuracy rates of 91.75%, 80.18%, and 82%, respectively. Compared with SOTA +methods, our architecture exhibited an average accuracy improvement of 5.14%. + +
+
+ comment: Under review / Open source code +
+
+
+
+
+ + ♻ ☆ Multi-View Spectrogram Transformer for Respiratory Sound Classification + + +
+ Deep neural networks have been applied to audio spectrograms for respiratory +sound classification. Existing models often treat the spectrogram as a +synthetic image while overlooking its physical characteristics. In this paper, +a Multi-View Spectrogram Transformer (MVST) is proposed to embed different +views of time-frequency characteristics into the vision transformer. +Specifically, the proposed MVST splits the mel-spectrogram into different sized +patches, representing the multi-view acoustic elements of a respiratory sound. +These patches and positional embeddings are then fed into transformer encoders +to extract the attentional information among patches through a self-attention +mechanism. Finally, a gated fusion scheme is designed to automatically weigh +the multi-view features to highlight the best one in a specific scenario. +Experimental results on the ICBHI dataset demonstrate that the proposed MVST +significantly outperforms state-of-the-art methods for classifying respiratory +sounds. + +
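The gated fusion step described above can be sketched as a small module that learns to weigh the per-view features. This is an illustrative sketch, not the MVST code; the feature dimension and number of views are assumptions.

```python
# Minimal sketch of gated fusion over multi-view spectrogram features
# (illustrative; patch sizes, dimensions, and names are assumptions).
import torch
import torch.nn as nn

class GatedViewFusion(nn.Module):
    def __init__(self, feat_dim: int, num_views: int):
        super().__init__()
        self.gate = nn.Linear(feat_dim * num_views, num_views)

    def forward(self, view_feats: torch.Tensor) -> torch.Tensor:
        # view_feats: (batch, num_views, feat_dim), one vector per patch-size view.
        b, v, d = view_feats.shape
        gate = torch.softmax(self.gate(view_feats.reshape(b, v * d)), dim=-1)  # (B, V)
        return (gate.unsqueeze(-1) * view_feats).sum(dim=1)                    # (B, D)

# Usage: fused = GatedViewFusion(768, 3)(torch.randn(4, 3, 768))
```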
+
+ comment: Under review +
+
+
+
+
+ + ♻ ☆ TokenCut: Segmenting Objects in Images and Videos with Self-supervised + Transformer and Normalized Cut + + +
+ In this paper, we describe a graph-based algorithm that uses the features +obtained by a self-supervised transformer to detect and segment salient objects +in images and videos. With this approach, the image patches that compose an +image or video are organised into a fully connected graph, where the edge +between each pair of patches is labeled with a similarity score between patches +using features learned by the transformer. Detection and segmentation of +salient objects is then formulated as a graph-cut problem and solved using the +classical Normalized Cut algorithm. Despite the simplicity of this approach, it +achieves state-of-the-art results on several common image and video detection +and segmentation tasks. For unsupervised object discovery, this approach +outperforms the competing approaches by a margin of 6.1%, 5.7%, and 2.6%, +respectively, when tested with the VOC07, VOC12, and COCO20K datasets. For the +unsupervised saliency detection task in images, this method improves the score +for Intersection over Union (IoU) by 4.4%, 5.6%, and 5.2% when tested with the +ECSSD, DUTS, and DUT-OMRON datasets, respectively, compared to current +state-of-the-art techniques. This method also achieves competitive results for +unsupervised video object segmentation tasks with the DAVIS, SegTV2, and FBMS +datasets. + +
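A bare-bones version of the graph-cut step can be sketched with NumPy: build a patch-similarity graph from transformer features and bipartition it with the second eigenvector of the normalized Laplacian. The threshold, the epsilon edge weight, and the smaller-side heuristic below are assumptions for illustration, not the exact TokenCut settings.

```python
# Minimal sketch of bipartitioning self-supervised patch features with a
# normalized cut (illustrative; thresholds and similarity choice are assumptions).
import numpy as np

def token_cut(patch_feats: np.ndarray, tau: float = 0.2) -> np.ndarray:
    """patch_feats: (num_patches, dim) features, e.g. from a self-supervised ViT.
    Returns a boolean foreground mask over patches."""
    f = patch_feats / (np.linalg.norm(patch_feats, axis=1, keepdims=True) + 1e-8)
    w = f @ f.T                       # cosine-similarity graph
    w = np.where(w > tau, 1.0, 1e-5)  # binarized affinities; epsilon keeps the graph connected
    d = w.sum(axis=1)
    d_inv_sqrt = np.diag(1.0 / np.sqrt(d))
    lap = np.diag(d) - w
    # Second-smallest eigenvector of the normalized Laplacian (Fiedler vector).
    vals, vecs = np.linalg.eigh(d_inv_sqrt @ lap @ d_inv_sqrt)
    fiedler = vecs[:, 1]
    mask = fiedler > np.mean(fiedler)
    # Heuristic: treat the smaller side of the cut as the salient object.
    return mask if mask.sum() <= (~mask).sum() else ~mask
```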
+
+ comment: arXiv admin note: text overlap with arXiv:2202.11539 +
+
+
+
+
+ + ♻ ☆ T3D: Towards 3D Medical Image Understanding through Vision-Language + Pre-training + + +
+ Expert annotation of 3D medical images for downstream analysis is +resource-intensive, posing challenges in clinical applications. Visual +self-supervised learning (vSSL), though effective for learning visual +invariance, neglects the incorporation of domain knowledge from medicine. To +incorporate medical knowledge into visual representation learning, +vision-language pre-training (VLP) has shown promising results in 2D images. +However, existing VLP approaches become generally impractical when applied to +high-resolution 3D medical images due to GPU hardware constraints and the +potential loss of critical details caused by downsampling, which is the +intuitive solution to hardware constraints. To address the above limitations, +we introduce T3D, the first VLP framework designed for high-resolution 3D +medical images. T3D incorporates two text-informed pretext tasks: +(i) text-informed contrastive learning; +(ii) text-informed image restoration. These tasks focus on +learning 3D visual representations from high-resolution 3D medical images and +integrating clinical knowledge from radiology reports, without distorting +information through forced alignment of downsampled volumes with detailed +anatomical text. Trained on a newly curated large-scale dataset of 3D medical +images and radiology reports, T3D significantly outperforms current vSSL +methods in tasks like organ and tumor segmentation, as well as disease +classification. This underlines T3D's potential in representation learning for +3D medical image analysis. All data and code will be available upon acceptance. + +
+
+
+
+
+ + ♻ ☆ Hardware Resilience Properties of Text-Guided Image Classifiers NeurIPS 2023 + + +
+ This paper presents a novel method to enhance the reliability of image +classification models during deployment in the face of transient hardware +errors. By utilizing enriched text embeddings derived from GPT-3 with question +prompts per class and CLIP pretrained text encoder, we investigate their impact +as an initialization for the classification layer. Our approach achieves a +remarkable $5.5\times$ average increase in hardware reliability (and up to +$14\times$) across various architectures in the most critical layer, with +minimal accuracy drop ($0.3\%$ on average) compared to baseline PyTorch models. +Furthermore, our method seamlessly integrates with any image classification +backbone, showcases results across various network architectures, decreases +parameter and FLOPs overhead, and follows a consistent training recipe. This +research offers a practical and efficient solution to bolster the robustness of +image classification models against hardware failures, with potential +implications for future studies in this domain. Our code and models are +released at https://github.com/TalalWasim/TextGuidedResilience. + +
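The initialization idea above, seeding the classification layer with per-class text embeddings, can be sketched as follows. Obtaining the embeddings (e.g., encoding GPT-3 question prompts with a CLIP text encoder) is assumed to happen elsewhere; the normalization choice is an assumption for illustration.

```python
# Minimal sketch of initializing a classification head from per-class text
# embeddings (illustrative; not the authors' released code).
import torch
import torch.nn as nn

def init_head_from_text(num_classes: int, feat_dim: int,
                        text_embeddings: torch.Tensor) -> nn.Linear:
    """text_embeddings: (num_classes, feat_dim), one enriched embedding per class."""
    assert text_embeddings.shape == (num_classes, feat_dim)
    head = nn.Linear(feat_dim, num_classes)
    with torch.no_grad():
        # L2-normalize each class embedding and copy it into the weight matrix.
        head.weight.copy_(
            text_embeddings / text_embeddings.norm(dim=1, keepdim=True))
        head.bias.zero_()
    return head

# Usage: head = init_head_from_text(1000, 512, torch.randn(1000, 512))
```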
+
+ comment: Accepted at NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Data Upcycling Knowledge Distillation for Image Super-Resolution + + +
+ Knowledge distillation (KD) emerges as a promising yet challenging technique +for compressing deep neural networks, aiming to transfer extensive learning +representations from proficient and computationally intensive teacher models to +compact student models. However, current KD methods for super-resolution (SR) +models have limited performance and restricted applications, since the +characteristics of SR tasks are overlooked. In this paper, we put forth an +approach from the perspective of effective data utilization, namely, the Data +Upcycling Knowledge Distillation (DUKD), which facilitates the student model by +the prior knowledge the teacher provided through the upcycled in-domain data +derived from the input images. Besides, for the first time, we realize the +label consistency regularization in KD for SR models, which is implemented by +the paired invertible data augmentations. It constrains the training process of +KD and leads to better generalization capability of the student model. The +DUKD, due to its versatility, can be applied across a broad spectrum of +teacher-student architectures (e.g., CNN and Transformer models) and SR tasks, +such as single image SR, real-world SR, and SR quantization, and is in parallel +with other compression techniques. Comprehensive experiments on diverse +benchmarks demonstrate that the DUKD method significantly outperforms previous +art. + +
+
+
+
+
+ + ♻ ☆ Deep Learning in Computed Tomography Pulmonary Angiography Imaging: A + Dual-Pronged Approach for Pulmonary Embolism Detection + + +
+ The increasing reliance on Computed Tomography Pulmonary Angiography for +Pulmonary Embolism (PE) diagnosis presents challenges and a pressing need for +improved diagnostic solutions. The primary objective of this study is to +leverage deep learning techniques to enhance the Computer Assisted Diagnosis of +PE. In this study, we propose a classifier-guided detection approach that +effectively leverages the classifier's probabilistic inference to direct the +detection predictions, marking a novel contribution in the domain of automated +PE diagnosis. Our end-to-end classification framework introduces an +Attention-Guided Convolutional Neural Network (AG-CNN) that leverages local +context by utilizing an attention mechanism. This approach emulates the +attention of a human expert by looking at both global appearances and local +lesion regions before forming a conclusive decision. The classifier achieves a +notable AUROC, sensitivity, specificity and F1-score of 0.927, 0.862, 0.879 and +0.805 respectively on the FUMPE dataset with Inception-v3 backbone +architecture. Moreover, AG-CNN outperforms the baseline DenseNet-121 model, +achieving an 8.1% AUROC gain. While prior studies have primarily focused on PE +detection in main arteries, our utilization of state-of-the-art object +detection models and ensembling techniques significantly enhances detection +accuracy for small embolisms in the peripheral arteries. Finally, our proposed +classifier-guided detection approach further refines the detection metrics +contributing new state-of-the-art to the community: mAP$_{50}$, sensitivity and +F1-score of 0.846, 0.901 and 0.779 respectively outperforming the former +benchmark with a significant 3.7% improvement in mAP$_{50}$. Our research aims +to elevate PE patient care by integrating AI solutions into clinical workflows, +highlighting the potential of human-AI collaboration in medical diagnostics. + +
+
+ comment: 24 pages, 13 figures, Submitted to Expert Systems With Applications +
+
+
+
+
+ + ♻ ☆ Denoising Diffusion Bridge Models + + +
+ Diffusion models are powerful generative models that map noise to data using +stochastic processes. However, for many applications such as image editing, the +model input comes from a distribution that is not random noise. As such, +diffusion models must rely on cumbersome methods like guidance or projected +sampling to incorporate this information in the generative process. In our +work, we propose Denoising Diffusion Bridge Models (DDBMs), a natural +alternative to this paradigm based on diffusion bridges, a family of processes +that interpolate between two paired distributions given as endpoints. Our +method learns the score of the diffusion bridge from data and maps from one +endpoint distribution to the other by solving a (stochastic) differential +equation based on the learned score. Our method naturally unifies several +classes of generative models, such as score-based diffusion models and +OT-Flow-Matching, allowing us to adapt existing design and architectural +choices to our more general problem. Empirically, we apply DDBMs to challenging +image datasets in both pixel and latent space. On standard image translation +problems, DDBMs achieve significant improvement over baseline methods, and, +when we reduce the problem to image generation by setting the source +distribution to random noise, DDBMs achieve comparable FID scores to +state-of-the-art methods despite being built for a more general task. + +
+
+ comment: Github: https://github.com/alexzhou907/DDBM/ +
+
+
+
+
+ + ♻ ☆ Fairness in Medical Image Analysis and Healthcare: A Literature Survey + + +
+ Machine learning-enabled medical imaging analysis has become a vital part of +automatic diagnosis systems. However, machine learning models, especially deep +learning models, have been shown to demonstrate a systematic bias towards +certain subgroups of people. For instance, they yield preferential predictive +performance for males over females, which is unfair and potentially harmful, +especially in healthcare scenarios. In this literature survey, we give a +comprehensive review of the current progress of fairness studies in medical +image analysis (MedIA) and healthcare. Specifically, we first discuss the +definitions of fairness, the sources of unfairness and potential solutions. +Then, we discuss current research on fairness for MedIA categorized by fairness +evaluation and unfairness mitigation. Furthermore, we conduct extensive +experiments to evaluate the fairness of different medical imaging tasks. +Finally, we discuss the challenges and future directions in developing fair +MedIA and healthcare applications. + +
+
+ comment: Submitted to IEEE R-BME +
+
+
+
+
+ + ♻ ☆ Qualitative Failures of Image Generation Models and Their Application in + Detecting Deepfakes + + +
+ The ability of image and video generation models to create photorealistic +images has reached unprecedented heights, making it difficult to distinguish +between real and fake images in many cases. However, despite this progress, a +gap remains between the quality of generated images and those found in the real +world. To address this, we have reviewed a vast body of literature from both +academic publications and social media to identify qualitative shortcomings in +image generation models, which we have classified into five categories. By +understanding these failures, we can identify areas where these models need +improvement, as well as develop strategies for detecting deep fakes. The +prevalence of deep fakes in today's society is a serious concern, and our +findings can help mitigate their negative impact. + +
+
+
+
+
+ + ♻ ☆ Finding Point with Image: A Simple and Efficient Method for UAV + Self-Localization + + +
+ Image retrieval has emerged as a prominent solution for the self-localization +task of unmanned aerial vehicles (UAVs). However, this approach involves +complicated pre-processing and post-processing operations, placing significant +demands on both computational and storage resources. To mitigate this issue, +this paper presents an end-to-end positioning framework, namely Finding Point +with Image (FPI), which aims to directly identify the corresponding location of +a UAV in satellite-view images via a UAV-view image. To validate the +practicality of our framework, we construct a paired dataset, namely UL14, that +consists of UAV and satellite views. In addition, we establish two +transformer-based baseline models, Post Fusion and Mix Fusion, for end-to-end +training and inference. Through experiments, we can conclude that fusion in the +backbone network can achieve better performance than later fusion. Furthermore, +considering the singleness of paired images, Random Scale Crop (RSC) is +proposed to enrich the diversity of the paired data. Also, the ratio and weight +of positive and negative samples play a key role in model convergence. +Therefore, we conducted experimental verification and proposed a Weight Balance +Loss (WBL) to weigh the impact of positive and negative samples. Last, our +proposed baseline based on Mix Fusion structure exhibits superior performance +in time and storage efficiency, amounting to just 1/24 and 1/68, respectively, +while delivering comparable or even superior performance compared to the image +retrieval method. The dataset and code will be made publicly available. + +
+
+ comment: 15 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ Hulk: A Universal Knowledge Translator for Human-Centric Tasks + + +
+ Human-centric perception tasks, e.g., human mesh recovery, pedestrian +detection, skeleton-based action recognition, and pose estimation, have wide +industrial applications, such as metaverse and sports analysis. There is a +recent surge to develop human-centric foundation models that can benefit a +broad range of human-centric perception tasks. While many human-centric +foundation models have achieved success, most of them only excel in 2D vision +tasks or require extensive fine-tuning for practical deployment in real-world +scenarios. These limitations severely restrict their usability across various +downstream tasks and situations. To tackle these problems, we present Hulk, the +first multimodal human-centric generalist model, capable of addressing most of +the mainstream tasks simultaneously without task-specific finetuning, covering +2D vision, 3D vision, skeleton-based, and vision-language tasks. The key to +achieving this is condensing various task-specific heads into two general +heads, one for discrete representations, e.g., languages, and the other for +continuous representations, e.g., location coordinates. The outputs of two +heads can be further stacked into four distinct input and output modalities. +This uniform representation enables Hulk to treat human-centric tasks as +modality translation, integrating knowledge across a wide range of tasks. To +validate the effectiveness of our proposed method, we conduct comprehensive +experiments on 11 benchmarks across 8 human-centric tasks. Experimental results +surpass previous methods substantially, demonstrating the superiority of our +proposed method. The code will be available on +https://github.com/OpenGVLab/HumanBench. + +
+
+ comment: 24 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ ViewRefer: Grasp the Multi-view Knowledge for 3D Visual Grounding with + GPT and Prototype Guidance ICCV 2023 + + +
+ Understanding 3D scenes from multi-view inputs has been proven to alleviate +the view discrepancy issue in 3D visual grounding. However, existing methods +normally neglect the view cues embedded in the text modality and fail to weigh +the relative importance of different views. In this paper, we propose +ViewRefer, a multi-view framework for 3D visual grounding exploring how to +grasp the view knowledge from both text and 3D modalities. For the text branch, +ViewRefer leverages the diverse linguistic knowledge of large-scale language +models, e.g., GPT, to expand a single grounding text to multiple +geometry-consistent descriptions. Meanwhile, in the 3D modality, a transformer +fusion module with inter-view attention is introduced to boost the interaction +of objects across views. On top of that, we further present a set of learnable +multi-view prototypes, which memorize scene-agnostic knowledge for different +views, and enhance the framework from two perspectives: a view-guided attention +module for more robust text features, and a view-guided scoring strategy during +the final prediction. With our designed paradigm, ViewRefer achieves superior +performance on three benchmarks and surpasses the second-best by +2.8%, +1.5%, +and +1.35% on Sr3D, Nr3D, and ScanRefer. Code is released at +https://github.com/Ivan-Tang-3D/ViewRefer3D. + +
+
+ comment: Accepted by ICCV 2023. Code is released at + https://github.com/Ivan-Tang-3D/ViewRefer3D +
+
+
+
+
+ + ♻ ☆ GPT-Driver: Learning to Drive with GPT NeurIPS 2023 + + +
+ We present a simple yet effective approach that can transform the OpenAI +GPT-3.5 model into a reliable motion planner for autonomous vehicles. Motion +planning is a core challenge in autonomous driving, aiming to plan a driving +trajectory that is safe and comfortable. Existing motion planners predominantly +leverage heuristic methods to forecast driving trajectories, yet these +approaches demonstrate insufficient generalization capabilities in the face of +novel and unseen driving scenarios. In this paper, we propose a novel approach +to motion planning that capitalizes on the strong reasoning capabilities and +generalization potential inherent to Large Language Models (LLMs). The +fundamental insight of our approach is the reformulation of motion planning as +a language modeling problem, a perspective not previously explored. +Specifically, we represent the planner inputs and outputs as language tokens, +and leverage the LLM to generate driving trajectories through a language +description of coordinate positions. Furthermore, we propose a novel +prompting-reasoning-finetuning strategy to stimulate the numerical reasoning +potential of the LLM. With this strategy, the LLM can describe highly precise +trajectory coordinates and also its internal decision-making process in natural +language. We evaluate our approach on the large-scale nuScenes dataset, and +extensive experiments substantiate the effectiveness, generalization ability, +and interpretability of our GPT-based motion planner. Code is now available at +https://github.com/PointsCoder/GPT-Driver. + +
+
+ comment: NeurIPS 2023 Foundation Models for Decision Making Workshop +
+
+
+
+
+ + ♻ ☆ Language Agent Tree Search Unifies Reasoning Acting and Planning in + Language Models + + +
+ While large language models (LLMs) have demonstrated impressive performance +on a range of decision-making tasks, they rely on simple acting processes and +fall short of broad deployment as autonomous agents. We introduce LATS +(Language Agent Tree Search), a general framework that synergizes the +capabilities of LLMs in planning, acting, and reasoning. Drawing inspiration +from Monte Carlo tree search in model-based reinforcement learning, LATS +employs LLMs as agents, value functions, and optimizers, repurposing their +latent strengths for enhanced decision-making. What is crucial in this method +is the use of an environment for external feedback, which offers a more +deliberate and adaptive problem-solving mechanism that moves beyond the +limitations of existing techniques. Our experimental evaluation across diverse +domains, such as programming, HotPotQA, and WebShop, illustrates the +applicability of LATS for both reasoning and acting. In particular, LATS +achieves 94.4% for programming on HumanEval with GPT-4 and an average score of +75.9 for web browsing on WebShop with GPT-3.5, demonstrating the effectiveness +and generality of our method. + +
+
+ comment: Website and code can be found at + https://andyz245.github.io/LanguageAgentTreeSearch +
+
+
+
+
+ + ♻ ☆ Understanding Parameter Saliency via Extreme Value Theory + + +
+ Deep neural networks are being increasingly implemented throughout society in +recent years. It is useful to identify which parameters trigger +misclassification in diagnosing undesirable model behaviors. The concept of +parameter saliency is proposed and used to diagnose convolutional neural +networks (CNNs) by ranking convolution filters that may have caused +misclassification on the basis of parameter saliency. It is also shown that +fine-tuning the top ranking salient filters efficiently corrects +misidentification on ImageNet. However, there is still a knowledge gap in terms +of understanding why parameter saliency ranking can find the filters inducing +misidentification. In this work, we attempt to bridge the gap by analyzing +parameter saliency ranking from a statistical viewpoint, namely, extreme value +theory. We first show that the existing work implicitly assumes that the +gradient norm computed for each filter follows a normal distribution. Then, we +clarify the relationship between parameter saliency and the score based on the +peaks-over-threshold (POT) method, which is often used to model extreme values. +Finally, we reformulate parameter saliency in terms of the POT method, where +this reformulation is regarded as statistical anomaly detection and does not +require the implicit assumptions of the existing parameter-saliency +formulation. Our experimental results demonstrate that our reformulation can +detect malicious filters as well. Furthermore, we show that the existing +parameter saliency method exhibits a bias against the depth of layers in deep +neural networks. In particular, this bias has the potential to inhibit the +discovery of filters that cause misidentification in situations where domain +shift occurs. In contrast, parameter saliency based on POT shows less of this +bias. + +
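The peaks-over-threshold reformulation can be sketched with SciPy: fit a generalized Pareto distribution to threshold exceedances of per-filter gradient norms and score each filter by how rare its value is. The threshold quantile and the exact scoring rule below are assumptions for illustration, not the paper's formulation.

```python
# Minimal sketch of peaks-over-threshold (POT) scoring of per-filter gradient
# norms (illustrative; threshold choice and the paper's saliency definition
# are not reproduced).
import numpy as np
from scipy.stats import genpareto

def pot_saliency(grad_norms: np.ndarray, quantile: float = 0.9) -> np.ndarray:
    """grad_norms: per-filter gradient norms collected on misclassified inputs.
    Returns an anomaly score per filter (higher = more extreme = more salient)."""
    u = np.quantile(grad_norms, quantile)
    excess = grad_norms[grad_norms > u] - u
    c, loc, scale = genpareto.fit(excess, floc=0.0)
    zeta = float((grad_norms > u).mean())  # empirical exceedance rate
    # P(X > x): empirical below the threshold, GPD tail above it.
    prob = np.array([(grad_norms > x).mean() if x <= u
                     else zeta * genpareto.sf(x - u, c, loc=0.0, scale=scale)
                     for x in grad_norms])
    return -np.log(prob + 1e-12)
```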
+
+
+
+
+ + ♻ ☆ Motion Informed Needle Segmentation in Ultrasound Images + + +
+ Segmenting a moving needle in ultrasound images is challenging due to the +presence of artifacts, noise, and needle occlusion. This task becomes even more +demanding in scenarios where data availability is limited. Convolutional Neural +Networks (CNNs) have been successful in many computer vision applications, but +struggle to accurately segment needles without considering their motion. In +this paper, we present a novel approach for needle segmentation that combines +classical Kalman Filter (KF) techniques with data-driven learning, +incorporating both needle features and needle motion. Our method offers three key +contributions. First, we propose a compatible framework that seamlessly +integrates into commonly used encoder-decoder style architectures. Second, we +demonstrate superior performance compared to recent state-of-the-art needle +segmentation models using our novel convolutional neural network (CNN) based +KF-inspired block, achieving a 15\% reduction in pixel-wise needle tip error +and an 8\% reduction in length error. Third, to our knowledge, we are the first +to implement a learnable filter to incorporate non-linear needle motion for +improving needle segmentation. + +
+
+ comment: 7 pages, 4 figures, under review +
+
+
+
+
+ + ♻ ☆ Signed Binary Weight Networks + + +
+ Efficient inference of Deep Neural Networks (DNNs) is essential to making AI +ubiquitous. Two important algorithmic techniques have shown promise for +enabling efficient inference - sparsity and binarization. These techniques +translate into weight sparsity and weight repetition at the hardware-software +level enabling the deployment of DNNs with critically low power and latency +requirements. We propose a new method called signed-binary networks to improve +efficiency further (by exploiting both weight sparsity and weight repetition +together) while maintaining similar accuracy. Our method achieves accuracy +comparable to binary networks on the ImageNet and CIFAR10 datasets and can lead to 69% +sparsity. We observe real speedup when deploying these models on +general-purpose devices and show that this high percentage of unstructured +sparsity can lead to a further reduction in energy consumption on ASICs. + +
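The combination of sparsity and weight repetition can be illustrated with a toy quantizer that maps weights to {-1, 0, +1}. The magnitude-threshold rule below is a simple assumption for illustration, not the paper's training scheme.

```python
# Minimal sketch of signed-binary weight quantization (illustrative): weights
# are mapped to {-1, 0, +1}, combining binarization (repetition) with sparsity.
import numpy as np

def signed_binarize(weights: np.ndarray, sparsity: float = 0.69) -> np.ndarray:
    """Zero out the smallest-magnitude fraction of weights, binarize the rest."""
    threshold = np.quantile(np.abs(weights), sparsity)
    q = np.sign(weights)
    q[np.abs(weights) < threshold] = 0.0
    return q

# Example: signed_binarize(np.array([0.8, -0.05, 0.3, -0.9, 0.01]), sparsity=0.4)
# -> array([ 1.,  0.,  1., -1.,  0.])
```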
+
+ comment: it is being updated +
+
+
+
+
+ + ♻ ☆ A Comprehensive Study of Vision Transformers in Image Classification + Tasks + + +
+ Image Classification is a fundamental task in the field of computer vision +that frequently serves as a benchmark for gauging advancements in Computer +Vision. Over the past few years, significant progress has been made in image +classification due to the emergence of deep learning. However, challenges still +exist, such as modeling fine-grained visual information, high computation +costs, the parallelism of the model, and inconsistent evaluation protocols +across datasets. In this paper, we conduct a comprehensive survey of existing +papers on Vision Transformers for image classification. We first introduce the +popular image classification datasets that influenced the design of models. +Then, we present Vision Transformers models in chronological order, starting +with early attempts at adapting attention mechanism to vision tasks followed by +the adoption of vision transformers, as they have demonstrated success in +capturing intricate patterns and long-range dependencies within images. +Finally, we discuss open problems and shed light on opportunities for image +classification to facilitate new research ideas. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2012.06567, arXiv:1406.6247, arXiv:1906.05909 by other authors
+
+
+
+
+ + ♻ ☆ Free Lunch for Gait Recognition: A Novel Relation Descriptor + + +
+ Gait recognition seeks correct matches for query individuals based on their unique walking patterns. However, current methods focus solely on extracting individual-specific features, overlooking "interpersonal" relationships. In this paper, we propose a novel $\textbf{Relation Descriptor}$ that captures not only individual features but also relations between test gaits and pre-selected gait anchors. Specifically, we reinterpret classifier weights as gait anchors and compute similarity scores between test features and these anchors, which re-expresses individual gait features as a similarity relation distribution. In essence, the relation descriptor offers a holistic perspective that leverages the collective knowledge stored within the classifier's weights, emphasizing meaningful patterns and enhancing robustness. Despite its potential, the relation descriptor poses dimensionality challenges since its dimension depends on the training set's identity count. To address this, we propose Farthest gait-Anchor Selection to identify the most discriminative gait anchors and an Orthogonal Regularization Loss to increase diversity within the gait anchors. Compared to individual-specific features extracted from the backbone, our relation descriptor boosts performance with nearly no extra cost. We evaluate the effectiveness of our method on the popular GREW, Gait3D, OU-MVLP, CASIA-B, and CCPG datasets, showing that our method consistently outperforms the baselines and achieves state-of-the-art performance.
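As a rough sketch of the core computation, the relation descriptor can be formed by comparing a backbone feature against the classifier weights reinterpreted as anchors; the cosine normalization and softmax temperature below are illustrative assumptions, not the paper's exact formulation.

import numpy as np

def relation_descriptor(features, classifier_weights, tau=0.1):
    """Re-express gait features as a similarity distribution over gait anchors.

    features:           (N, D) backbone features for N test samples.
    classifier_weights: (K, D) classifier weights reinterpreted as K gait anchors.
    Returns an (N, K) relation descriptor.
    """
    f = features / np.linalg.norm(features, axis=1, keepdims=True)
    a = classifier_weights / np.linalg.norm(classifier_weights, axis=1, keepdims=True)
    sims = f @ a.T                                # cosine similarity to every anchor
    e = np.exp(sims / tau)                        # temperature softmax (assumption)
    return e / e.sum(axis=1, keepdims=True)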
+
+ comment: Add new figures and fix some typos +
+
+
+
+
+ + ♻ ☆ Spatial-Temporal Enhanced Transformer Towards Multi-Frame 3D Object + Detection + + +
+ The Detection Transformer (DETR) has revolutionized the design of CNN-based +object detection systems, showcasing impressive performance. However, its +potential in the domain of multi-frame 3D object detection remains largely +unexplored. In this paper, we present STEMD, a novel end-to-end framework for +multi-frame 3D object detection based on the DETR-like paradigm. STEMD treats +multi-frame 3D object detection as a sequence-to-sequence task and effectively +captures spatial-temporal dependencies at both the feature and query levels. +Specifically, to model the inter-object spatial interaction and complex +temporal dependencies, we introduce the spatial-temporal graph attention +network, which represents queries as nodes in a graph and enables effective +modeling of object interactions within a social context. To solve the problem +of missing hard cases in the proposed output of the encoder in the current +frame, we incorporate the output of the previous frame to initialize the query +input of the decoder. Moreover, to mitigate the issue of redundant detection +results, where the model generates numerous overlapping boxes from similar +queries, we consider an IoU regularization term in the loss function, which can +distinguish between queries matched with the ground-truth box and queries that +are similar but unmatched during the refinement process, leading to reduced +redundancy and more accurate detections. Through extensive experiments, we +demonstrate the effectiveness of our approach in handling challenging +scenarios, while incurring only a minor additional computational overhead. The +code is available at \url{https://github.com/Eaphan/STEMD}. + +
+
+ comment: 15 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ GSDC Transformer: An Efficient and Effective Cue Fusion for Monocular + Multi-Frame Depth Estimation + + +
+ Depth estimation provides an alternative approach for perceiving 3D information in autonomous driving. Monocular depth estimation, whether with single-frame or multi-frame inputs, has achieved significant success by learning various types of cues and specializing in either static or dynamic scenes. Recently, fusing these cues has become an attractive topic, aiming to enable the combined cues to perform well in both types of scenes. However, adaptive cue fusion relies on attention mechanisms, where the quadratic complexity limits the granularity of cue representation. Additionally, explicit cue fusion depends on precise segmentation, which imposes a heavy burden on mask prediction. To address these issues, we propose the GSDC Transformer, an efficient and effective component for cue fusion in monocular multi-frame depth estimation. We utilize deformable attention to learn cue relationships at a fine scale, while sparse attention reduces computational requirements when granularity increases. To compensate for the precision drop in dynamic scenes, we represent scene attributes in the form of super tokens without relying on precise shapes. Within each super token attributed to dynamic scenes, we gather its relevant cues and learn local dense relationships to enhance cue fusion. Our method achieves state-of-the-art performance on the KITTI dataset with efficient fusion speed.
+
+
+
+
+ + ♻ ☆ PG-VTON: A Novel Image-Based Virtual Try-On Method via Progressive + Inference Paradigm + + +
+ Virtual try-on is a promising computer vision topic with a high commercial +value wherein a new garment is visually worn on a person with a photo-realistic +effect. Previous studies conduct their shape and content inference at one +stage, employing a single-scale warping mechanism and a relatively +unsophisticated content inference mechanism. These approaches have led to +suboptimal results in terms of garment warping and skin reservation under +challenging try-on scenarios. To address these limitations, we propose a novel +virtual try-on method via progressive inference paradigm (PGVTON) that +leverages a top-down inference pipeline and a general garment try-on strategy. +Specifically, we propose a robust try-on parsing inference method by +disentangling semantic categories and introducing consistency. Exploiting the +try-on parsing as the shape guidance, we implement the garment try-on via +warping-mapping-composition. To facilitate adaptation to a wide range of try-on +scenarios, we adopt a covering more and selecting one warping strategy and +explicitly distinguish tasks based on alignment. Additionally, we regulate +StyleGAN2 to implement re-naked skin inpainting, conditioned on the target skin +shape and spatial-agnostic skin features. Experiments demonstrate that our +method has state-of-the-art performance under two challenging scenarios. The +code will be available at https://github.com/NerdFNY/PGVTON. + +
+
+
+
+
+ + ♻ ☆ FastPillars: A Deployment-friendly Pillar-based 3D Detector + + +
+ The deployment of 3D detectors poses one of the major challenges in real-world self-driving scenarios. Existing BEV-based (i.e., Bird Eye View) detectors favor sparse convolutions (known as SPConv) to speed up training and inference, which creates a hard barrier to deployment, especially for on-device applications. In this paper, to tackle the challenge of efficient 3D object detection from an industry perspective, we devise a deployment-friendly pillar-based 3D detector, termed FastPillars. First, we introduce a novel lightweight Max-and-Attention Pillar Encoding (MAPE) module specifically for enhancing small 3D objects. Second, we propose a simple yet effective principle for designing a backbone in pillar-based 3D detection. We construct FastPillars based on these designs, achieving high performance and low latency without SPConv. Extensive experiments on two large-scale datasets demonstrate the effectiveness and efficiency of FastPillars for on-device 3D detection regarding both performance and speed. Specifically, FastPillars delivers state-of-the-art accuracy on the Waymo Open Dataset with a 1.8X speedup and a 3.8 mAPH/L2 improvement over CenterPoint (SPConv-based). Our code is publicly available at: https://github.com/StiphyJay/FastPillars.
+
+
+
+
+ + ♻ ☆ Personalized Video Relighting With an At-Home Light Stage + + +
+ In this paper, we develop a personalized video relighting algorithm that +produces high-quality and temporally consistent relit videos under any pose, +expression, and lighting condition in real-time. Existing relighting algorithms +typically rely either on publicly available synthetic data, which yields poor +relighting results, or instead on light stage data which is difficult to +obtain. We show that by just capturing video of a user watching YouTube videos +on a monitor we can train a personalized algorithm capable of performing +high-quality relighting under any condition. Our key contribution is a novel +neural relighting architecture that effectively separates the intrinsic +appearance features - the geometry and reflectance of the face - from the +source lighting and then combines them with the target lighting to generate a +relit image. This neural network architecture enables smoothing of intrinsic +appearance features leading to temporally stable video relighting. Both +qualitative and quantitative evaluations show that our architecture improves +portrait image relighting quality and temporal consistency over +state-of-the-art approaches on both casually captured `Light Stage at Your +Desk' (LSYD) and light-stage-captured `One Light At a Time' (OLAT) datasets. + +
+
+
+
+
+ + ♻ ☆ Revisiting Computer-Aided Tuberculosis Diagnosis + + +
+ Tuberculosis (TB) is a major global health threat, causing millions of deaths +annually. Although early diagnosis and treatment can greatly improve the +chances of survival, it remains a major challenge, especially in developing +countries. Recently, computer-aided tuberculosis diagnosis (CTD) using deep +learning has shown promise, but progress is hindered by limited training data. +To address this, we establish a large-scale dataset, namely the Tuberculosis +X-ray (TBX11K) dataset, which contains 11,200 chest X-ray (CXR) images with +corresponding bounding box annotations for TB areas. This dataset enables the +training of sophisticated detectors for high-quality CTD. Furthermore, we +propose a strong baseline, SymFormer, for simultaneous CXR image classification +and TB infection area detection. SymFormer incorporates Symmetric Search +Attention (SymAttention) to tackle the bilateral symmetry property of CXR +images for learning discriminative features. Since CXR images may not strictly +adhere to the bilateral symmetry property, we also propose Symmetric Positional +Encoding (SPE) to facilitate SymAttention through feature recalibration. To +promote future research on CTD, we build a benchmark by introducing evaluation +metrics, evaluating baseline models reformed from existing detectors, and +running an online challenge. Experiments show that SymFormer achieves +state-of-the-art performance on the TBX11K dataset. The data, code, and models +will be released at https://github.com/yun-liu/Tuberculosis. + +
+
+ comment: Accepted by IEEE TPAMI; 17 pages +
+
+
+
+
+ + ♻ ☆ The Contemporary Art of Image Search: Iterative User Intent Expansion + via Vision-Language Model SC + + +
+ Image search is an essential and user-friendly method to explore vast galleries of digital images. However, existing image search methods heavily rely on proximity measurements like tag matching or image similarity, requiring precise user inputs for satisfactory results. To meet the growing demand for a contemporary image search engine that enables accurate comprehension of users' search intentions, we introduce an innovative user intent expansion framework. Our framework leverages visual-language models to parse and compose multi-modal user inputs to provide more accurate and satisfying results. It comprises a two-stage process: 1) a parsing stage that incorporates a language parsing module with large language models to enhance the comprehension of textual inputs, along with a visual parsing module that integrates an interactive segmentation module to swiftly identify detailed visual elements within images; and 2) a logic composition stage that combines multiple user search intents into a unified logic expression for more sophisticated operations in complex searching scenarios. Moreover, the intent expansion framework enables users to perform flexible contextualized interactions with the search results to further specify or adjust their detailed search intents iteratively. We implemented the framework into an image search system for NFT (non-fungible token) search and conducted a user study to evaluate its usability and novel properties. The results indicate that the proposed framework significantly improves users' image search experience. In particular, the parsing and contextualized interactions prove useful in allowing users to express their search intents more accurately and engage in a more enjoyable iterative search experience.
+
+ comment: Accepted by The 2024 ACM SIGCHI Conference on Computer-Supported + Cooperative Work & Social Computing (CSCW) (Proc. CSCW 2024) +
+
+
+
+
+ + ♻ ☆ CRAFT: Contextual Re-Activation of Filters for face recognition Training + + +
+ The first layer of a deep CNN backbone applies filters to an image to extract the basic features available to later layers. During training, some filters may go inactive, meaning all weights in the filter approach zero. An inactive filter in the final model represents a missed opportunity to extract a useful feature. This phenomenon is especially prevalent in specialized CNNs such as those for face recognition (as opposed to, e.g., ImageNet). For example, in one of the most widely used face recognition models (ArcFace), about half of the convolution filters in the first layer are inactive. We propose a novel approach designed and tested specifically for face recognition networks, known as "CRAFT: Contextual Re-Activation of Filters for Face Recognition Training". CRAFT identifies inactive filters during training and reinitializes them based on the context of strong filters at that stage in training. We show that CRAFT reduces the fraction of inactive filters from 44% to 32% on average and discovers filter patterns not found by standard training. Compared to standard training without reactivation, CRAFT demonstrates enhanced model accuracy on standard face-recognition benchmark datasets including AgeDB-30, CPLFW, LFW, CALFW, and CFP-FP, as well as on more challenging datasets like IJBB and IJBC.
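The detection-and-reinitialization loop can be pictured with a short sketch: find first-layer filters whose weight norms have collapsed and reseed them from the strongest filters. The norm threshold and the particular reinitialization rule (a perturbed mean of strong filters) are illustrative assumptions, not the paper's exact procedure.

import torch

@torch.no_grad()
def reactivate_inactive_filters(conv_weight, inactive_quantile=0.1, top_k=8):
    """conv_weight: (out_channels, in_channels, k, k) first-layer weights."""
    norms = conv_weight.flatten(1).norm(dim=1)
    threshold = torch.quantile(norms, inactive_quantile)
    inactive = (norms <= threshold).nonzero(as_tuple=True)[0]
    strong = norms.topk(top_k).indices
    context = conv_weight[strong].mean(dim=0)            # "context" of strong filters
    for i in inactive:                                    # reseed collapsed filters
        conv_weight[i] = context + 0.01 * torch.randn_like(context)
    return inactive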
+
+
+
+
+ + ♻ ☆ Neural Priming for Sample-Efficient Adaptation + + +
+ We propose Neural Priming, a technique for adapting large pretrained models to distribution shifts and downstream tasks given few or no labeled examples. Presented with class names or unlabeled test samples, Neural Priming enables the model to recall relevant data seen throughout pretraining and condition its parameters on it, thereby priming it for the test distribution. Neural Priming can be performed at test time, even for pretraining datasets as large as LAION-2B. Performing lightweight updates on the recalled data significantly improves accuracy across a variety of distribution shift and transfer learning benchmarks. Concretely, in the zero-shot setting, we see a 2.45% improvement in accuracy on ImageNet and a 3.81% accuracy improvement on average across standard transfer learning benchmarks. Further, using Neural Priming at inference to adapt to distribution shift, we see a 1.41% accuracy improvement on ImageNetV2. These results demonstrate the effectiveness of Neural Priming in addressing the challenge of limited labeled data and changing distributions. Code is available at github.com/RAIVNLab/neural-priming.
+
+ comment: 18 pages, 7 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ A Multi-In-Single-Out Network for Video Frame Interpolation without + Optical Flow + + +
+ In general, deep learning-based video frame interpolation (VFI) methods have +predominantly focused on estimating motion vectors between two input frames and +warping them to the target time. While this approach has shown impressive +performance for linear motion between two input frames, it exhibits limitations +when dealing with occlusions and nonlinear movements. Recently, generative +models have been applied to VFI to address these issues. However, as VFI is not +a task focused on generating plausible images, but rather on predicting +accurate intermediate frames between two given frames, performance limitations +still persist. In this paper, we propose a multi-in-single-out (MISO) based VFI +method that does not rely on motion vector estimation, allowing it to +effectively model occlusions and nonlinear motion. Additionally, we introduce a +novel motion perceptual loss that enables MISO-VFI to better capture the +spatio-temporal correlations within the video frames. Our MISO-VFI method +achieves state-of-the-art results on VFI benchmarks Vimeo90K, Middlebury, and +UCF101, with a significant performance gap compared to existing approaches. + +
+
+ comment: Discovering a problem with the manuscript +
+
+
+
+
+ + ♻ ☆ Inflating 2D Convolution Weights for Efficient Generation of 3D Medical + Images + + +
+ The generation of three-dimensional (3D) medical images has great application potential since it takes into account the 3D anatomical structure. Two problems prevent effective training of a 3D medical generative model: (1) 3D medical images are expensive to acquire and annotate, resulting in an insufficient number of training images, and (2) a large number of parameters are involved in 3D convolution. Methods: We propose a novel GAN model called 3D Split&Shuffle-GAN. To address the 3D data scarcity issue, we first pre-train a two-dimensional (2D) GAN model using abundant image slices and inflate the 2D convolution weights to improve the initialization of the 3D GAN. Novel 3D network architectures are proposed for both the generator and discriminator of the GAN model to significantly reduce the number of parameters while maintaining the quality of image generation. Several weight inflation strategies and parameter-efficient 3D architectures are investigated. Results: Experiments on both heart (Stanford AIMI Coronary Calcium) and brain (Alzheimer's Disease Neuroimaging Initiative) datasets show that our method leads to improved 3D image generation quality (a 14.7-point improvement in Fréchet inception distance) with significantly fewer parameters (only 48.5% of the baseline method). Conclusions: We built a parameter-efficient 3D medical image generation model. Due to its efficiency and effectiveness, it has the potential to generate high-quality 3D brain and heart images for real use cases.
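For reference, one common inflation strategy (in the spirit of I3D) repeats a pretrained 2D kernel along the new depth axis and rescales it; the sketch below shows that variant under the assumptions of integer padding and groups=1, and is only one of the several strategies the paper investigates.

import torch
import torch.nn as nn

def inflate_conv2d_to_3d(conv2d: nn.Conv2d, depth: int = 3) -> nn.Conv3d:
    """Build a Conv3d initialized from a pretrained Conv2d by weight inflation."""
    conv3d = nn.Conv3d(conv2d.in_channels, conv2d.out_channels,
                       kernel_size=(depth, *conv2d.kernel_size),
                       stride=(1, *conv2d.stride),
                       padding=(depth // 2, *conv2d.padding),
                       bias=conv2d.bias is not None)
    with torch.no_grad():
        # repeat the 2D kernel along depth and divide by depth so that a
        # constant-in-depth input yields the same response as the 2D layer
        w = conv2d.weight.unsqueeze(2).repeat(1, 1, depth, 1, 1) / depth
        conv3d.weight.copy_(w)
        if conv2d.bias is not None:
            conv3d.bias.copy_(conv2d.bias)
    return conv3d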
+
+ comment: Published at Computer Methods and Programs in Biomedicine (CMPB) 2023 +
+
+
+
+
+ + ♻ ☆ Convolutional layers are equivariant to discrete shifts but not + continuous translations + + +
+ The purpose of this short and simple note is to clarify a common +misconception about convolutional neural networks (CNNs). CNNs are made up of +convolutional layers which are shift equivariant due to weight sharing. +However, convolutional layers are not translation equivariant, even when +boundary effects are ignored and when pooling and subsampling are absent. This +is because shift equivariance is a discrete symmetry while translation +equivariance is a continuous symmetry. This fact is well known among +researchers in equivariant machine learning, but is usually overlooked among +non-experts. To minimize confusion, we suggest using the term `shift +equivariance' to refer to discrete shifts in pixels and `translation +equivariance' to refer to continuous translations. + +
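The distinction can be checked numerically with a toy layer: a circular convolution followed by a ReLU commutes exactly with an integer pixel shift, but not with a half-pixel translation implemented by linear interpolation. The snippet below is an illustrative sketch, not code from the note.

import numpy as np

np.random.seed(0)
img = np.random.randn(32, 32)
kernel = np.random.randn(3, 3)

def conv_relu(x):
    """A toy 'convolutional layer': circular 3x3 convolution followed by ReLU."""
    out = np.zeros_like(x)
    for i in range(3):
        for j in range(3):
            out += kernel[i, j] * np.roll(np.roll(x, i - 1, axis=0), j - 1, axis=1)
    return np.maximum(out, 0.0)

# Discrete shift by one pixel: the layer commutes with it exactly.
d_shift = lambda x: np.roll(x, 1, axis=0)
print(np.allclose(conv_relu(d_shift(img)), d_shift(conv_relu(img))))   # True

# Half-pixel translation via linear interpolation: the nonlinearity no longer
# commutes with the pixel mixing, so equivariance breaks.
c_shift = lambda x: 0.5 * (x + np.roll(x, 1, axis=0))
print(np.allclose(conv_relu(c_shift(img)), c_shift(conv_relu(img))))   # False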
+
+
+
+
+ + ♻ ☆ Data-Driven Target Localization Using Adaptive Radar Processing and + Convolutional Neural Networks + + +
+ Leveraging the advanced functionalities of modern radio frequency (RF) +modeling and simulation tools, specifically designed for adaptive radar +processing applications, this paper presents a data-driven approach to improve +accuracy in radar target localization post adaptive radar detection. To this +end, we generate a large number of radar returns by randomly placing targets of +variable strengths in a predefined area, using RFView, a high-fidelity, +site-specific, RF modeling & simulation tool. We produce heatmap tensors from +the radar returns, in range, azimuth [and Doppler], of the normalized adaptive +matched filter (NAMF) test statistic. We then train a regression convolutional +neural network (CNN) to estimate target locations from these heatmap tensors, +and we compare the target localization accuracy of this approach with that of +peak-finding and local search methods. This empirical study shows that our +regression CNN achieves a considerable improvement in target location +estimation accuracy. The regression CNN offers significant gains and reasonable +accuracy even at signal-to-clutter-plus-noise ratio (SCNR) regimes that are +close to the breakdown threshold SCNR of the NAMF. We also study the robustness +of our trained CNN to mismatches in the radar data, where the CNN is tested on +heatmap tensors collected from areas that it was not trained on. We show that +our CNN can be made robust to mismatches in the radar data through few-shot +learning, using a relatively small number of new training samples. + +
+
+
+
+
+ + ♻ ☆ Effective Backdoor Mitigation Depends on the Pre-training Objective NeurIPS 2023 + + +
+ Despite the advanced capabilities of contemporary machine learning (ML) models, they remain vulnerable to adversarial and backdoor attacks. This vulnerability is particularly concerning in real-world deployments, where compromised models may exhibit unpredictable behavior in critical scenarios. Such risks are heightened by the prevalent practice of collecting massive, internet-sourced datasets for pre-training multimodal models, as these datasets may harbor backdoors. Various techniques have been proposed to mitigate the effects of backdooring in these models, such as CleanCLIP, which is the current state-of-the-art approach. In this work, we demonstrate that the efficacy of CleanCLIP in mitigating backdoors is highly dependent on the particular objective used during model pre-training. We observe that stronger pre-training objectives correlate with backdoor behaviors that are harder to remove. We show this by training multimodal models on two large datasets consisting of 3 million (CC3M) and 6 million (CC6M) datapoints, under various pre-training objectives, followed by poison removal using CleanCLIP. We find that CleanCLIP is ineffective when stronger pre-training objectives are used, even with extensive hyperparameter tuning. Our findings underscore critical considerations for ML practitioners who pre-train models using large-scale web-curated data and are concerned about potential backdoor threats. Notably, our results suggest that simpler pre-training objectives are more amenable to effective backdoor removal. This insight is pivotal for practitioners seeking to balance the trade-offs between using stronger pre-training objectives and security against backdoor attacks.
+
+ comment: Accepted for oral presentation at BUGS workshop @ NeurIPS 2023 + (https://neurips2023-bugs.github.io/) +
+
+
+
+
+ + ♻ ☆ The Use of Multi-Scale Fiducial Markers To Aid Takeoff and Landing + Navigation by Rotorcraft + + +
+ This paper quantifies the performance of visual SLAM that leverages multi-scale fiducial markers (i.e., artificial landmarks that can be detected at a wide range of distances) to show its potential for reliable takeoff and landing navigation in rotorcraft. Prior work has shown that square markers with a black-and-white pattern of grid cells can be used to improve the performance of visual SLAM with color cameras. We extend this prior work to allow nested marker layouts. We evaluate performance during semi-autonomous takeoff and landing operations in a variety of environmental conditions by a DJI Matrice 300 RTK rotorcraft with two FLIR Blackfly color cameras, using RTK GNSS to obtain ground truth pose estimates. Performance measures include absolute trajectory error and the fraction of estimated poses relative to the total number of frames. We release all of our results -- our dataset and the implementation code of the visual SLAM with fiducial markers -- to the public as open-source.
+
+ comment: Extended abstract accepted at the 2024 AIAA SciTech +
+
+
+
+
+
+
+
+ + Information Retrieval 16 + +
+
+
+ + ☆ Rank-without-GPT: Building GPT-Independent Listwise Rerankers on + Open-Source Large Language Models + + +
+ Listwise rerankers based on large language models (LLM) are the zero-shot state-of-the-art. However, current works in this direction all depend on the GPT models, making it a single point of failure in scientific reproducibility. Moreover, it raises the concern that the current research findings only hold for GPT models but not LLM in general. In this work, we lift this pre-condition and build for the first time effective listwise rerankers without any form of dependency on GPT. Our passage retrieval experiments show that our best listwise reranker surpasses the listwise rerankers based on GPT-3.5 by 13% and achieves 97% effectiveness of the ones built on GPT-4. Our results also show that the existing training datasets, which were expressly constructed for pointwise ranking, are insufficient for building such listwise rerankers. Instead, high-quality listwise ranking data is required and crucial, calling for further work on building human-annotated listwise data resources.
+
+
+
+
+ + ☆ Let the LLMs Talk: Simulating Human-to-Human Conversational QA via + Zero-Shot LLM-to-LLM Interactions WSDM 2024 + + +
+ Conversational question-answering (CQA) systems aim to create interactive +search systems that effectively retrieve information by interacting with users. +To replicate human-to-human conversations, existing work uses human annotators +to play the roles of the questioner (student) and the answerer (teacher). +Despite its effectiveness, challenges exist as human annotation is +time-consuming, inconsistent, and not scalable. To address this issue and +investigate the applicability of large language models (LLMs) in CQA +simulation, we propose a simulation framework that employs zero-shot learner +LLMs for simulating teacher-student interactions. Our framework involves two +LLMs interacting on a specific topic, with the first LLM acting as a student, +generating questions to explore a given search topic. The second LLM plays the +role of a teacher by answering questions and is equipped with additional +information, including a text on the given topic. We implement both the student +and teacher by zero-shot prompting the GPT-4 model. To assess the effectiveness +of LLMs in simulating CQA interactions and understand the disparities between +LLM- and human-generated conversations, we evaluate the simulated data from +various perspectives. We begin by evaluating the teacher's performance through +both automatic and human assessment. Next, we evaluate the performance of the +student, analyzing and comparing the disparities between questions generated by +the LLM and those generated by humans. Furthermore, we conduct extensive +analyses to thoroughly examine the LLM performance by benchmarking +state-of-the-art reading comprehension models on both datasets. Our results +reveal that the teacher LLM generates lengthier answers that tend to be more +accurate and complete. The student LLM generates more diverse questions, +covering more aspects of a given topic. + +
+
+ comment: Accepted at WSDM 2024 +
+
+
+
+
+ + ☆ Concept Drift Adaptation in Text Stream Mining Settings: A Comprehensive + Review + + +
+ Due to the advent and increase in the popularity of the Internet, people have been producing and disseminating textual data in several ways, such as reviews, social media posts, and news articles. As a result, numerous researchers have been working on discovering patterns in textual data, especially because social media posts function as social sensors, indicating people's opinions, interests, etc. However, most tasks regarding natural language processing are addressed using traditional machine learning methods and static datasets. This setting can lead to several problems, such as an outdated dataset, which may not correspond to reality, and an outdated model, whose performance degrades over time. Concept drift is another aspect that emphasizes these issues, which corresponds to data distribution and pattern changes. In a text stream scenario, it is even more challenging due to its characteristics, such as the high speed and the sequential arrival of data. In addition, models for this type of scenario must adhere to the constraints mentioned above while learning from the stream by storing texts for a limited time and consuming low memory. In this study, we performed a systematic literature review regarding concept drift adaptation in text stream scenarios. Considering well-defined criteria, we selected 40 papers to unravel aspects such as text drift categories, types of text drift detection, model update mechanisms, the addressed stream mining tasks, types of text representations, and text representation update mechanisms. In addition, we discussed drift visualization and simulation and listed real-world datasets used in the selected papers. Therefore, this paper comprehensively reviews concept drift adaptation in text stream mining scenarios.
+
+ comment: 49 pages +
+
+
+
+
+ + ☆ RankZephyr: Effective and Robust Zero-Shot Listwise Reranking is a + Breeze! + + +
+ In information retrieval, proprietary large language models (LLMs) such as +GPT-4 and open-source counterparts such as LLaMA and Vicuna have played a vital +role in reranking. However, the gap between open-source and closed models +persists, with reliance on proprietary, non-transparent models constraining +reproducibility. Addressing this gap, we introduce RankZephyr, a +state-of-the-art, open-source LLM for listwise zero-shot reranking. RankZephyr +not only bridges the effectiveness gap with GPT-4 but in some cases surpasses +the proprietary model. Our comprehensive evaluations across several datasets +(TREC Deep Learning Tracks; NEWS and COVID from BEIR) showcase this ability. +RankZephyr benefits from strategic training choices and is resilient against +variations in initial document ordering and the number of documents reranked. +Additionally, our model outperforms GPT-4 on the NovelEval test set, comprising +queries and passages past its training period, which addresses concerns about +data contamination. To foster further research in this rapidly evolving field, +we provide all code necessary to reproduce our results at +https://github.com/castorini/rank_llm. + +
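Listwise LLM rerankers are typically applied to long candidate lists with a back-to-front sliding window so that relevant passages bubble toward the top. The sketch below illustrates that generic strategy with a hypothetical rerank_window call standing in for the LLM; it is not taken from the RankZephyr codebase.

from typing import Callable, List

def sliding_window_rerank(passages: List[str],
                          rerank_window: Callable[[List[str]], List[int]],
                          window: int = 20, step: int = 10) -> List[str]:
    """Rerank passages back-to-front with a listwise model over overlapping windows.

    rerank_window takes a list of passages and returns a permutation (indices
    ordered from most to least relevant) of that window.
    """
    order = list(range(len(passages)))
    start = max(len(passages) - window, 0)
    while True:
        idx = order[start:start + window]
        perm = rerank_window([passages[i] for i in idx])
        order[start:start + window] = [idx[p] for p in perm]
        if start == 0:
            break
        start = max(start - step, 0)
    return [passages[i] for i in order]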
+
+
+
+
+ + ☆ An empirical study of next-basket recommendations + + +
+ Next Basket Recommender Systems (NBRs) function to recommend the subsequent +shopping baskets for users through the modeling of their preferences derived +from purchase history, typically manifested as a sequence of historical +baskets. Given their widespread applicability in the E-commerce industry, +investigations into NBRs have garnered increased attention in recent years. +Despite the proliferation of diverse NBR methodologies, a substantial challenge +lies in the absence of a systematic and unified evaluation framework across +these methodologies. Various studies frequently appraise NBR approaches using +disparate datasets and diverse experimental settings, impeding a fair and +effective comparative assessment of methodological performance. To bridge this +gap, this study undertakes a systematic empirical inquiry into NBRs, reviewing +seminal works within the domain and scrutinizing their respective merits and +drawbacks. Subsequently, we implement designated NBR algorithms on uniform +datasets, employing consistent experimental configurations, and assess their +performances via identical metrics. This methodological rigor establishes a +cohesive framework for the impartial evaluation of diverse NBR approaches. It +is anticipated that this study will furnish a robust foundation and serve as a +pivotal reference for forthcoming research endeavors in this dynamic field. + +
+
+ comment: 6 pages, 2 figures +
+
+
+
+
+ + ☆ A Multi-Granularity-Aware Aspect Learning Model for Multi-Aspect Dense + Retrieval WSDM2024 + + +
+ Dense retrieval methods have been mostly focused on unstructured text and +less attention has been drawn to structured data with various aspects, e.g., +products with aspects such as category and brand. Recent work has proposed two +approaches to incorporate the aspect information into item representations for +effective retrieval by predicting the values associated with the item aspects. +Despite their efficacy, they treat the values as isolated classes (e.g., "Smart +Homes", "Home, Garden & Tools", and "Beauty & Health") and ignore their +fine-grained semantic relation. Furthermore, they either enforce the learning +of aspects into the CLS token, which could confuse it from its designated use +for representing the entire content semantics, or learn extra aspect embeddings +only with the value prediction objective, which could be insufficient +especially when there are no annotated values for an item aspect. Aware of +these limitations, we propose a MUlti-granulaRity-aware Aspect Learning model +(MURAL) for multi-aspect dense retrieval. It leverages aspect information +across various granularities to capture both coarse and fine-grained semantic +relations between values. Moreover, MURAL incorporates separate aspect +embeddings as input to transformer encoders so that the masked language model +objective can assist implicit aspect learning even without aspect-value +annotations. Extensive experiments on two real-world datasets of products and +mini-programs show that MURAL outperforms state-of-the-art baselines +significantly. + +
+
+ comment: Accepted by WSDM2024 +
+
+
+
+
+ + ☆ DRAFT: Dense Retrieval Augmented Few-shot Topic classifier Framework + + +
+ With the growing volume of diverse information, the demand for classifying arbitrary topics has become increasingly critical. To address this challenge, we introduce DRAFT, a simple framework designed to train a classifier for few-shot topic classification. DRAFT uses a few examples of a specific topic as queries to construct a customized dataset with a dense retriever model. A multi-query retrieval (MQR) algorithm, which effectively handles multiple queries related to a specific topic, is applied to construct the customized dataset. Subsequently, we fine-tune a classifier using the customized dataset to identify the topic. To demonstrate the efficacy of our proposed approach, we conduct evaluations on both widely used classification benchmark datasets and manually constructed datasets with 291 diverse topics, which simulate the diverse content encountered in real-world applications. DRAFT shows competitive or superior performance compared to baselines that use in-context learning, such as GPT-3 175B and InstructGPT 175B, on few-shot topic classification tasks despite having 177 times fewer parameters, demonstrating its effectiveness.
+
+
+
+
+ + ☆ LLaRA: Aligning Large Language Models with Sequential Recommenders + + +
+ Sequential recommendation aims to predict the subsequent items matching user preference based on her/his historical interactions. With the development of Large Language Models (LLMs), there is growing interest in exploring the potential of LLMs for sequential recommendation by framing it as a language modeling task. Prior works represent items in the textual prompts using either ID indexing or text indexing and feed the prompts into LLMs, but fall short of either encapsulating comprehensive world knowledge or exhibiting sufficient sequential understanding. To harness the complementary strengths of traditional recommenders (which encode user behavioral knowledge) and LLMs (which possess world knowledge about items), we propose LLaRA -- a Large Language and Recommendation Assistant framework. Specifically, LLaRA represents items in the LLM's input prompts using a novel hybrid approach that integrates ID-based item embeddings from traditional recommenders with textual item features. Viewing the ``sequential behavior of the user'' as a new modality in recommendation, we employ an adapter to bridge the modality gap between the ID embeddings of the traditional recommenders and the input space of LLMs. Furthermore, instead of directly exposing the hybrid prompt to LLMs, we apply a curriculum learning approach to gradually ramp up training complexity. We first warm up the LLM with text-only prompting, which aligns more naturally with the LLM's language modeling capabilities. Thereafter, we progressively transition to hybrid prompting, training the adapter to incorporate behavioral knowledge from the traditional sequential recommender into the LLM. Extensive experiments demonstrate the efficacy of the LLaRA framework. Our code and data are available at https://github.com/ljy0ustc/LLaRA.
+
+ comment: 12 pages, 7 figures +
+
+
+
+
+ + ☆ E4SRec: An Elegant Effective Efficient Extensible Solution of Large + Language Models for Sequential Recommendation + + +
+ The recent advancements in Large Language Models (LLMs) have sparked interest +in harnessing their potential within recommender systems. Since LLMs are +designed for natural language tasks, existing recommendation approaches have +predominantly transformed recommendation tasks into open-domain natural +language generation tasks. However, this approach necessitates items to possess +rich semantic information, often generates out-of-range results, and suffers +from notably low efficiency and limited extensibility. Furthermore, practical +ID-based recommendation strategies, reliant on a huge number of unique +identities (IDs) to represent users and items, have gained prominence in +real-world recommender systems due to their effectiveness and efficiency. +Nevertheless, the incapacity of LLMs to model IDs presents a formidable +challenge when seeking to leverage LLMs for personalized recommendations. In +this paper, we introduce an Elegant Effective Efficient Extensible solution for +large language models for Sequential Recommendation (E4SRec), which seamlessly +integrates LLMs with traditional recommender systems that exclusively utilize +IDs to represent items. Specifically, E4SRec takes ID sequences as inputs, +ensuring that the generated outputs fall within the candidate lists. +Furthermore, E4SRec possesses the capability to generate the entire ranking +list in a single forward process, and demands only a minimal set of pluggable +parameters, which are trained for each dataset while keeping the entire LLM +frozen. We substantiate the effectiveness, efficiency, and extensibility of our +proposed E4SRec through comprehensive experiments conducted on four widely-used +real-world datasets. The implementation code is accessible at +https://github.com/HestiaSky/E4SRec/. + +
+
+
+
+
+ + ☆ PEFA: Parameter-Free Adapters for Large-scale Embedding-based Retrieval + Models WSDM 2024 + + +
+ Embedding-based Retrieval Models (ERMs) have emerged as a promising framework for large-scale text retrieval problems due to powerful large language models. Nevertheless, fine-tuning ERMs to reach state-of-the-art results can be expensive due to the extreme scale of data as well as the complexity of multi-stage pipelines (e.g., pre-training, fine-tuning, distillation). In this work, we propose the PEFA framework, namely ParamEter-Free Adapters, for fast tuning of ERMs without any backward pass in the optimization. At the index building stage, PEFA equips the ERM with a non-parametric k-nearest neighbor (kNN) component. At the inference stage, PEFA performs a convex combination of two scoring functions, one from the ERM and the other from the kNN. Based on the neighborhood definition, the PEFA framework induces two realizations, namely PEFA-XL (i.e., extra large) using double ANN indices and PEFA-XS (i.e., extra small) using a single ANN index. Empirically, PEFA achieves significant improvement on two retrieval applications. For document retrieval, regarding the Recall@100 metric, PEFA improves not only pre-trained ERMs on Trivia-QA by an average of 13.2%, but also fine-tuned ERMs on NQ-320K by an average of 5.5%, respectively. For product search, PEFA improves the Recall@100 of the fine-tuned ERMs by an average of 5.3% and 14.5%, for PEFA-XS and PEFA-XL, respectively. Our code is available at https://github.com/amzn/pecos/tree/mainline/examples/pefa-wsdm24
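A minimal sketch of the inference-time combination described above: the final score is a convex mix of the ERM's dot-product score and a kNN score voted by the nearest indexed training embeddings. The interfaces, the voting rule, and the mixing weight are illustrative assumptions rather than PEFA's exact realization.

import numpy as np

def pefa_style_scores(query_emb, doc_embs, index_embs, index_doc_ids,
                      n_docs, k=8, lam=0.5):
    """Convex combination of an ERM score and a non-parametric kNN score.

    query_emb:     (D,)   query embedding from the ERM.
    doc_embs:      (N, D) document embeddings from the ERM.
    index_embs:    (M, D) indexed training embeddings (e.g., of relevant queries).
    index_doc_ids: (M,)   document id associated with each indexed embedding.
    """
    erm_score = doc_embs @ query_emb                  # parametric ERM score, (N,)

    sims = index_embs @ query_emb                     # similarity to the kNN index
    knn_score = np.zeros(n_docs)
    for i in np.argsort(-sims)[:k]:                   # k nearest neighbors vote
        knn_score[index_doc_ids[i]] += sims[i]        # for their associated documents

    return lam * erm_score + (1.0 - lam) * knn_score  # convex combination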
+
+ comment: Accept by WSDM 2024 +
+
+
+
+
+ + ☆ FreestyleRet: Retrieving Images from Style-Diversified Queries + + +
+ Image Retrieval aims to retrieve corresponding images based on a given query. In application scenarios, users intend to express their retrieval intent through various query styles. However, current retrieval tasks predominantly focus on text-query retrieval exploration, leading to limited retrieval query options and potential ambiguity or bias in user intention. In this paper, we propose the Style-Diversified Query-Based Image Retrieval task, which enables retrieval based on various query styles. To facilitate the novel setting, we propose the first Diverse-Style Retrieval dataset, encompassing diverse query styles including text, sketch, low-resolution, and art. We also propose a lightweight style-diversified retrieval framework. For various query style inputs, we apply the Gram Matrix to extract the query's textural features and cluster them into a style space with style-specific bases. Then we employ the style-init prompt tuning module to enable the visual encoder to comprehend the texture and style information of the query. Experiments demonstrate that our model, employing the style-init prompt tuning strategy, outperforms existing retrieval models on the style-diversified retrieval task. Moreover, style-diversified queries (sketch+text, art+text, etc.) can be retrieved simultaneously in our model. The auxiliary information from other queries enhances the retrieval performance within the respective query.
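The Gram-matrix texture feature mentioned above can be sketched as channel-by-channel correlations of an intermediate CNN feature map; the backbone and layer choice below are illustrative assumptions, not the paper's configuration.

import torch
import torchvision.models as models

# early VGG features as the texture extractor (an assumption for illustration)
backbone = models.vgg16(weights=models.VGG16_Weights.DEFAULT).features[:9].eval()

@torch.no_grad()
def gram_texture_feature(image: torch.Tensor) -> torch.Tensor:
    """image: (3, H, W) tensor -> flattened Gram matrix describing its texture/style."""
    fmap = backbone(image.unsqueeze(0))        # (1, C, h, w)
    _, c, h, w = fmap.shape
    f = fmap.view(c, h * w)
    gram = f @ f.t() / (c * h * w)             # channel-by-channel correlations
    return gram.flatten()                      # style descriptor, ready for clustering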
+
+ comment: 16 pages, 7 figures +
+
+
+
+
+ + ☆ Combining Counting Processes and Classification Improves a Stopping Rule + for Technology Assisted Review EMNLP 2023 + + +
+ Technology Assisted Review (TAR) stopping rules aim to reduce the cost of +manually assessing documents for relevance by minimising the number of +documents that need to be examined to ensure a desired level of recall. This +paper extends an effective stopping rule using information derived from a text +classifier that can be trained without the need for any additional annotation. +Experiments on multiple data sets (CLEF e-Health, TREC Total Recall, TREC Legal +and RCV1) showed that the proposed approach consistently improves performance +and outperforms several alternative methods. + +
+
+ comment: Accepted at EMNLP 2023 Findings +
+
+
+
+
+ + ☆ Adaptive spectral graph wavelets for collaborative filtering + + +
+ Collaborative filtering is a popular approach in recommender systems, whose +objective is to provide personalized item suggestions to potential users based +on their purchase or browsing history. However, personalized recommendations +require considerable amount of behavioral data on users, which is usually +unavailable for new users, giving rise to the cold-start problem. To help +alleviate this challenging problem, we introduce a spectral graph wavelet +collaborative filtering framework for implicit feedback data, where users, +items and their interactions are represented as a bipartite graph. +Specifically, we first propose an adaptive transfer function by leveraging a +power transform with the goal of stabilizing the variance of graph frequencies +in the spectral domain. Then, we design a deep recommendation model for +efficient learning of low-dimensional embeddings of users and items using +spectral graph wavelets in an end-to-end fashion. In addition to capturing the +graph's local and global structures, our approach yields localization of graph +signals in both spatial and spectral domains, and hence not only learns +discriminative representations of users and items, but also promotes the +recommendation quality. The effectiveness of our proposed model is demonstrated +through extensive experiments on real-world benchmark datasets, achieving +better recommendation performance compared with strong baseline methods. + +
+
+
+
+
+ + ♻ ☆ HeteFedRec: Federated Recommender Systems with Model Heterogeneity + + +
+ Owing to the nature of privacy protection, federated recommender systems +(FedRecs) have garnered increasing interest in the realm of on-device +recommender systems. However, most existing FedRecs only allow participating +clients to collaboratively train a recommendation model of the same public +parameter size. Training a model of the same size for all clients can lead to +suboptimal performance since clients possess varying resources. For example, +clients with limited training data may prefer to train a smaller recommendation +model to avoid excessive data consumption, while clients with sufficient data +would benefit from a larger model to achieve higher recommendation accuracy. To +address the above challenge, this paper introduces HeteFedRec, a novel FedRec +framework that enables the assignment of personalized model sizes to +participants. In HeteFedRec, we present a heterogeneous recommendation model +aggregation strategy, including a unified dual-task learning mechanism and a +dimensional decorrelation regularization, to allow knowledge aggregation among +recommender models of different sizes. Additionally, a relation-based ensemble +knowledge distillation method is proposed to effectively distil knowledge from +heterogeneous item embeddings. Extensive experiments conducted on three +real-world recommendation datasets demonstrate the effectiveness and efficiency +of HeteFedRec in training federated recommender systems under heterogeneous +settings. + +
+
+
+
+
+ + ♻ ☆ The Contemporary Art of Image Search: Iterative User Intent Expansion + via Vision-Language Model SC + + +
+ Image search is an essential and user-friendly method to explore vast galleries of digital images. However, existing image search methods heavily rely on proximity measurements like tag matching or image similarity, requiring precise user inputs for satisfactory results. To meet the growing demand for a contemporary image search engine that enables accurate comprehension of users' search intentions, we introduce an innovative user intent expansion framework. Our framework leverages visual-language models to parse and compose multi-modal user inputs to provide more accurate and satisfying results. It comprises a two-stage process: 1) a parsing stage that incorporates a language parsing module with large language models to enhance the comprehension of textual inputs, along with a visual parsing module that integrates an interactive segmentation module to swiftly identify detailed visual elements within images; and 2) a logic composition stage that combines multiple user search intents into a unified logic expression for more sophisticated operations in complex searching scenarios. Moreover, the intent expansion framework enables users to perform flexible contextualized interactions with the search results to further specify or adjust their detailed search intents iteratively. We implemented the framework into an image search system for NFT (non-fungible token) search and conducted a user study to evaluate its usability and novel properties. The results indicate that the proposed framework significantly improves users' image search experience. In particular, the parsing and contextualized interactions prove useful in allowing users to express their search intents more accurately and engage in a more enjoyable iterative search experience.
+
+ comment: Accepted by The 2024 ACM SIGCHI Conference on Computer-Supported + Cooperative Work & Social Computing (CSCW) (Proc. CSCW 2024) +
+
+
+
+
+ + ♻ ☆ Machine Reading Comprehension using Case-based Reasoning + + +
+ We present an accurate and interpretable method for answer extraction in +machine reading comprehension that is reminiscent of case-based reasoning (CBR) +from classical AI. Our method (CBR-MRC) builds upon the hypothesis that +contextualized answers to similar questions share semantic similarities with +each other. Given a test question, CBR-MRC first retrieves a set of similar +cases from a nonparametric memory and then predicts an answer by selecting the +span in the test context that is most similar to the contextualized +representations of answers in the retrieved cases. The semi-parametric nature +of our approach allows it to attribute a prediction to the specific set of +evidence cases, making it a desirable choice for building reliable and +debuggable QA systems. We show that CBR-MRC provides high accuracy comparable +with large reader models and outperforms baselines by 11.5 and 8.4 EM on +NaturalQuestions and NewsQA, respectively. Further, we demonstrate the ability +of CBR-MRC in identifying not just the correct answer tokens but also the span +with the most relevant supporting evidence. Lastly, we observe that contexts +for certain question types show higher lexical diversity than others and find +that CBR-MRC is robust to these variations while performance using +fully-parametric methods drops. + +
+
+ comment: 9 pages, 2 figures +
+
+
+
+
+
+
+
+ + Machine Learning 154 + +
+
+
+ + ☆ Dexterous Functional Grasping + + +
+ While there have been significant strides in dexterous manipulation, most of it is limited to benchmark tasks like in-hand reorientation, which are of limited utility in the real world. The main benefit of dexterous hands over two-fingered ones is their ability to pick up tools and other objects (including thin ones) and grasp them firmly to apply force. However, this task requires both a complex understanding of functional affordances as well as precise low-level control. While prior work obtains affordances from human data, this approach doesn't scale to low-level control. Similarly, simulation training cannot give the robot an understanding of real-world semantics. In this paper, we aim to combine the best of both worlds to accomplish functional grasping for in-the-wild objects. We use a modular approach. First, affordances are obtained by matching corresponding regions of different objects, and then a low-level policy trained in sim is run to grasp them. We propose a novel application of eigengrasps to reduce the search space of RL using a small amount of human data and find that it leads to more stable and physically realistic motion. We find that the eigengrasp action space beats baselines in simulation, outperforms hardcoded grasping in real, and matches or outperforms a trained human teleoperator. Result visualizations and videos at https://dexfunc.github.io/
+
+ comment: In CoRL 2023. Website at https://dexfunc.github.io/ +
+
+
+
+
+ + ☆ Describing Differences in Image Sets with Natural Language + + +
+ How do two sets of images differ? Discerning set-level differences is crucial +for understanding model behaviors and analyzing datasets, yet manually sifting +through thousands of images is impractical. To aid in this discovery process, +we explore the task of automatically describing the differences between two +$\textbf{sets}$ of images, which we term Set Difference Captioning. This task +takes in image sets $D_A$ and $D_B$, and outputs a description that is more +often true on $D_A$ than $D_B$. We outline a two-stage approach that first +proposes candidate difference descriptions from image sets and then re-ranks +the candidates by checking how well they can differentiate the two sets. We +introduce VisDiff, which first captions the images and prompts a language model +to propose candidate descriptions, then re-ranks these descriptions using CLIP. +To evaluate VisDiff, we collect VisDiffBench, a dataset with 187 paired image +sets with ground truth difference descriptions. We apply VisDiff to various +domains, such as comparing datasets (e.g., ImageNet vs. ImageNetV2), comparing +classification models (e.g., zero-shot CLIP vs. supervised ResNet), summarizing +model failure modes (supervised ResNet), characterizing differences between +generative models (e.g., StableDiffusionV1 and V2), and discovering what makes +images memorable. Using VisDiff, we are able to find interesting and previously +unknown differences in datasets and models, demonstrating its utility in +revealing nuanced insights. + +
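The re-ranking stage described above can be sketched as scoring each candidate description by how much more strongly it matches set A than set B under CLIP; the open_clip model choice and scoring details are illustrative assumptions, not the VisDiff implementation.

import torch
import open_clip

model, _, preprocess = open_clip.create_model_and_transforms("ViT-B-32", pretrained="openai")
tokenizer = open_clip.get_tokenizer("ViT-B-32")

@torch.no_grad()
def rank_difference_descriptions(candidates, images_a, images_b):
    """Sort candidates by (mean CLIP score on set A) - (mean CLIP score on set B)."""
    text = model.encode_text(tokenizer(candidates))
    text = text / text.norm(dim=-1, keepdim=True)

    def image_features(images):
        feats = model.encode_image(torch.stack([preprocess(im) for im in images]))
        return feats / feats.norm(dim=-1, keepdim=True)

    gap = (image_features(images_a) @ text.T).mean(0) - (image_features(images_b) @ text.T).mean(0)
    order = torch.argsort(gap, descending=True)
    return [candidates[int(i)] for i in order]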
+
+
+
+
+ + ☆ Detecting algorithmic bias in medical AI-models + + +
+ With the growing prevalence of machine learning and artificial +intelligence-based medical decision support systems, it is equally important to +ensure that these systems provide patient outcomes in a fair and equitable +fashion. This paper presents an innovative framework for detecting areas of +algorithmic bias in medical-AI decision support systems. Our approach +efficiently identifies potential biases in medical-AI models, specifically in +the context of sepsis prediction, by employing the Classification and +Regression Trees (CART) algorithm. We verify our methodology by conducting a +series of synthetic data experiments, showcasing its ability to estimate areas +of bias in controlled settings precisely. The effectiveness of the concept is +further validated by experiments using electronic medical records from Grady +Memorial Hospital in Atlanta, Georgia. These tests demonstrate the practical +implementation of our strategy in a clinical environment, where it can function +as a vital instrument for guaranteeing fairness and equity in AI-based medical +decisions. + +
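The core of the framework can be sketched with scikit-learn: fit a shallow CART on per-patient prediction errors so that high-error leaves describe interpretable subgroups where the medical-AI model underperforms. The feature set, depth, and leaf-size settings below are illustrative assumptions, not the paper's exact configuration.

import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, export_text

def find_bias_regions(features: pd.DataFrame, y_true, y_pred,
                      max_depth=3, min_samples_leaf=50) -> str:
    """Fit a CART on the model's per-patient errors and return its rules as text.

    Leaves with a high error rate correspond to regions of the feature space
    (e.g., particular demographic or clinical subgroups) where the decision
    support model is likely to be biased.
    """
    errors = (np.asarray(y_true) != np.asarray(y_pred)).astype(int)
    tree = DecisionTreeClassifier(max_depth=max_depth,
                                  min_samples_leaf=min_samples_leaf)
    tree.fit(features, errors)
    return export_text(tree, feature_names=list(features.columns))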
+
+ comment: 26 pages, 9 figures +
+
+
+
+
+ + ☆ Classification for everyone : Building geography agnostic models for + fairer recognition + + +
+ In this paper, we analyze different methods to mitigate inherent geographical +biases present in state of the art image classification models. We first +quantitatively present this bias in two datasets - The Dollar Street Dataset +and ImageNet, using images with location information. We then present different +methods which can be employed to reduce this bias. Finally, we analyze the +effectiveness of the different techniques on making these models more robust to +geographical locations of the images. + +
+
+
+
+
+ + ☆ Choroidalyzer: An open-source, end-to-end pipeline for choroidal + analysis in optical coherence tomography + + +
+ Purpose: To develop Choroidalyzer, an open-source, end-to-end pipeline for +segmenting the choroid region, vessels, and fovea, and deriving choroidal +thickness, area, and vascular index. + Methods: We used 5,600 OCT B-scans (233 subjects, 6 systemic disease cohorts, +3 device types, 2 manufacturers). To generate region and vessel ground-truths, +we used state-of-the-art automatic methods following manual correction of +inaccurate segmentations, with foveal positions manually annotated. We trained +a U-Net deep-learning model to detect the region, vessels, and fovea to +calculate choroid thickness, area, and vascular index in a fovea-centred region +of interest. We analysed segmentation agreement (AUC, Dice) and choroid metrics +agreement (Pearson, Spearman, mean absolute error (MAE)) in internal and +external test sets. We compared Choroidalyzer to two manual graders on a small +subset of external test images and examined cases of high error. + Results: Choroidalyzer took 0.299 seconds per image on a standard laptop and +achieved excellent region (Dice: internal 0.9789, external 0.9749), very good +vessel segmentation performance (Dice: internal 0.8817, external 0.8703) and +excellent fovea location prediction (MAE: internal 3.9 pixels, external 3.4 +pixels). For thickness, area, and vascular index, Pearson correlations were +0.9754, 0.9815, and 0.8285 (internal) / 0.9831, 0.9779, 0.7948 (external), +respectively (all p<0.0001). Choroidalyzer's agreement with graders was +comparable to the inter-grader agreement across all metrics. + Conclusions: Choroidalyzer is an open-source, end-to-end pipeline that +accurately segments the choroid and reliably extracts thickness, area, and +vascular index. Especially choroidal vessel segmentation is a difficult and +subjective task, and fully-automatic methods like Choroidalyzer could provide +objectivity and standardisation. + +
+
+
+
+
+ + ☆ Fast CT anatomic localization algorithm + + +
+ Automatically determining the position of every slice in a CT scan is a basic
+yet powerful capability allowing fast retrieval of regions of interest for
+visual inspection and automated analysis. Unlike conventional localization
+approaches which work at the slice level, we directly localize only a fraction
+of the slices and then fit a linear model which maps slice index to its
+estimated axial anatomical position based on those slices. The model is then
+used to assign an axial position to every slice of the scan. This approach
+proves to be both computationally efficient, with a typical processing time of
+less than a second per scan (regardless of its size), accurate, with a typical
+median localization error of 1 cm, and robust to different noise sources,
+imaging protocols, metal-induced artifacts, anatomical deformations, etc.
+Another key element of our approach is the introduction of a mapping confidence
+score. This score acts as a fail-safe mechanism which allows a rejection of
+unreliable localization results in rare cases of anomalous scans. Our algorithm
+sets new state-of-the-art results in terms of localization accuracy. It also
+offers a decrease of two orders of magnitude in processing time with respect to
+all published processing times. It was designed to be invariant to various scan
+resolutions, scan protocols, patient orientations, strong artifacts and various
+deformations and abnormalities. Additionally, our algorithm is, to the best of
+our knowledge, the first to support the entire body from head to feet rather
+than being confined to a specific anatomical region. This algorithm was tested
+on thousands of scans and proves to be very reliable and useful as a
+preprocessing stage for many applications.
+
+
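+ The core fitting step lends itself to a very small sketch: localize a handful
+of slices, fit a line from slice index to axial position, and extrapolate to
+the whole scan. The code below is a simplified illustration under those
+assumptions; `localize_slice` is a hypothetical stand-in for the paper's
+slice-level localizer, and the confidence score shown is only a crude proxy.
+```python
+import numpy as np
+
+def localize_scan(num_slices, localize_slice, num_samples=10):
+    # Localize only a small, evenly spaced subset of slices...
+    idx = np.linspace(0, num_slices - 1, num_samples).astype(int)
+    z = np.array([localize_slice(int(i)) for i in idx])   # axial positions (e.g. cm)
+    # ...then fit a linear model z = a * slice_index + b and apply it everywhere.
+    a, b = np.polyfit(idx, z, deg=1)
+    residuals = z - (a * idx + b)
+    confidence = 1.0 / (1.0 + residuals.std())             # crude mapping-confidence score
+    return a * np.arange(num_slices) + b, confidence
+```
+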
+
+
+
+
+ + ☆ MIND: Multi-Task Incremental Network Distillation + + +
+ The recent surge in pervasive devices generating dynamic data streams has
+underscored the necessity for learning systems to adapt to data distributional
+shifts continually. To tackle this challenge, the research community has put
+forth a spectrum of methodologies, including the demanding pursuit of
+class-incremental learning without replay data. In this study, we present MIND,
+a parameter isolation method that aims to significantly enhance the performance
+of replay-free solutions and achieve state-of-the-art results on several widely
+studied datasets. Our approach introduces two main contributions: two
+alternative distillation procedures that significantly improve the efficiency
+of MIND by increasing the accumulated knowledge of each sub-network, and the
+optimization of the BatchNorm layers across tasks inside the sub-networks.
+Overall, MIND outperforms all the state-of-the-art methods for rehearsal-free
+Class-Incremental learning (with an increment in classification accuracy of
+approx. +6% on CIFAR-100/10 and +10% on TinyImageNet/10) reaching up to approx.
++40% accuracy in Domain-Incremental scenarios. Moreover, we ablated each
+contribution to demonstrate its impact on performance improvement. Our results
+showcase the superior performance of MIND, indicating its potential for
+addressing the challenges posed by Class-incremental and Domain-Incremental
+learning in resource-constrained environments.
+
+
+
+
+
+
+ + ☆ Unsupervised Video Domain Adaptation with Masked Pre-Training and + Collaborative Self-Training + + +
+ In this work, we tackle the problem of unsupervised domain adaptation (UDA) +for video action recognition. Our approach, which we call UNITE, uses an image +teacher model to adapt a video student model to the target domain. UNITE first +employs self-supervised pre-training to promote discriminative feature learning +on target domain videos using a teacher-guided masked distillation objective. +We then perform self-training on masked target data, using the video student +model and image teacher model together to generate improved pseudolabels for +unlabeled target videos. Our self-training process successfully leverages the +strengths of both models to achieve strong transfer performance across domains. +We evaluate our approach on multiple video domain adaptation benchmarks and +observe significant improvements upon previously reported results. + +
+
+
+
+
+ + ☆ Concept Drift Adaptation in Text Stream Mining Settings: A Comprehensive + Review + + +
+ Due to the advent and increase in the popularity of the Internet, people have
+been producing and disseminating textual data in several ways, such as reviews,
+social media posts, and news articles. As a result, numerous researchers have
+been working on discovering patterns in textual data, especially because social
+media posts function as social sensors, indicating people's opinions,
+interests, etc. However, most tasks regarding natural language processing are
+addressed using traditional machine learning methods and static datasets. This
+setting can lead to several problems, such as an outdated dataset, which may
+not correspond to reality, and an outdated model, which has its performance
+degrading over time. Concept drift, which corresponds to changes in data
+distributions and patterns, is another aspect that emphasizes these issues. In
+a text stream scenario, it is even more challenging due to its characteristics,
+such as its high speed and the sequential arrival of data. In addition, models
+for this type of scenario must adhere to the constraints mentioned above while
+learning from the stream by storing texts for a limited time and consuming low
+memory. In this study, we performed a systematic literature review regarding
+concept drift adaptation in text stream scenarios. Considering well-defined
+criteria, we selected 40 papers to unravel aspects such as text drift
+categories, types of text drift detection, model update mechanism, the
+addressed stream mining tasks, types of text representations, and text
+representation update mechanism. In addition, we discussed drift visualization
+and simulation and listed real-world datasets used in the selected papers.
+Therefore, this paper comprehensively reviews concept drift adaptation in text
+stream mining scenarios.
+
+
+
+ comment: 49 pages +
+
+
+
+
+ + ☆ Toward autocorrection of chemical process flowsheets using large + language models + + +
+ The process engineering domain widely uses Process Flow Diagrams (PFDs) and
+Process and Instrumentation Diagrams (P&IDs) to represent process flows and
+equipment configurations. However, the P&IDs and PFDs, hereafter called
+flowsheets, can contain errors causing safety hazards, inefficient operation,
+and unnecessary expenses. Correcting and verifying flowsheets is a tedious,
+manual process. We propose a novel generative AI methodology for automatically
+identifying errors in flowsheets and suggesting corrections to the user, i.e.,
+autocorrecting flowsheets. Inspired by the breakthrough of Large Language
+Models (LLMs) for grammatical autocorrection of human language, we investigate
+LLMs for the autocorrection of flowsheets. The input to the model is a
+potentially erroneous flowsheet and the output of the model is a set of
+suggestions for a corrected flowsheet. We train our autocorrection model on a
+synthetic dataset in a supervised manner. The model achieves a top-1 accuracy
+of 80% and a top-5 accuracy of 84% on an independent test dataset of
+synthetically generated flowsheets. The results suggest that the model can
+learn to autocorrect the synthetic flowsheets. We envision that flowsheet
+autocorrection will become a useful tool for chemical engineers.
+
+
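+ The reported top-1 and top-5 scores follow the usual top-k accuracy
+convention, sketched below for clarity (illustrative code, not the authors';
+`suggestions` is a hypothetical list of ranked candidate flowsheets per test
+sample and `targets` the corresponding correct flowsheets).
+```python
+def top_k_accuracy(suggestions, targets, k=5):
+    # A sample counts as correct if the target appears among the top-k suggestions.
+    hits = sum(target in ranked[:k] for ranked, target in zip(suggestions, targets))
+    return hits / len(targets)
+```
+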
+
+
+
+
+ + ☆ Experimental Insights Towards Explainable and Interpretable Pedestrian + Crossing Prediction + + +
+ In the context of autonomous driving, pedestrian crossing prediction is a key +component for improving road safety. Presently, the focus of these predictions +extends beyond achieving trustworthy results; it is shifting towards the +explainability and interpretability of these predictions. This research +introduces a novel neuro-symbolic approach that combines deep learning and +fuzzy logic for an explainable and interpretable pedestrian crossing +prediction. We have developed an explainable predictor (ExPedCross), which +utilizes a set of explainable features and employs a fuzzy inference system to +predict whether the pedestrian will cross or not. Our approach was evaluated on +both the PIE and JAAD datasets. The results offer experimental insights into +achieving explainability and interpretability in the pedestrian crossing +prediction task. Furthermore, the testing results yield a set of guidelines and +recommendations regarding the process of dataset selection, feature selection, +and explainability. + +
+
+
+
+
+ + ☆ Attention-enhanced neural differential equations for physics-informed + deep learning of ion transport NeurIPS + + +
+ Species transport models typically combine partial differential equations +(PDEs) with relations from hindered transport theory to quantify +electromigrative, convective, and diffusive transport through complex +nanoporous systems; however, these formulations are frequently substantial +simplifications of the governing dynamics, leading to the poor generalization +performance of PDE-based models. Given the growing interest in deep learning +methods for the physical sciences, we develop a machine learning-based approach +to characterize ion transport across nanoporous membranes. Our proposed +framework centers around attention-enhanced neural differential equations that +incorporate electroneutrality-based inductive biases to improve generalization +performance relative to conventional PDE-based methods. In addition, we study +the role of the attention mechanism in illuminating physically-meaningful +ion-pairing relationships across diverse mixture compositions. Further, we +investigate the importance of pre-training on simulated data from PDE-based +models, as well as the performance benefits from hard vs. soft inductive +biases. Our results indicate that physics-informed deep learning solutions can +outperform their classical PDE-based counterparts and provide promising avenues +for modelling complex transport phenomena across diverse applications. + +
+
+ comment: 8 pages, 2 figures. Accepted in the NeurIPS Machine Learning and the + Physical Sciences Workshop +
+
+
+
+
+ + ☆ Semi-Supervised Health Index Monitoring with Feature Generation and + Fusion + + +
+ The Health Index (HI) is crucial for evaluating system health, aiding tasks
+like anomaly detection and predicting remaining useful life for systems
+demanding high safety and reliability. Tight monitoring is crucial for
+achieving high precision at a lower cost, with applications such as spray
+coating. Obtaining HI labels in real-world applications is often
+cost-prohibitive, requiring continuous, precise health measurements. Therefore,
+it is more convenient to leverage run-to-failure datasets that may provide
+potential indications of machine wear condition, making it necessary to apply
+semi-supervised tools for HI construction. In this study, we adapt the Deep
+Semi-supervised Anomaly Detection (DeepSAD) method for HI construction. We use
+the DeepSAD embedding as condition indicators to address interpretability
+challenges and sensitivity to system-specific factors. Then, we introduce a
+diversity loss to enrich condition indicators. We employ an alternating
+projection algorithm with isotonic constraints to transform the DeepSAD
+embedding into a normalized HI with an increasing trend. Validation on the PHME
+2010 milling dataset, a recognized benchmark with ground-truth HIs,
+demonstrates meaningful HI estimation. Our methodology is then applied to
+monitor wear states of thermal spray coatings using high-frequency voltage. Our
+contributions create opportunities for more accessible and reliable HI
+estimation, particularly in cases where obtaining ground-truth HI labels is
+infeasible.
+
+
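+ The monotonisation step can be pictured with a much simpler stand-in than the
+alternating projection algorithm used in the paper: fit an isotonic
+(increasing) curve to the raw condition-indicator scores over time and rescale
+it to [0, 1]. The sketch below only illustrates that isotonic constraint and is
+not the authors' procedure.
+```python
+import numpy as np
+from sklearn.isotonic import IsotonicRegression
+
+def to_health_index(time, raw_scores):
+    # Enforce an increasing trend over time, then normalise to [0, 1].
+    iso = IsotonicRegression(increasing=True)
+    hi = iso.fit_transform(np.asarray(time), np.asarray(raw_scores))
+    return (hi - hi.min()) / (hi.max() - hi.min() + 1e-12)
+```
+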
+
+ comment: 13 pages, 8 figures +
+
+
+
+
+ + ☆ Lessons from Usable ML Deployments and Application to Wind Turbine + Monitoring NeurIPS 2023 + + +
+ Through past experiences deploying what we call usable ML (one step beyond +explainable ML, including both explanations and other augmenting information) +to real-world domains, we have learned three key lessons. First, many +organizations are beginning to hire people who we call ``bridges'' because they +bridge the gap between ML developers and domain experts, and these people fill +a valuable role in developing usable ML applications. Second, a configurable +system that enables easily iterating on usable ML interfaces during +collaborations with bridges is key. Finally, there is a need for continuous, +in-deployment evaluations to quantify the real-world impact of usable ML. +Throughout this paper, we apply these lessons to the task of wind turbine +monitoring, an essential task in the renewable energy domain. Turbine engineers +and data analysts must decide whether to perform costly in-person +investigations on turbines to prevent potential cases of brakepad failure, and +well-tuned usable ML interfaces can aid with this decision-making process. +Through the applications of our lessons to this task, we hope to demonstrate +the potential real-world impact of usable ML in the renewable energy domain. + +
+
+ comment: Presented in XAI in Action: Past, Present, and Future Applications @ + NeurIPS 2023. 8 pages, 3 figures +
+
+
+
+
+ + ☆ Towards Causal Representations of Climate Model Data + + +
+ Climate models, such as Earth system models (ESMs), are crucial for +simulating future climate change based on projected Shared Socioeconomic +Pathways (SSP) greenhouse gas emissions scenarios. While ESMs are sophisticated +and invaluable, machine learning-based emulators trained on existing simulation +data can project additional climate scenarios much faster and are +computationally efficient. However, they often lack generalizability and +interpretability. This work delves into the potential of causal representation +learning, specifically the \emph{Causal Discovery with Single-parent Decoding} +(CDSD) method, which could render climate model emulation efficient +\textit{and} interpretable. We evaluate CDSD on multiple climate datasets, +focusing on emissions, temperature, and precipitation. Our findings shed light +on the challenges, limitations, and promise of using CDSD as a stepping stone +towards more interpretable and robust climate model emulation. + +
+
+
+
+
+ + ☆ Exploring Error Bits for Memory Failure Prediction: An In-Depth + Correlative Study + + +
+ In large-scale datacenters, memory failure is a common cause of server +crashes, with uncorrectable errors (UEs) being a major indicator of Dual Inline +Memory Module (DIMM) defects. Existing approaches primarily focus on predicting +UEs using correctable errors (CEs), without fully considering the information +provided by error bits. However, error bit patterns have a strong correlation +with the occurrence of uncorrectable errors (UEs). In this paper, we present a +comprehensive study on the correlation between CEs and UEs, specifically +emphasizing the importance of spatio-temporal error bit information. Our +analysis reveals a strong correlation between spatio-temporal error bits and UE +occurrence. Through evaluations using real-world datasets, we demonstrate that +our approach significantly improves prediction performance by 15% in F1-score +compared to the state-of-the-art algorithms. Overall, our approach effectively +reduces the number of virtual machine interruptions caused by UEs by +approximately 59%. + +
+
+ comment: Published at ICCAD 2023 +
+
+
+
+
+ + ☆ Expert-guided Bayesian Optimisation for Human-in-the-loop Experimental + Design of Known Systems NeurIPS 2023 + + +
+ Domain experts often possess valuable physical insights that are overlooked +in fully automated decision-making processes such as Bayesian optimisation. In +this article we apply high-throughput (batch) Bayesian optimisation alongside +anthropological decision theory to enable domain experts to influence the +selection of optimal experiments. Our methodology exploits the hypothesis that +humans are better at making discrete choices than continuous ones and enables +experts to influence critical early decisions. At each iteration we solve an +augmented multi-objective optimisation problem across a number of alternate +solutions, maximising both the sum of their utility function values and the +determinant of their covariance matrix, equivalent to their total variability. +By taking the solution at the knee point of the Pareto front, we return a set +of alternate solutions at each iteration that have both high utility values and +are reasonably distinct, from which the expert selects one for evaluation. We +demonstrate that even in the case of an uninformed practitioner, our algorithm +recovers the regret of standard Bayesian optimisation. + +
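+ The knee-point selection mentioned above admits a compact illustration: among
+the Pareto-optimal candidates scored by (summed utility, covariance
+determinant), pick the point furthest from the line joining the two extreme
+solutions. The sketch below is a generic version of that heuristic with
+illustrative variable names, not the authors' code.
+```python
+import numpy as np
+
+def knee_point(front):
+    """front: (n, 2) array of (summed utility, covariance determinant) values."""
+    f = (front - front.min(axis=0)) / (np.ptp(front, axis=0) + 1e-12)  # normalise
+    p1 = f[f[:, 0].argmax()]          # best point on the utility objective
+    p2 = f[f[:, 1].argmax()]          # best point on the variability objective
+    d = p2 - p1
+    # Perpendicular distance of every point to the line joining the two extremes.
+    dist = np.abs(d[0] * (f[:, 1] - p1[1]) - d[1] * (f[:, 0] - p1[0]))
+    dist /= np.linalg.norm(d) + 1e-12
+    return int(dist.argmax())         # index of the knee-point solution
+```
+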
+
+ comment: NeurIPS 2023 Workshop on Adaptive Experimental Design and Active + Learning in the Real World. Main text: 6 pages +
+
+
+
+
+ + ☆ A Kernel-Based Neural Network Test for High-dimensional Sequencing Data + Analysis + + +
+ The recent development of artificial intelligence (AI) technology, especially +the advance of deep neural network (DNN) technology, has revolutionized many +fields. While DNN plays a central role in modern AI technology, it has been +rarely used in sequencing data analysis due to challenges brought by +high-dimensional sequencing data (e.g., overfitting). Moreover, due to the +complexity of neural networks and their unknown limiting distributions, +building association tests on neural networks for genetic association analysis +remains a great challenge. To address these challenges and fill the important +gap of using AI in high-dimensional sequencing data analysis, we introduce a +new kernel-based neural network (KNN) test for complex association analysis of +sequencing data. The test is built on our previously developed KNN framework, +which uses random effects to model the overall effects of high-dimensional +genetic data and adopts kernel-based neural network structures to model complex +genotype-phenotype relationships. Based on KNN, a Wald-type test is then +introduced to evaluate the joint association of high-dimensional genetic data +with a disease phenotype of interest, considering non-linear and non-additive +effects (e.g., interaction effects). Through simulations, we demonstrated that +our proposed method attained higher power compared to the sequence kernel +association test (SKAT), especially in the presence of non-linear and +interaction effects. Finally, we apply the methods to the whole genome +sequencing (WGS) dataset from the Alzheimer's Disease Neuroimaging Initiative +(ADNI) study, investigating new genes associated with the hippocampal volume +change over time. + +
+
+ comment: 31 pages, 5 figures and 3 tables
+
+
+
+
+
+ + ☆ Algorithms for mean-field variational inference via polyhedral + optimization in the Wasserstein space + + +
+ We develop a theory of finite-dimensional polyhedral subsets over the +Wasserstein space and optimization of functionals over them via first-order +methods. Our main application is to the problem of mean-field variational +inference, which seeks to approximate a distribution $\pi$ over $\mathbb{R}^d$ +by a product measure $\pi^\star$. When $\pi$ is strongly log-concave and +log-smooth, we provide (1) approximation rates certifying that $\pi^\star$ is +close to the minimizer $\pi^\star_\diamond$ of the KL divergence over a +\emph{polyhedral} set $\mathcal{P}_\diamond$, and (2) an algorithm for +minimizing $\text{KL}(\cdot\|\pi)$ over $\mathcal{P}_\diamond$ with accelerated +complexity $O(\sqrt \kappa \log(\kappa d/\varepsilon^2))$, where $\kappa$ is +the condition number of $\pi$. + +
+
+ comment: 40 pages +
+
+
+
+
+ + ☆ Are Vision Transformers More Data Hungry Than Newborn Visual Systems? NeurIPS 2023 + + +
+ Vision transformers (ViTs) are top-performing models on many computer vision
+benchmarks and can accurately predict human behavior on object recognition
+tasks. However, researchers question the value of using ViTs as models of
+biological learning because ViTs are thought to be more data hungry than
+brains, with ViTs requiring more training data to reach similar levels of
+performance. To test this assumption, we directly compared the learning
+abilities of ViTs and animals, by performing parallel controlled rearing
+experiments on ViTs and newborn chicks. We first raised chicks in impoverished
+visual environments containing a single object, then simulated the training
+data available in those environments by building virtual animal chambers in a
+video game engine. We recorded the first-person images acquired by agents
+moving through the virtual chambers and used those images to train
+self-supervised ViTs that leverage time as a teaching signal, akin to
+biological visual systems. When ViTs were trained through the eyes of newborn
+chicks, the ViTs solved the same view-invariant object recognition tasks as the
+chicks. Thus, ViTs were not more data hungry than newborn visual systems: both
+learned view-invariant object representations in impoverished visual
+environments. The flexible and generic attention-based learning mechanism in
+ViTs combined with the embodied data streams available to newborn animals
+appears sufficient to drive the development of animal-like object recognition.
+
+
+
+ comment: Accepted in Thirty-seventh Conference on Neural Information + Processing Systems (NeurIPS 2023) +
+
+
+
+
+ + ☆ MIMONets: Multiple-Input-Multiple-Output Neural Networks Exploiting + Computation in Superposition NeurIPS 2023 + + +
+ With the advent of deep learning, progressively larger neural networks have +been designed to solve complex tasks. We take advantage of these capacity-rich +models to lower the cost of inference by exploiting computation in +superposition. To reduce the computational burden per input, we propose +Multiple-Input-Multiple-Output Neural Networks (MIMONets) capable of handling +many inputs at once. MIMONets augment various deep neural network architectures +with variable binding mechanisms to represent an arbitrary number of inputs in +a compositional data structure via fixed-width distributed representations. +Accordingly, MIMONets adapt nonlinear neural transformations to process the +data structure holistically, leading to a speedup nearly proportional to the +number of superposed input items in the data structure. After processing in +superposition, an unbinding mechanism recovers each transformed input of +interest. MIMONets also provide a dynamic trade-off between accuracy and +throughput by an instantaneous on-demand switching between a set of +accuracy-throughput operating points, yet within a single set of fixed +parameters. We apply the concept of MIMONets to both CNN and Transformer +architectures resulting in MIMOConv and MIMOFormer, respectively. Empirical +evaluations show that MIMOConv achieves about 2-4 x speedup at an accuracy +delta within [+0.68, -3.18]% compared to WideResNet CNNs on CIFAR10 and +CIFAR100. Similarly, MIMOFormer can handle 2-4 inputs at once while maintaining +a high average accuracy within a [-1.07, -3.43]% delta on the long range arena +benchmark. Finally, we provide mathematical bounds on the interference between +superposition channels in MIMOFormer. Our code is available at +https://github.com/IBM/multiple-input-multiple-output-nets. + +
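+ The variable-binding idea underlying computation in superposition can be
+illustrated with a toy Hadamard-binding example: bind each input to a fixed
+random key, sum the bound vectors into a single fixed-width vector, process it
+once, and unbind. This is a didactic sketch of the general principle, not the
+MIMOConv or MIMOFormer architecture.
+```python
+import numpy as np
+
+d, n_inputs = 512, 4
+rng = np.random.default_rng(0)
+keys = rng.choice([-1.0, 1.0], size=(n_inputs, d))   # fixed bipolar binding keys
+inputs = rng.normal(size=(n_inputs, d))
+
+bound = keys * inputs                 # element-wise (Hadamard) variable binding
+superposed = bound.sum(axis=0)        # one fixed-width vector carries all inputs
+# ... a network would process `superposed` holistically here ...
+recovered = keys * superposed         # unbinding with the same keys
+# recovered[i] equals inputs[i] plus crosstalk noise from the other bound items.
+```
+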
+
+ comment: accepted in NeurIPS 2023 +
+
+
+
+
+ + ☆ Convergence Rates for Stochastic Approximation: Biased Noise with + Unbounded Variance, and Applications + + +
+ The Stochastic Approximation (SA) algorithm introduced by Robbins and Monro
+in 1951 has been a standard method for solving equations of the form
+$\mathbf{f}({\boldsymbol {\theta}}) = \mathbf{0}$, when only noisy measurements
+of $\mathbf{f}(\cdot)$ are available. If $\mathbf{f}({\boldsymbol {\theta}}) =
+\nabla J({\boldsymbol {\theta}})$ for some function $J(\cdot)$, then SA can
+also be used to find a stationary point of $J(\cdot)$. In much of the
+literature, it is assumed that the error term ${\boldsymbol {\xi}}_{t+1}$ has
+zero conditional mean, and that its conditional variance is bounded as a
+function of $t$ (though not necessarily with respect to
+${\boldsymbol {\theta}}_t$). Also, for the most part, the emphasis has been on
+``synchronous'' SA, whereby, at each time $t$, \textit{every} component of
+${\boldsymbol {\theta}}_t$ is updated. Over the years, SA has been applied to a
+variety of areas, out of which two are the focus in this paper: Convex and
+nonconvex optimization, and Reinforcement Learning (RL). As it turns out, in
+these applications, the above-mentioned assumptions do not always hold. In
+zero-order methods, the error neither has zero mean nor bounded conditional
+variance. In the present paper, we extend SA theory to encompass errors with
+nonzero conditional mean and/or unbounded conditional variance, and also
+asynchronous SA. In addition, we derive estimates for the rate of convergence
+of the algorithm. Then we apply the new results to problems in nonconvex
+optimization, and to Markovian SA, a recently emerging area in RL. We prove
+that SA converges in these situations, and compute the ``optimal step size
+sequences'' to maximize the estimated rate of convergence.
+
+
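+ For context (a standard form, stated here for reference rather than quoted
+from the paper), the classical synchronous SA recursion with step sizes
+$\alpha_t$ reads
+$${\boldsymbol {\theta}}_{t+1} = {\boldsymbol {\theta}}_t + \alpha_t \left( \mathbf{f}({\boldsymbol {\theta}}_t) + {\boldsymbol {\xi}}_{t+1} \right),$$
+and the analysis above extends this setting to asynchronous updates and to
+noise ${\boldsymbol {\xi}}_{t+1}$ with nonzero conditional mean and unbounded
+conditional variance.
+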
+
+ comment: 40 pages, 3 figures +
+
+
+
+
+ + ☆ Calibrated Adaptive Teacher for Domain Adaptive Intelligent Fault + Diagnosis + + +
+ Intelligent Fault Diagnosis (IFD) based on deep learning has proven to be an +effective and flexible solution, attracting extensive research. Deep neural +networks can learn rich representations from vast amounts of representative +labeled data for various applications. In IFD, they achieve high classification +performance from signals in an end-to-end manner, without requiring extensive +domain knowledge. However, deep learning models usually only perform well on +the data distribution they have been trained on. When applied to a different +distribution, they may experience performance drops. This is also observed in +IFD, where assets are often operated in working conditions different from those +in which labeled data have been collected. Unsupervised domain adaptation (UDA) +deals with the scenario where labeled data are available in a source domain, +and only unlabeled data are available in a target domain, where domains may +correspond to operating conditions. Recent methods rely on training with +confident pseudo-labels for target samples. However, the confidence-based +selection of pseudo-labels is hindered by poorly calibrated confidence +estimates in the target domain, primarily due to over-confident predictions, +which limits the quality of pseudo-labels and leads to error accumulation. In +this paper, we propose a novel UDA method called Calibrated Adaptive Teacher +(CAT), where we propose to calibrate the predictions of the teacher network +throughout the self-training process, leveraging post-hoc calibration +techniques. We evaluate CAT on domain-adaptive IFD and perform extensive +experiments on the Paderborn benchmark for bearing fault diagnosis under +varying operating conditions. Our proposed method achieves state-of-the-art +performance on most transfer tasks. + +
+
+ comment: 23 pages. Under review +
+
+
+
+
+ + ☆ Score-Aware Policy-Gradient Methods and Performance Guarantees using + Local Lyapunov Conditions: Applications to Product-Form Stochastic Networks + and Queueing Systems + + +
+ Stochastic networks and queueing systems often lead to Markov decision +processes (MDPs) with large state and action spaces as well as nonconvex +objective functions, which hinders the convergence of many reinforcement +learning (RL) algorithms. Policy-gradient methods perform well on MDPs with +large state and action spaces, but they sometimes experience slow convergence +due to the high variance of the gradient estimator. In this paper, we show that +some of these difficulties can be circumvented by exploiting the structure of +the underlying MDP. We first introduce a new family of gradient estimators +called score-aware gradient estimators (SAGEs). When the stationary +distribution of the MDP belongs to an exponential family parametrized by the +policy parameters, SAGEs allow us to estimate the policy gradient without +relying on value-function estimation, contrary to classical policy-gradient +methods like actor-critic. To demonstrate their applicability, we examine two +common control problems arising in stochastic networks and queueing systems +whose stationary distributions have a product-form, a special case of +exponential families. As a second contribution, we show that, under appropriate +assumptions, the policy under a SAGE-based policy-gradient method has a large +probability of converging to an optimal policy, provided that it starts +sufficiently close to it, even with a nonconvex objective function and multiple +maximizers. Our key assumptions are that, locally around a maximizer, a +nondegeneracy property of the Hessian of the objective function holds and a +Lyapunov function exists. Finally, we conduct a numerical comparison between a +SAGE-based policy-gradient method and an actor-critic algorithm. The results +demonstrate that the SAGE-based method finds close-to-optimal policies more +rapidly, highlighting its superior performance over the traditional +actor-critic method. + +
+
+ comment: 45 pages, 5 figures +
+
+
+
+
+ + ☆ Weakly Supervised Detection of Hallucinations in LLM Activations + + +
+ We propose an auditing method to identify whether a large language model +(LLM) encodes patterns such as hallucinations in its internal states, which may +propagate to downstream tasks. We introduce a weakly supervised auditing +technique using a subset scanning approach to detect anomalous patterns in LLM +activations from pre-trained models. Importantly, our method does not need +knowledge of the type of patterns a-priori. Instead, it relies on a reference +dataset devoid of anomalies during testing. Further, our approach enables the +identification of pivotal nodes responsible for encoding these patterns, which +may offer crucial insights for fine-tuning specific sub-networks for bias +mitigation. We introduce two new scanning methods to handle LLM activations for +anomalous sentences that may deviate from the expected distribution in either +direction. Our results confirm prior findings of BERT's limited internal +capacity for encoding hallucinations, while OPT appears capable of encoding +hallucination information internally. Importantly, our scanning approach, +without prior exposure to false statements, performs comparably to a fully +supervised out-of-distribution classifier. + +
+
+
+
+
+ + ☆ Materials Expert-Artificial Intelligence for Materials Discovery + + +
+ The advent of material databases provides an unprecedented opportunity to +uncover predictive descriptors for emergent material properties from vast data +space. However, common reliance on high-throughput ab initio data necessarily +inherits limitations of such data: mismatch with experiments. On the other +hand, experimental decisions are often guided by an expert's intuition honed +from experiences that are rarely articulated. We propose using machine learning +to "bottle" such operational intuition into quantifiable descriptors using +expertly curated measurement-based data. We introduce "Materials +Expert-Artificial Intelligence" (ME-AI) to encapsulate and articulate this +human intuition. As a first step towards such a program, we focus on the +topological semimetal (TSM) among square-net materials as the property inspired +by the expert-identified descriptor based on structural information: the +tolerance factor. We start by curating a dataset encompassing 12 primary +features of 879 square-net materials, using experimental data whenever +possible. We then use Dirichlet-based Gaussian process regression using a +specialized kernel to reveal composite descriptors for square-net topological +semimetals. The ME-AI learned descriptors independently reproduce expert +intuition and expand upon it. Specifically, new descriptors point to +hypervalency as a critical chemical feature predicting TSM within square-net +compounds. Our success with a carefully defined problem points to the "machine +bottling human insight" approach as promising for machine learning-aided +material discovery. + +
+
+ comment: 8 pages main text, 4 figs, 8 pages Supplementary material +
+
+
+
+
+ + ☆ Machine Learning Driven Sensitivity Analysis of E3SM Land Model + Parameters for Wetland Methane Emissions + + +
+ Methane (CH4) is the second most critical greenhouse gas after carbon
+dioxide, contributing to 16-25% of the observed atmospheric warming. Wetlands
+are the primary natural source of methane emissions globally. However, wetland
+methane emission estimates from biogeochemistry models contain considerable
+uncertainty. One of the main sources of this uncertainty arises from the
+numerous uncertain model parameters within various physical, biological, and
+chemical processes that influence methane production, oxidation, and transport.
+Sensitivity Analysis (SA) can help identify critical parameters for methane
+emission and achieve reduced biases and uncertainties in future projections.
+This study performs SA for 19 selected parameters responsible for critical
+biogeochemical processes in the methane module of the Energy Exascale Earth
+System Model (E3SM) land model (ELM). The impact of these parameters on various
+CH4 fluxes is examined at 14 FLUXNET-CH4 sites with diverse vegetation types.
+Given the extensive number of model simulations needed for global
+variance-based SA, we employ a machine learning (ML) algorithm to emulate the
+complex behavior of ELM methane biogeochemistry. ML enables the computational
+time to be shortened significantly from 6 CPU hours to 0.72 milliseconds,
+achieving reduced computational costs. We found that parameters linked to CH4
+production and diffusion generally present the highest sensitivities despite
+apparent seasonal variation. Comparing simulated emissions from perturbed
+parameter sets against FLUXNET-CH4 observations revealed that better
+performances can be achieved at each site compared to the default parameter
+values. This leaves scope for further improving simulated emissions using
+parameter calibration with advanced optimization techniques like Bayesian
+optimization.
+
+
+
+ comment: 24 pages, 9 figures and 2 tables +
+
+
+
+
+ + ☆ Large Language Models on Graphs: A Comprehensive Survey + + +
+ Large language models (LLMs), such as ChatGPT and LLaMA, are creating +significant advancements in natural language processing, due to their strong +text encoding/decoding ability and newly found emergent capability (e.g., +reasoning). While LLMs are mainly designed to process pure texts, there are +many real-world scenarios where text data are associated with rich structure +information in the form of graphs (e.g., academic networks, and e-commerce +networks) or scenarios where graph data are paired with rich textual +information (e.g., molecules with descriptions). Besides, although LLMs have +shown their pure text-based reasoning ability, it is underexplored whether such +ability can be generalized to graph scenarios (i.e., graph-based reasoning). In +this paper, we provide a systematic review of scenarios and techniques related +to large language models on graphs. We first summarize potential scenarios of +adopting LLMs on graphs into three categories, namely pure graphs, text-rich +graphs, and text-paired graphs. We then discuss detailed techniques for +utilizing LLMs on graphs, including LLM as Predictor, LLM as Encoder, and LLM +as Aligner, and compare the advantages and disadvantages of different schools +of models. Furthermore, we mention the real-world applications of such methods +and summarize open-source codes and benchmark datasets. Finally, we conclude +with potential future research directions in this fast-growing field. The +related source can be found at +https://github.com/PeterGriffinJin/Awesome-Language-Model-on-Graphs. + +
+
+ comment: 26 pages +
+
+
+
+
+ + ☆ Scaling Laws for Adversarial Attacks on Language Model Activations + + +
+ We explore a class of adversarial attacks targeting the activations of +language models. By manipulating a relatively small subset of model +activations, $a$, we demonstrate the ability to control the exact prediction of +a significant number (in some cases up to 1000) of subsequent tokens $t$. We +empirically verify a scaling law where the maximum number of target tokens +$t_\mathrm{max}$ predicted depends linearly on the number of tokens $a$ whose +activations the attacker controls as $t_\mathrm{max} = \kappa a$. We find that +the number of bits of control in the input space needed to control a single bit +in the output space (what we call attack resistance $\chi$) is remarkably +constant between $\approx 16$ and $\approx 25$ over 2 orders of magnitude of +model sizes for different language models. Compared to attacks on tokens, +attacks on activations are predictably much stronger, however, we identify a +surprising regularity where one bit of input steered either via activations or +via tokens is able to exert control over a similar amount of output bits. This +gives support for the hypothesis that adversarial attacks are a consequence of +dimensionality mismatch between the input and output spaces. A practical +implication of the ease of attacking language model activations instead of +tokens is for multi-modal and selected retrieval models, where additional data +sources are added as activations directly, sidestepping the tokenized input. +This opens up a new, broad attack surface. By using language models as a +controllable test-bed to study adversarial attacks, we were able to experiment +with input-output dimensions that are inaccessible in computer vision, +especially where the output dimension dominates. + +
+
+ comment: 15 pages, 9 figures +
+
+
+
+
+ + ☆ Learning "Look-Ahead" Nonlocal Traffic Dynamics in a Ring Road + + +
+ The macroscopic traffic flow model is widely used for traffic control and
+management. To incorporate drivers' anticipative behaviors and to remove
+impractical speed discontinuity inherent in the classic
+Lighthill-Whitham-Richards (LWR) traffic model, nonlocal partial differential
+equation (PDE) models with ``look-ahead" dynamics have been proposed, which
+assume that the speed is a function of weighted downstream traffic density.
+However, these models lack data validation on two important questions: whether
+there exist nonlocal dynamics, and how the length and weight of the
+``look-ahead" window affect the spatio-temporal propagation of traffic
+densities. In this paper, we adopt traffic trajectory data from a ring-road
+experiment and design a physics-informed neural network to learn the
+fundamental diagram and look-ahead kernel that best fit the data, and reinvent
+a data-enhanced nonlocal LWR model via minimizing the loss function combining
+the data discrepancy and the nonlocal model discrepancy. Results show that the
+learned nonlocal LWR yields a more accurate prediction of traffic wave
+propagation in three different scenarios: stop-and-go oscillations, congested,
+and free traffic. We first demonstrate the existence of the ``look-ahead"
+effect with real traffic data. The optimal nonlocal kernel is found to have a
+length of around 35 to 50 meters, and the kernel weight within 5 meters
+accounts for the majority of the nonlocal effect. Our results also underscore
+the importance of choosing a priori physics in machine learning models.
+
+
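+ A minimal numerical sketch of the ``look-ahead" idea: the speed in each cell
+depends on a weighted average of downstream densities within a finite window
+rather than on the local density alone. The kernel shape, window length, and
+fundamental diagram below are illustrative placeholders, not the learned ones.
+```python
+import numpy as np
+
+def nonlocal_speed(rho, dx=1.0, window_m=40.0, decay_m=5.0, v_max=30.0, rho_max=0.2):
+    n = int(window_m / dx)
+    w = np.exp(-np.arange(n) * dx / decay_m)      # kernel weight decays downstream
+    w /= w.sum()
+    # Weighted average of downstream densities (ring road, so indices wrap around).
+    rho_look = sum(w[k] * np.roll(rho, -k) for k in range(n))
+    return v_max * (1.0 - rho_look / rho_max)     # Greenshields-type fundamental diagram
+```
+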
+
+
+
+
+ + ☆ C3: High-performance and low-complexity neural compression from a single + image or video + + +
+ Most neural compression models are trained on large datasets of images or +videos in order to generalize to unseen data. Such generalization typically +requires large and expressive architectures with a high decoding complexity. +Here we introduce C3, a neural compression method with strong rate-distortion +(RD) performance that instead overfits a small model to each image or video +separately. The resulting decoding complexity of C3 can be an order of +magnitude lower than neural baselines with similar RD performance. C3 builds on +COOL-CHIC (Ladune et al.) and makes several simple and effective improvements +for images. We further develop new methodology to apply C3 to videos. On the +CLIC2020 image benchmark, we match the RD performance of VTM, the reference +implementation of the H.266 codec, with less than 3k MACs/pixel for decoding. +On the UVG video benchmark, we match the RD performance of the Video +Compression Transformer (Mentzer et al.), a well-established neural video +codec, with less than 5k MACs/pixel for decoding. + +
+
+
+
+
+ + ☆ Compositional Generalization for Data-to-Text Generation + + +
+ Data-to-text generation involves transforming structured data, often
+represented as predicate-argument tuples, into coherent textual descriptions.
+Despite recent advances, systems still struggle when confronted with unseen
+combinations of predicates, producing unfaithful descriptions (e.g.
+hallucinations or omissions). We refer to this issue as compositional
+generalization, and it encouraged us to create a benchmark for assessing the
+performance of different approaches on this specific problem. Furthermore, we
+propose a novel model that addresses compositional generalization by clustering
+predicates into groups. Our model generates text in a sentence-by-sentence
+manner, relying on one cluster of predicates at a time. This approach
+significantly outperforms T5 baselines across all evaluation metrics. Notably,
+it achieved a 31% improvement over T5 in terms of a metric focused on
+maintaining faithfulness to the input.
+
+
+
+
+
+
+ + ☆ LExCI: A Framework for Reinforcement Learning with Embedded Systems + + +
+ Advances in artificial intelligence (AI) have led to its application in many +areas of everyday life. In the context of control engineering, reinforcement +learning (RL) represents a particularly promising approach as it is centred +around the idea of allowing an agent to freely interact with its environment to +find an optimal strategy. One of the challenges professionals face when +training and deploying RL agents is that the latter often have to run on +dedicated embedded devices. This could be to integrate them into an existing +toolchain or to satisfy certain performance criteria like real-time +constraints. Conventional RL libraries, however, cannot be easily utilised in +conjunction with that kind of hardware. In this paper, we present a framework +named LExCI, the Learning and Experiencing Cycle Interface, which bridges this +gap and provides end-users with a free and open-source tool for training agents +on embedded systems using the open-source library RLlib. Its operability is +demonstrated with two state-of-the-art RL-algorithms and a rapid control +prototyping system. + +
+
+ comment: The code, models, and data used for this work are available in a + separate branch of LExCI's GitHub repository + (https://github.com/mechatronics-RWTH/lexci-2/tree/lexci_paper). This paper + has been submitted to Applied Intelligence + (https://link.springer.com/journal/10489) +
+
+
+
+
+ + ☆ Towards Measuring Representational Similarity of Large Language Models NeurIPS 2023 + + +
+ Understanding the similarity of the numerous released large language models
+(LLMs) has many uses, e.g., simplifying model selection, detecting illegal
+model reuse, and advancing our understanding of what makes LLMs perform well.
+In this work, we measure the similarity of representations of a set of LLMs
+with 7B parameters. Our results suggest that some LLMs are substantially
+different from others. We identify challenges of using representational
+similarity measures that suggest the need for careful study of similarity
+scores to avoid false conclusions.
+
+
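+ One widely used representational-similarity measure is linear centered kernel
+alignment (CKA), sketched below; the abstract does not name the measures the
+authors evaluate, so this is an illustrative example rather than their method.
+```python
+import numpy as np
+
+def linear_cka(X: np.ndarray, Y: np.ndarray) -> float:
+    """X, Y: (n_samples, dim) activation matrices from two models on the same inputs."""
+    X = X - X.mean(axis=0)
+    Y = Y - Y.mean(axis=0)
+    hsic = np.linalg.norm(Y.T @ X, "fro") ** 2
+    return hsic / (np.linalg.norm(X.T @ X, "fro") * np.linalg.norm(Y.T @ Y, "fro"))
+```
+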
+
+ comment: Extended abstract in UniReps Workshop @ NeurIPS 2023 +
+
+
+
+
+ + ☆ Towards the Inferrence of Structural Similarity of Combinatorial + Landscapes + + +
+ One of the most common problem-solving heuristics is reasoning by analogy.
+For a given problem, a solver can be viewed as a strategic walk on its fitness
+landscape. Thus if a solver works for one problem instance, we expect it will
+also be effective for other instances whose fitness landscapes essentially
+share structural similarities with each other. However, due to the black-box
+nature of combinatorial optimization, it is far from trivial to infer such
+similarity in real-world scenarios. To bridge this gap, by using local optima
+networks as a proxy of fitness landscapes, this paper proposes to leverage
+graph data mining techniques to conduct qualitative and quantitative analyses
+to explore the latent topological structural information embedded in those
+landscapes. By conducting large-scale empirical experiments on three classic
+combinatorial optimization problems, we gain concrete evidence to support the
+existence of structural similarity between landscapes of the same classes
+within neighboring dimensions. We also interrogate the relationship between
+landscapes of different problem classes.
+
+
+
+
+
+
+ + ☆ (Provable) Adversarial Robustness for Group Equivariant Tasks: Graphs, + Point Clouds, Molecules, and More NeurIPS 2023 + + +
+ A machine learning model is traditionally considered robust if its prediction
+remains (almost) constant under input perturbations with small norm. However,
+real-world tasks like molecular property prediction or point cloud segmentation
+have inherent equivariances, such as rotation or permutation equivariance. In
+such tasks, even perturbations with large norm do not necessarily change an
+input's semantic content. Furthermore, there are perturbations for which a
+model's prediction explicitly needs to change. For the first time, we propose a
+sound notion of adversarial robustness that accounts for task equivariance. We
+then demonstrate that provable robustness can be achieved by (1) choosing a
+model that matches the task's equivariances and (2) certifying traditional
+adversarial robustness. Certification methods are, however, unavailable for
+many models, such as those with continuous equivariances. We close this gap by
+developing the framework of equivariance-preserving randomized smoothing, which
+enables architecture-agnostic certification. We additionally derive the first
+architecture-specific graph edit distance certificates, i.e. sound robustness
+guarantees for isomorphism equivariant tasks like node classification. Overall,
+a sound notion of robustness is an important prerequisite for future work at
+the intersection of robust and geometric machine learning.
+
+
+
+ comment: Accepted at NeurIPS 2023 +
+
+
+
+
+ + ☆ Analyzing and Improving the Training Dynamics of Diffusion Models + + +
+ Diffusion models currently dominate the field of data-driven image synthesis +with their unparalleled scaling to large datasets. In this paper, we identify +and rectify several causes for uneven and ineffective training in the popular +ADM diffusion model architecture, without altering its high-level structure. +Observing uncontrolled magnitude changes and imbalances in both the network +activations and weights over the course of training, we redesign the network +layers to preserve activation, weight, and update magnitudes on expectation. We +find that systematic application of this philosophy eliminates the observed +drifts and imbalances, resulting in considerably better networks at equal +computational complexity. Our modifications improve the previous record FID of +2.41 in ImageNet-512 synthesis to 1.81, achieved using fast deterministic +sampling. + As an independent contribution, we present a method for setting the +exponential moving average (EMA) parameters post-hoc, i.e., after completing +the training run. This allows precise tuning of EMA length without the cost of +performing several training runs, and reveals its surprising interactions with +network architecture, training time, and guidance. + +
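+ For readers less familiar with the EMA mentioned above, the conventional
+during-training weight EMA looks like the sketch below; the paper's
+contribution is setting these EMA parameters post hoc, after training, which is
+not reproduced here. Variable names are illustrative.
+```python
+import copy
+import torch
+
+@torch.no_grad()
+def update_ema(ema_model, model, decay=0.999):
+    # Exponential moving average of the weights, updated once per training step.
+    for p_ema, p in zip(ema_model.parameters(), model.parameters()):
+        p_ema.mul_(decay).add_(p, alpha=1.0 - decay)
+
+# ema_model = copy.deepcopy(model)   # initialise once, then call update_ema() each step
+```
+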
+
+
+
+
+ + ☆ DeepPointMap: Advancing LiDAR SLAM with Unified Neural Descriptors + + +
+ Point clouds have shown significant potential in various domains, including
+Simultaneous Localization and Mapping (SLAM). However, existing approaches
+either rely on dense point clouds to achieve high localization accuracy or use
+generalized descriptors to reduce map size. Unfortunately, these two aspects
+seem to conflict with each other. To address this limitation, we propose a
+unified architecture, DeepPointMap, achieving excellent performance on both
+aspects. We utilize a neural network to extract highly representative and
+sparse neural descriptors from point clouds, enabling memory-efficient map
+representation and accurate multi-scale localization tasks (e.g., odometry and
+loop-closure). Moreover, we showcase the versatility of our framework by
+extending it to more challenging multi-agent collaborative SLAM. The promising
+results obtained in these scenarios further emphasize the effectiveness and
+potential of our approach.
+
+
+
+
+
+
+ + ☆ Diffusion-Based Speech Enhancement in Matched and Mismatched Conditions + Using a Heun-Based Sampler + + +
+ Diffusion models are a new class of generative models that have recently been
+applied to speech enhancement successfully. Previous works have demonstrated
+their superior performance in mismatched conditions compared to
+state-of-the-art discriminative models. However, this was investigated with a
+single database for training and another one for testing, which makes the
+results highly dependent on the particular databases. Moreover, recent
+developments from the image generation literature remain largely unexplored for
+speech enhancement. These include several design aspects of diffusion models,
+such as the noise schedule or the reverse sampler. In this work, we
+systematically assess the generalization performance of a diffusion-based
+speech enhancement model by using multiple speech, noise and binaural room
+impulse response (BRIR) databases to simulate mismatched acoustic conditions.
+We also experiment with a noise schedule and a sampler that have not been
+applied to speech enhancement before. We show that the proposed system
+substantially benefits from using multiple databases for training, and achieves
+superior performance compared to state-of-the-art discriminative models in both
+matched and mismatched conditions. We also show that a Heun-based sampler
+achieves superior performance at a smaller computational cost compared to a
+sampler commonly used for speech enhancement.
+
+
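+ The Heun-based sampler referenced above is a second-order (predictor-corrector)
+ODE integrator; a generic Heun step for an ODE dx/dt = f(x, t) is sketched
+below. The drift function, noise schedule, and discretisation of the actual
+enhancement model are not reproduced here.
+```python
+def heun_step(f, x, t, t_next):
+    dt = t_next - t
+    d1 = f(x, t)                       # Euler (predictor) slope
+    x_euler = x + dt * d1
+    d2 = f(x_euler, t_next)            # corrector slope at the predicted point
+    return x + dt * 0.5 * (d1 + d2)    # average the two slopes (Heun / trapezoidal)
+```
+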
+
+
+
+
+ + ☆ H-GAP: Humanoid Control with a Generalist Planner + + +
+ Humanoid control is an important research challenge offering avenues for
+integration into human-centric infrastructures and enabling physics-driven
+humanoid animations. The daunting challenges in this field stem from the
+difficulty of optimizing in high-dimensional action spaces and the instability
+introduced by the bipedal morphology of humanoids. However, the extensive
+collection of human motion-captured data and the derived datasets of humanoid
+trajectories, such as MoCapAct, pave the way to tackle these challenges. In
+this context, we present Humanoid Generalist Autoencoding Planner (H-GAP), a
+state-action trajectory generative model trained on humanoid trajectories
+derived from human motion-captured data, capable of adeptly handling downstream
+control tasks with Model Predictive Control (MPC). For a humanoid with 56
+degrees of freedom, we empirically demonstrate that H-GAP learns to represent
+and generate a wide range of motor behaviours. Further, without any learning
+from online interactions, it can also flexibly transfer these behaviors to
+solve novel downstream control tasks via planning. Notably, H-GAP outperforms
+established MPC baselines that have access to the ground truth dynamics model,
+and is superior or comparable to offline RL methods trained for individual
+tasks. Finally, we conduct a series of empirical studies on the scaling
+properties of H-GAP, showing the potential for performance gains via additional
+data but not additional compute. Code and videos are available at
+https://ycxuyingchen.github.io/hgap/.
+
+
+
+ comment: 18 pages including appendix, 4 figures +
+
+
+
+
+ + ☆ Amortized Bayesian Decision Making for simulation-based models + + +
+ Simulation-based inference (SBI) provides a powerful framework for inferring
+posterior distributions of stochastic simulators in a wide range of domains. In
+many settings, however, the posterior distribution is not the end goal itself
+-- rather, the derived parameter values and their uncertainties are used as a
+basis for deciding what actions to take. Unfortunately, because posterior
+distributions provided by SBI are (potentially crude) approximations of the
+true posterior, the resulting decisions can be suboptimal. Here, we address the
+question of how to perform Bayesian decision making on stochastic simulators,
+and how one can circumvent the need to compute an explicit approximation to the
+posterior. Our method trains a neural network on simulated data and can predict
+the expected cost given any data and action, and can, thus, be directly used to
+infer the action with lowest cost. We apply our method to several benchmark
+problems and demonstrate that it induces costs similar to those incurred with
+the true posterior distribution. We then apply the method to infer optimal
+actions in a real-world simulator in the medical neurosciences, the Bayesian
+Virtual Epileptic Patient, and demonstrate that it allows inferring actions
+associated with low cost after only a few simulations.
+
+
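+ The resulting decision rule is compact: query a network trained to predict
+expected cost from (observation, action) pairs over a grid of candidate actions
+and return the cheapest one. The sketch below illustrates this under simple
+assumptions; `cost_net` and the tensor shapes are hypothetical.
+```python
+import torch
+
+def best_action(cost_net, observation, candidate_actions):
+    # observation: tensor of shape (obs_dim,); candidate_actions: (n, act_dim).
+    obs = observation.unsqueeze(0).expand(candidate_actions.shape[0], -1)
+    costs = cost_net(torch.cat([obs, candidate_actions], dim=-1)).squeeze(-1)
+    return candidate_actions[costs.argmin()]
+```
+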
+
+
+
+
+ + ☆ Learning a Sparse Representation of Barron Functions with the Inverse + Scale Space Flow + + +
+ This paper presents a method for finding a sparse representation of Barron
+functions. Specifically, given an $L^2$ function $f$, the inverse scale space
+flow is used to find a sparse measure $\mu$ minimising the $L^2$ loss between
+the Barron function associated to the measure $\mu$ and the function $f$. The
+convergence properties of this method are analysed in an ideal setting and in
+the cases of measurement noise and sampling bias. In an ideal setting the
+objective decreases strictly monotonically in time to a minimizer at rate
+$\mathcal{O}(1/t)$, and in the case of measurement noise or sampling bias the
+optimum is achieved up to a multiplicative or additive constant. This
+convergence is preserved under discretization of the parameter space, and the
+minimizers on increasingly fine discretizations converge to the optimum on the
+full parameter space.
+
+
+
+ comment: 30 pages, 0 figures +
+
+
+
+
+ + ☆ Lights out: training RL agents robust to temporary blindness + + +
+ Agents trained with DQN rely on an observation at each timestep to decide +what action to take next. However, in real world applications observations can +change or be missing entirely. Examples of this could be a light bulb breaking +down, or the wallpaper in a certain room changing. While these situations +change the actual observation, the underlying optimal policy does not change. +Because of this we want our agent to continue taking actions until it receives +a (recognized) observation again. To achieve this we introduce a combination of +a neural network architecture that uses hidden representations of the +observations and a novel n-step loss function. Our implementation is able to +withstand location based blindness stretches longer than the ones it was +trained on, and therefore shows robustness to temporary blindness. For access +to our implementation, please email Nathan, Marije, or Pau. + +
+
+
+
+
+ + ☆ A Self-Commissioning Edge Computing Method for Data-Driven Anomaly + Detection in Power Electronic Systems + + +
+ Ensuring the reliability of power electronic converters is a matter of great +importance, and data-driven condition monitoring techniques are cementing +themselves as an important tool for this purpose. However, translating methods +that work well in controlled lab environments to field applications presents +significant challenges, notably because of the limited diversity and accuracy +of the lab training data. By enabling the use of field data, online machine +learning can be a powerful tool to overcome this problem, but it introduces +additional challenges in ensuring the stability and predictability of the +training processes. This work presents an edge computing method that mitigates +these shortcomings with minimal additional memory usage, by employing an +autonomous algorithm that prioritizes the storage of training samples with +larger prediction errors. The method is demonstrated on the use case of a +self-commissioning condition monitoring system, in the form of a thermal +anomaly detection scheme for a variable frequency motor drive, where the +algorithm self-learned to distinguish normal and anomalous operation with +minimal prior knowledge. The obtained results, based on experimental data, show +a significant improvement in prediction accuracy and training speed, when +compared to equivalent models trained online without the proposed data +selection process. + +
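+ The storage rule described above can be pictured as a bounded buffer that only
+keeps the samples with the largest prediction errors; a min-heap keyed on the
+error makes the replacement decision cheap. The sketch below is a generic
+illustration with hypothetical names, not the deployed edge implementation.
+```python
+import heapq
+
+class ErrorPriorityBuffer:
+    def __init__(self, capacity: int):
+        self.capacity = capacity
+        self._heap = []        # (error, counter, sample); smallest error at the root
+        self._counter = 0      # tie-breaker so samples themselves are never compared
+
+    def add(self, sample, error: float):
+        item = (error, self._counter, sample)
+        self._counter += 1
+        if len(self._heap) < self.capacity:
+            heapq.heappush(self._heap, item)
+        elif error > self._heap[0][0]:             # evict the current smallest error
+            heapq.heapreplace(self._heap, item)
+
+    def samples(self):
+        return [s for _, _, s in self._heap]
+```
+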
+
+
+
+
+ + ☆ Do AI models produce better weather forecasts than physics-based models? + A quantitative evaluation case study of Storm Ciarán + + +
+ There has been huge recent interest in the potential of making operational
+weather forecasts using machine learning techniques. As they become a part of
+the weather forecasting toolbox, there is a pressing need to understand how
+well current machine learning models can simulate high-impact weather events.
+We compare forecasts of Storm Ciar\'an, a European windstorm that caused
+sixteen deaths and extensive damage in Northern Europe, made by machine
+learning and numerical weather prediction models. The four machine learning
+models considered (FourCastNet, Pangu-Weather, GraphCast and FourCastNet-v2)
+produce forecasts that accurately capture the synoptic-scale structure of the
+cyclone including the position of the cloud head, shape of the warm sector and
+location of warm conveyor belt jet, and the large-scale dynamical drivers
+important for the rapid storm development such as the position of the storm
+relative to the upper-level jet exit. However, their ability to resolve the
+more detailed structures important for issuing weather warnings is more mixed.
+All of the machine learning models underestimate the peak amplitude of winds
+associated with the storm, only some machine learning models resolve the warm
+core seclusion and none of the machine learning models capture the sharp
+bent-back warm frontal gradient. Our study shows there is a great deal about
+the performance and properties of machine learning weather forecasts that can
+be derived from case studies of high-impact weather events such as Storm
+Ciar\'an.
+
+
+
+
+
+ + ☆ What Machine Learning Can Do for Focusing Aerogel Detectors + + +
+ Particle identification at the Super Charm-Tau factory experiment will be +provided by a Focusing Aerogel Ring Imaging CHerenkov detector (FARICH). The +specifics of detector location make proper cooling difficult, therefore a +significant number of ambient background hits are captured. They must be +mitigated to reduce the data flow and improve particle velocity resolution. In +this work we present several approaches to filtering signal hits, inspired by +machine learning techniques from computer vision. + +
+
+ comment: 5 pages, 4 figures, to be published in 26th International Conference + on Computing in High Energy & Nuclear Physics (CHEP2023) proceedings +
+
+
+
+
+ + ☆ A Q-learning approach to the continuous control problem of robot + inverted pendulum balancing + + +
+ This study evaluates the application of a discrete action space reinforcement
+learning method (Q-learning) to the continuous control problem of robot
+inverted pendulum balancing. To speed up the learning process and to overcome
+technical difficulties related to the direct learning on the real robotic
+system, the learning phase is performed in a simulation environment. A
+mathematical model of the system dynamics is implemented, deduced by curve
+fitting on data acquired from the real system. The proposed approach proved
+feasible, as demonstrated by its application on a real-world robot that
+learned to balance an inverted pendulum. This study also reinforces and
+demonstrates the importance of an accurate representation of the physical world
+in simulation to achieve a more efficient implementation of reinforcement
+learning algorithms in the real world, even when using a discrete action space
+algorithm to control a continuous action.
+
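+ A toy sketch of the general recipe -- discretise states and torques, learn a
+tabular Q-function on a simulated model, and act greedily -- with simplified
+pendulum dynamics, bin sizes, and constants that are assumptions rather than
+the paper's identified model.
+
+    # Toy tabular Q-learning on a discretised inverted-pendulum model.
+    import numpy as np
+
+    rng = np.random.default_rng(0)
+    angle_bins = np.linspace(-np.pi / 4, np.pi / 4, 21)
+    vel_bins = np.linspace(-2.0, 2.0, 21)
+    torques = np.array([-2.0, 0.0, 2.0])          # discrete action set
+    Q = np.zeros((len(angle_bins), len(vel_bins), len(torques)))
+
+    def discretize(theta, omega):
+        i = np.clip(np.digitize(theta, angle_bins) - 1, 0, len(angle_bins) - 1)
+        j = np.clip(np.digitize(omega, vel_bins) - 1, 0, len(vel_bins) - 1)
+        return i, j
+
+    def step(theta, omega, torque, dt=0.02):       # crude pendulum dynamics
+        omega += dt * (9.81 * np.sin(theta) + torque)
+        theta += dt * omega
+        return theta, omega, -(theta ** 2 + 0.1 * omega ** 2)   # reward
+
+    alpha, gamma, eps = 0.1, 0.99, 0.1
+    for _ in range(2000):                          # training episodes
+        theta, omega = rng.uniform(-0.1, 0.1, size=2)
+        for _ in range(200):
+            i, j = discretize(theta, omega)
+            a = rng.integers(len(torques)) if rng.random() < eps else np.argmax(Q[i, j])
+            theta, omega, r = step(theta, omega, torques[a])
+            ni, nj = discretize(theta, omega)
+            Q[i, j, a] += alpha * (r + gamma * Q[ni, nj].max() - Q[i, j, a])
+            if abs(theta) > np.pi / 4:             # pendulum fell over
+                break
+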
+
+
+
+
+ + ☆ SAMSGL: Series-Aligned Multi-Scale Graph Learning for Spatio-Temporal + Forecasting + + +
+ Spatio-temporal forecasting in various domains, like traffic prediction and +weather forecasting, is a challenging endeavor, primarily due to the +difficulties in modeling propagation dynamics and capturing high-dimensional +interactions among nodes. Despite the significant strides made by graph-based +networks in spatio-temporal forecasting, there remain two pivotal factors +closely related to forecasting performance that need further consideration: +time delays in propagation dynamics and multi-scale high-dimensional +interactions. In this work, we present a Series-Aligned Multi-Scale Graph +Learning (SAMSGL) framework, aiming to enhance forecasting performance. In +order to handle time delays in spatial interactions, we propose a +series-aligned graph convolution layer to facilitate the aggregation of +non-delayed graph signals, thereby mitigating the influence of time delays for +the improvement in accuracy. To understand global and local spatio-temporal +interactions, we develop a spatio-temporal architecture via multi-scale graph +learning, which encompasses two essential components: multi-scale graph +structure learning and graph-fully connected (Graph-FC) blocks. The multi-scale +graph structure learning includes a global graph structure to learn both +delayed and non-delayed node embeddings, as well as a local one to learn node +variations influenced by neighboring factors. The Graph-FC blocks +synergistically fuse spatial and temporal information to boost prediction +accuracy. To evaluate the performance of SAMSGL, we conduct experiments on +meteorological and traffic forecasting datasets, which demonstrate its +effectiveness and superiority. + +
+
+
 comment: 13 pages, 7 figures +
+
+
+
+
+ + ☆ On the Initialization of Graph Neural Networks ICML 2023 + + +
+ Graph Neural Networks (GNNs) have displayed considerable promise in graph +representation learning across various applications. The core learning process +requires the initialization of model weight matrices within each GNN layer, +which is typically accomplished via classic initialization methods such as +Xavier initialization. However, these methods were originally motivated to +stabilize the variance of hidden embeddings and gradients across layers of +Feedforward Neural Networks (FNNs) and Convolutional Neural Networks (CNNs) to +avoid vanishing gradients and maintain steady information flow. In contrast, +within the GNN context classical initializations disregard the impact of the +input graph structure and message passing on variance. In this paper, we +analyze the variance of forward and backward propagation across GNN layers and +show that the variance instability of GNN initializations comes from the +combined effect of the activation function, hidden dimension, graph structure +and message passing. To better account for these influence factors, we propose +a new initialization method for Variance Instability Reduction within GNN +Optimization (Virgo), which naturally tends to equate forward and backward +variances across successive layers. We conduct comprehensive experiments on 15 +datasets to show that Virgo can lead to superior model performance and more +stable variance at initialization on node classification, link prediction and +graph classification tasks. Codes are in +https://github.com/LspongebobJH/virgo_icml2023. + +
+
+ comment: Accepted by ICML 2023 +
+
+
+
+
+ + ☆ Rethinking and Simplifying Bootstrapped Graph Latents WSDM 2024 + + +
+ Graph contrastive learning (GCL) has emerged as a representative paradigm in +graph self-supervised learning, where negative samples are commonly regarded as +the key to preventing model collapse and producing distinguishable +representations. Recent studies have shown that GCL without negative samples +can achieve state-of-the-art performance as well as scalability improvement, +with bootstrapped graph latent (BGRL) as a prominent step forward. However, +BGRL relies on a complex architecture to maintain the ability to scatter +representations, and the underlying mechanisms enabling the success remain +largely unexplored. In this paper, we introduce an instance-level decorrelation +perspective to tackle the aforementioned issue and leverage it as a springboard +to reveal the potential unnecessary model complexity within BGRL. Based on our +findings, we present SGCL, a simple yet effective GCL framework that utilizes +the outputs from two consecutive iterations as positive pairs, eliminating the +negative samples. SGCL only requires a single graph augmentation and a single +graph encoder without additional parameters. Extensive experiments conducted on +various graph benchmarks demonstrate that SGCL can achieve competitive +performance with fewer parameters, lower time and space costs, and significant +convergence speedup. + +
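+ A rough sketch of the positive-pair construction described above, using
+embeddings from the previous training iteration as targets for the current
+one; the alignment loss and the stand-in encoder are assumptions, not the
+exact SGCL objective.
+
+    # Treat embeddings from the previous iteration as positives for the current ones.
+    import torch
+    import torch.nn.functional as F
+
+    def consecutive_iteration_loss(z_now, z_prev):
+        # z_now: [num_nodes, d] from this step; z_prev: detached copy from the last step
+        z_now = F.normalize(z_now, dim=-1)
+        z_prev = F.normalize(z_prev.detach(), dim=-1)
+        return -(z_now * z_prev).sum(dim=-1).mean()
+
+    encoder = torch.nn.Linear(16, 8)      # stand-in for a graph encoder
+    x = torch.randn(100, 16)
+    z_prev = encoder(x).detach()
+    for _ in range(5):
+        loss = consecutive_iteration_loss(encoder(x), z_prev)
+        loss.backward()
+        with torch.no_grad():
+            for p in encoder.parameters():
+                p -= 1e-2 * p.grad            # plain SGD step
+                p.grad = None
+        z_prev = encoder(x).detach()          # becomes the positive for the next step
+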
+
+ comment: Accepted by WSDM 2024 +
+
+
+
+
+ + ☆ Projection Regret: Reducing Background Bias for Novelty Detection via + Diffusion Models NeurIPS 2023 + + +
+ Novelty detection is a fundamental task of machine learning which aims to +detect abnormal ($\textit{i.e.}$ out-of-distribution (OOD)) samples. Since +diffusion models have recently emerged as the de facto standard generative +framework with surprising generation results, novelty detection via diffusion +models has also gained much attention. Recent methods have mainly utilized the +reconstruction property of in-distribution samples. However, they often suffer +from detecting OOD samples that share similar background information to the +in-distribution data. Based on our observation that diffusion models can +\emph{project} any sample to an in-distribution sample with similar background +information, we propose \emph{Projection Regret (PR)}, an efficient novelty +detection method that mitigates the bias of non-semantic information. To be +specific, PR computes the perceptual distance between the test image and its +diffusion-based projection to detect abnormality. Since the perceptual distance +often fails to capture semantic changes when the background information is +dominant, we cancel out the background bias by comparing it against recursive +projections. Extensive experiments demonstrate that PR outperforms the prior +art of generative-model-based novelty detection methods by a significant +margin. + +
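+ A compact sketch of the scoring rule described above: compare the perceptual
+distance from a sample to its diffusion projection with the distance between
+successive projections. Here project and perceptual_distance are placeholders
+for the diffusion-based projection and a perceptual metric such as LPIPS.
+
+    import numpy as np
+
+    def project(x):
+        # Placeholder for "noise the image part-way, then denoise with the diffusion model".
+        return x * 0.9
+
+    def perceptual_distance(a, b):
+        # Placeholder for a perceptual metric such as LPIPS.
+        return float(np.mean((a - b) ** 2))
+
+    def projection_regret_score(x):
+        px = project(x)                 # projection of the test image
+        ppx = project(px)               # recursive projection (background-only change)
+        return perceptual_distance(x, px) - perceptual_distance(px, ppx)
+
+    x = np.random.rand(32, 32, 3)
+    print(projection_regret_score(x))   # higher score -> more likely out-of-distribution
+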
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ☆ Prompt Optimization via Adversarial In-Context Learning + + +
+ We propose a new method, Adversarial In-Context Learning (adv-ICL), to
+optimize prompts for in-context learning (ICL) by employing one LLM as a
+generator, another as a discriminator, and a third as a prompt modifier. As in
+traditional adversarial learning, adv-ICL is implemented as a two-player game
+between the generator and discriminator, where the generator tries to generate
+realistic enough output to fool the discriminator. In each round, given an
+input prefixed by task instructions and several exemplars, the generator
+produces an output. The discriminator is then tasked with classifying the
+generator input-output pair as model-generated or real data. Based on the
+discriminator loss, the prompt modifier proposes possible edits to the
+generator and discriminator prompts, and the edits that most improve the
+adversarial loss are selected. We show that adv-ICL results in significant
+improvements over state-of-the-art prompt optimization techniques for both open
+and closed-source models on 11 generation and classification tasks including
+summarization, arithmetic reasoning, machine translation, data-to-text
+generation, and the MMLU and big-bench hard benchmarks. In addition, because
+our method uses pre-trained models and updates only prompts rather than model
+parameters, it is computationally efficient, easy to extend to any LLM and
+task, and effective in low-resource settings.
+
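+ A skeleton of the three-model loop described above; call_llm is a placeholder
+for any LLM client, and the prompts and selection rule are illustrative
+assumptions rather than the exact adv-ICL procedure.
+
+    import random
+
+    def call_llm(prompt: str) -> str:
+        raise NotImplementedError("plug in your preferred LLM client here")
+
+    def discriminator_loss(verdicts):
+        # Fraction of generated samples the discriminator flags as model-generated.
+        return sum(v == "model-generated" for v in verdicts) / len(verdicts)
+
+    def adv_icl_round(gen_prompt, disc_prompt, mod_prompt, inputs):
+        outputs = [call_llm(f"{gen_prompt}\nInput: {x}\nOutput:") for x in inputs]
+        verdicts = [call_llm(f"{disc_prompt}\nInput: {x}\nOutput: {y}\n"
+                             f"Real or model-generated?")
+                    for x, y in zip(inputs, outputs)]
+        loss = discriminator_loss(verdicts)
+        # Ask the prompt modifier for candidate edits; in the full method the pair
+        # that most improves the adversarial loss would be kept.
+        candidates = [(call_llm(f"{mod_prompt}\nEdit this prompt:\n{gen_prompt}"),
+                       call_llm(f"{mod_prompt}\nEdit this prompt:\n{disc_prompt}"))
+                      for _ in range(3)]
+        best_gen, best_disc = random.choice(candidates)  # placeholder for loss-based selection
+        return best_gen, best_disc, loss
+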
+
+ comment: 21 pages +
+
+
+
+
+ + ☆ Privacy-Aware Data Acquisition under Data Similarity in Regression + Markets + + +
+ Data markets facilitate decentralized data exchange for applications such as +prediction, learning, or inference. The design of these markets is challenged +by varying privacy preferences as well as data similarity among data owners. +Related works have often overlooked how data similarity impacts pricing and +data value through statistical information leakage. We demonstrate that data +similarity and privacy preferences are integral to market design and propose a +query-response protocol using local differential privacy for a two-party data +acquisition mechanism. In our regression data market model, we analyze +strategic interactions between privacy-aware owners and the learner as a +Stackelberg game over the asked price and privacy factor. Finally, we +numerically evaluate how data similarity affects market participation and +traded data value. + +
+
+ comment: Submitted to IEEE Transactions on Neural Networks and Learning + Systems (submission version) +
+
+
+
+
+ + ☆ Panoptica -- instance-wise evaluation of 3D semantic and instance + segmentation maps + + +
+ This paper introduces panoptica, a versatile and performance-optimized +package designed for computing instance-wise segmentation quality metrics from +2D and 3D segmentation maps. panoptica addresses the limitations of existing +metrics and provides a modular framework that complements the original +intersection over union-based panoptic quality with other metrics, such as the +distance metric Average Symmetric Surface Distance. The package is open-source, +implemented in Python, and accompanied by comprehensive documentation and +tutorials. panoptica employs a three-step metrics computation process to cover +diverse use cases. The efficacy of panoptica is demonstrated on various +real-world biomedical datasets, where an instance-wise evaluation is +instrumental for an accurate representation of the underlying clinical task. +Overall, we envision panoptica as a valuable tool facilitating in-depth +evaluation of segmentation methods. + +
+
+ comment: 15 pages, 6 figures, 3 tables +
+
+
+
+
+ + ☆ TSVR+: Twin support vector regression with privileged information + + +
+ In the realm of machine learning, the data may contain additional attributes,
+known as privileged information (PI). The main purpose of PI is to assist in
+the training of the model and then utilize the acquired knowledge to make
+predictions for unseen samples. Support vector regression (SVR) is an effective
+regression model; however, it has a low learning speed due to solving a convex
+quadratic problem (QP) subject to a pair of constraints. In contrast, twin
+support vector regression (TSVR) is more efficient than SVR as it solves two
+QPs, each subject to one set of constraints. However, TSVR and its variants are
+trained only on regular features and do not use privileged features for
+training. To fill this gap, we introduce a fusion of TSVR with learning using
+privileged information (LUPI) and propose a novel approach called twin support
+vector regression with privileged information (TSVR+). The regularization terms
+in the proposed TSVR+ capture the essence of statistical learning theory and
+implement the structural risk minimization principle. We use the successive
+overrelaxation (SOR) technique to solve the optimization problem of the
+proposed TSVR+, which enhances the training efficiency. To the best of our
+knowledge, the integration of the LUPI concept into twin variants of regression
+models is a novel advancement. The numerical experiments conducted on UCI,
+stock and time series data collectively demonstrate the superiority of the
+proposed model.
+
+
+
+
+
+ + ☆ FRAPPÉ: A Post-Processing Framework for Group Fairness Regularization + + +
+ Post-processing mitigation techniques for group fairness generally adjust the +decision threshold of a base model in order to improve fairness. Methods in +this family exhibit several advantages that make them appealing in practice: +post-processing requires no access to the model training pipeline, is agnostic +to the base model architecture, and offers a reduced computation cost compared +to in-processing. Despite these benefits, existing methods face other +challenges that limit their applicability: they require knowledge of the +sensitive attributes at inference time and are oftentimes outperformed by +in-processing. In this paper, we propose a general framework to transform any +in-processing method with a penalized objective into a post-processing +procedure. The resulting method is specifically designed to overcome the +aforementioned shortcomings of prior post-processing approaches. Furthermore, +we show theoretically and through extensive experiments on real-world data that +the resulting post-processing method matches or even surpasses the +fairness-error trade-off offered by the in-processing counterpart. + +
+
+ comment: Presubmission +
+
+
+
+
+ + ☆ UTBoost: A Tree-boosting based System for Uplift Modeling + + +
+ Uplift modeling refers to the set of machine learning techniques that a
+manager may use to estimate customer uplift, that is, the net effect of an
+action on some customer outcome. By identifying the subset of customers for
+whom a treatment will have the greatest effect, uplift models assist
+decision-makers in optimizing resource allocations and maximizing overall
+returns. Accurately estimating customer uplift poses practical challenges, as
+it requires assessing the difference between two mutually exclusive outcomes
+for each individual. In this paper, we propose two innovative adaptations of
+the well-established Gradient Boosting Decision Trees (GBDT) algorithm, which
+learn the causal effect in a sequential way and overcome the counterfactual
+nature of the problem. The two approaches innovate on existing techniques in
+terms of the ensemble learning method and the learning objective, respectively.
+Experiments on large-scale datasets demonstrate the usefulness of the proposed
+methods, which often yield remarkable improvements over base models. To
+facilitate application, we develop UTBoost, an end-to-end tree-boosting system
+specifically designed for uplift modeling. The package is open source and has
+been optimized for training speed to meet the needs of real industrial
+applications.
+
+
+ comment: 11 pages, 3 figures +
+
+
+
+
+ + ☆ Structured World Representations in Maze-Solving Transformers + + +
+ Transformer models underpin many recent advances in practical machine
+learning applications, yet understanding their internal behavior continues to
+elude researchers. Given the size and complexity of these models, forming a
+comprehensive picture of their inner workings remains a significant challenge.
+To this end, we set out to understand small transformer models in a more
+tractable setting: that of solving mazes. In this work, we focus on the
+abstractions formed by these models and find evidence for the consistent
+emergence of structured internal representations of maze topology and valid
+paths. We demonstrate this by showing that the residual stream of only a single
+token can be linearly decoded to faithfully reconstruct the entire maze. We
+also find that the learned embeddings of individual tokens have spatial
+structure. Furthermore, we take steps towards deciphering the circuitry of
+path-following by identifying attention heads (dubbed $\textit{adjacency
+heads}$), which are implicated in finding valid subsequent tokens.
+
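+ A sketch of the linear-decoding experiment described above: fit one linear
+probe per maze edge on the residual-stream activation of a single token. The
+activations and maze labels below are synthetic stand-ins, not the paper's
+trained model.
+
+    import numpy as np
+    from sklearn.linear_model import LogisticRegression
+
+    rng = np.random.default_rng(0)
+    n_samples, d_model, n_edges = 500, 128, 24   # illustrative sizes
+
+    residual_stream = rng.normal(size=(n_samples, d_model))       # stand-in activations
+    true_decoder = rng.normal(size=(d_model, n_edges))
+    maze_edges = (residual_stream @ true_decoder > 0).astype(int)  # stand-in maze structure
+
+    probes = []
+    for e in range(n_edges):            # one linear probe per edge of the maze graph
+        probes.append(LogisticRegression(max_iter=200).fit(residual_stream, maze_edges[:, e]))
+
+    accuracy = np.mean([p.score(residual_stream, maze_edges[:, e])
+                        for e, p in enumerate(probes)])
+    print(f"mean edge-reconstruction accuracy: {accuracy:.2f}")
+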
+
+ comment: 15 pages, 18 figures, 15 tables. Corresponding author: Michael + Ivanitskiy (mivanits@mines.edu). Code available at + https://github.com/understanding-search/structured-representations-maze-transformers +
+
+
+
+
+ + ☆ ULMA: Unified Language Model Alignment with Demonstration and Point-wise + Human Preference + + +
+ Language model alignment is a cutting-edge technique in large language model
+training to align the model output with the user's intent, e.g., being helpful
+and harmless. A recent alignment framework consists of two steps: supervised
+fine-tuning with demonstration data and preference learning with human
+preference data. Previous preference learning methods, such as RLHF and DPO,
+mainly focus on pair-wise preference data. However, in many real-world
+scenarios where human feedback is intrinsically point-wise, these methods
+suffer from information loss or even fail. To fill this gap, in this
+paper, we first develop a preference learning method called point-wise DPO to
+tackle point-wise preference data. Further investigation of the connection
+between supervised fine-tuning and point-wise preference learning enables us to
+develop a unified framework for both human demonstration and point-wise
+preference data, which sheds new light on the construction of preference
+datasets. Extensive experiments on point-wise datasets with binary or
+continuous labels demonstrate the superior performance and efficiency of our
+proposed methods. A new dataset with high-quality demonstration samples on
+harmlessness is constructed and made publicly available.
+
+
+
+
+
+ + ☆ On Optimal Consistency-Robustness Trade-Off for Learning-Augmented + Multi-Option Ski Rental + + +
+ The learning-augmented multi-option ski rental problem generalizes the
+classical ski rental problem in two ways: the algorithm is provided with a
+prediction on the number of days we can ski, and the ski rental options now
+come with a variety of rental periods and prices to choose from, unlike the
+classical two-option setting. Subsequent to the initial study of the
+multi-option ski rental problem (without learning augmentation) due to Zhang,
+Poon, and Xu, significant progress has recently been made on this problem. The
+problem is very well understood when we relinquish one of the two
+generalizations -- for the learning-augmented classical ski rental problem,
+algorithms giving the best-possible trade-off between consistency and
+robustness exist; for the multi-option ski rental problem without learning
+augmentation, deterministic/randomized algorithms giving the best-possible
+competitiveness have been found. However, in the presence of both
+generalizations, there remained a huge gap between the algorithmic and
+impossibility results. In fact, for randomized algorithms, we did not have any
+nontrivial lower bounds on the consistency-robustness trade-off before.
+ This paper bridges this gap for both deterministic and randomized algorithms.
+For deterministic algorithms, we present a best-possible algorithm that
+completely matches the known lower bound. For randomized algorithms, we show
+the first nontrivial lower bound on the consistency-robustness trade-off, and
+also present an improved randomized algorithm. Our algorithm matches our lower
+bound on robustness within a factor of e/2 when the consistency is at most
+1.086.
+
+
+ comment: 16 pages, 2 figures +
+
+
+
+
+ + ☆ Characterization of Locality in Spin States and Forced Moves for + Optimizations + + +
+ Ising formulations are widely utilized to solve combinatorial optimization
+problems, and a variety of quantum or semiconductor-based hardware has recently
+been made available. In combinatorial optimization problems, the existence of
+local minima in the energy landscape makes it difficult to reach the global
+minimum. We note that the aim of the optimization is not to obtain exact
+samples from the Boltzmann distribution, and there is thus no need to satisfy
+detailed balance conditions. In light of this fact, we develop an algorithm
+that escapes local minima efficiently, even though it does not yield exact
+samples. For this purpose, we utilize a feature that characterizes locality
+in the current state, which is easy to obtain with a type of specialized
+hardware. Furthermore, as the proposed algorithm is based on a rejection-free
+algorithm, the computational cost is low. In this work, after presenting the
+details of the proposed algorithm, we report the results of numerical
+experiments that demonstrate the effectiveness of the proposed feature and
+algorithm.
+
+
+ comment: 7 pages, 2 figures +
+
+
+
+
+ + ☆ Asymmetric leader-laggard cluster synchronization for collective + decision-making with laser network + + +
+ Photonic accelerators have recently attracted soaring interest, harnessing +the ultimate nature of light for information processing. Collective +decision-making with a laser network, employing the chaotic and synchronous +dynamics of optically interconnected lasers to address the competitive +multi-armed bandit (CMAB) problem, is a highly compelling approach due to its +scalability and experimental feasibility. We investigated essential network +structures for collective decision-making through quantitative stability +analysis. Moreover, we demonstrated the asymmetric preferences of players in +the CMAB problem, extending its functionality to more practical applications. +Our study highlights the capability and significance of machine learning built +upon chaotic lasers and photonic devices. + +
+
+
+
+
+ + ☆ MEMTO: Memory-guided Transformer for Multivariate Time Series Anomaly + Detection + + +
+ Detecting anomalies in real-world multivariate time series data is +challenging due to complex temporal dependencies and inter-variable +correlations. Recently, reconstruction-based deep models have been widely used +to solve the problem. However, these methods still suffer from an +over-generalization issue and fail to deliver consistently high performance. To +address this issue, we propose the MEMTO, a memory-guided Transformer using a +reconstruction-based approach. It is designed to incorporate a novel memory +module that can learn the degree to which each memory item should be updated in +response to the input data. To stabilize the training procedure, we use a +two-phase training paradigm which involves using K-means clustering for +initializing memory items. Additionally, we introduce a bi-dimensional +deviation-based detection criterion that calculates anomaly scores considering +both input space and latent space. We evaluate our proposed method on five +real-world datasets from diverse domains, and it achieves an average anomaly +detection F1-score of 95.74%, significantly outperforming the previous +state-of-the-art methods. We also conduct extensive experiments to empirically +validate the effectiveness of our proposed model's key components. + +
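+ A sketch of a bi-dimensional anomaly score in the spirit described above: an
+input-space reconstruction error combined with a latent-space deviation from
+the nearest memory item, with memory items initialised by K-means as in the
+paper's training paradigm; the exact fusion used by MEMTO is an assumption.
+
+    import numpy as np
+    from sklearn.cluster import KMeans
+
+    def anomaly_scores(x, x_hat, z, memory_items):
+        recon_err = np.mean((x - x_hat) ** 2, axis=-1)                     # input space
+        dists = np.linalg.norm(z[:, None, :] - memory_items[None, :, :], axis=-1)
+        latent_dev = dists.min(axis=1)                                     # latent space
+        return recon_err * latent_dev          # assumed fusion of the two deviations
+
+    rng = np.random.default_rng(0)
+    z_train = rng.normal(size=(1000, 8))                     # stand-in latent codes
+    memory_items = KMeans(n_clusters=10, n_init=10).fit(z_train).cluster_centers_
+
+    x = rng.normal(size=(5, 32)); x_hat = x + rng.normal(0, 0.1, size=x.shape)
+    z = rng.normal(size=(5, 8))
+    print(anomaly_scores(x, x_hat, z, memory_items))
+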
+
+
+
+
+ + ☆ MASP: Scalable GNN-based Planning for Multi-Agent Navigation + + +
+ We investigate the problem of decentralized multi-agent navigation tasks,
+where multiple agents need to reach initially unassigned targets in a limited
+time. Classical planning-based methods suffer from expensive computation
+overhead at each step and offer limited expressiveness for complex cooperation
+strategies. In contrast, reinforcement learning (RL) has recently become a
+popular paradigm for addressing this issue. However, RL struggles with low data
+efficiency and cooperation when directly exploring (nearly) optimal policies in
+the large search space, especially with an increased agent number (e.g., 10+
+agents) or in complex environments (e.g., 3D simulators). In this paper, we
+propose Multi-Agent Scalable GNN-based Planner (MASP), a goal-conditioned
+hierarchical planner for navigation tasks with a substantial number of agents.
+MASP adopts a hierarchical framework to divide a large search space into
+multiple smaller spaces, thereby reducing the space complexity and accelerating
+training convergence. We also leverage graph neural networks (GNN) to model the
+interaction between agents and goals, improving goal achievement. Besides, to
+enhance generalization capabilities in scenarios with unseen team sizes, we
+divide agents into multiple groups, each with a previously trained number of
+agents. The results demonstrate that MASP outperforms classical planning-based
+competitors and RL baselines, achieving a nearly 100% success rate with minimal
+training data in both multi-agent particle environments (MPE) with 50 agents
+and a quadrotor 3-dimensional environment (OmniDrones) with 20 agents.
+Furthermore, the learned policy showcases zero-shot generalization across
+unseen team sizes.
+
+
+ comment: Submitted to IEEE RA-L +
+
+
+
+
+ + ☆ Creative Agents: Empowering Agents with Imagination for Creative Tasks + + +
+ We study building embodied agents for open-ended creative tasks. While +existing methods build instruction-following agents that can perform diverse +open-ended tasks, none of them demonstrates creativity -- the ability to give +novel and diverse task solutions implicit in the language instructions. This +limitation comes from their inability to convert abstract language instructions +into concrete task goals in the environment and perform long-horizon planning +for such complicated goals. Given the observation that humans perform creative +tasks with the help of imagination, we propose a class of solutions for +creative agents, where the controller is enhanced with an imaginator that +generates detailed imaginations of task outcomes conditioned on language +instructions. We introduce several approaches to implementing the components of +creative agents. We implement the imaginator with either a large language model +for textual imagination or a diffusion model for visual imagination. The +controller can either be a behavior-cloning policy learned from data or a +pre-trained foundation model generating executable codes in the environment. We +benchmark creative tasks with the challenging open-world game Minecraft, where +the agents are asked to create diverse buildings given free-form language +instructions. In addition, we propose novel evaluation metrics for open-ended +creative tasks utilizing GPT-4V, which holds many advantages over existing +metrics. We perform a detailed experimental analysis of creative agents, +showing that creative agents are the first AI agents accomplishing diverse +building creation in the survival mode of Minecraft. Our benchmark and models +are open-source for future research on creative agents +(https://github.com/PKU-RL/Creative-Agents). + +
+
+ comment: The first two authors contribute equally +
+
+
+
+
+ + ☆ Simplifying Neural Network Training Under Class Imbalance NeurIPS 2023 + + +
+ Real-world datasets are often highly class-imbalanced, which can adversely +impact the performance of deep learning models. The majority of research on +training neural networks under class imbalance has focused on specialized loss +functions, sampling techniques, or two-stage training procedures. Notably, we +demonstrate that simply tuning existing components of standard deep learning +pipelines, such as the batch size, data augmentation, optimizer, and label +smoothing, can achieve state-of-the-art performance without any such +specialized class imbalance methods. We also provide key prescriptions and +considerations for training under class imbalance, and an understanding of why +imbalance methods succeed or fail. + +
+
+ comment: NeurIPS 2023. Code available at + https://github.com/ravidziv/SimplifyingImbalancedTraining +
+
+
+
+
+ + ☆ ASPEN: High-Throughput LoRA Fine-Tuning of Large Language Models with a + Single GPU + + +
+ Transformer-based large language models (LLMs) have demonstrated outstanding
+performance across diverse domains, particularly when fine-tuned for specific
+domains. Recent studies suggest that the resources required for fine-tuning
+LLMs can be economized through parameter-efficient methods such as Low-Rank
+Adaptation (LoRA). While LoRA effectively reduces computational burdens and
+resource demands, it currently supports only a single-job fine-tuning setup.
+ In this paper, we present ASPEN, a high-throughput framework for fine-tuning
+LLMs. ASPEN efficiently trains multiple jobs on a single GPU using the LoRA
+method, leveraging a shared pre-trained model and adaptive scheduling. ASPEN is
+compatible with transformer-based language models such as LLaMA and ChatGLM.
+Experiments show that ASPEN saves 53% of GPU memory when training multiple
+LLaMA-7B models on an NVIDIA A100 80GB GPU and boosts training throughput by
+about 17% compared to existing methods when training with various pre-trained
+models on different GPUs. The adaptive scheduling algorithm reduces turnaround
+time by 24% and end-to-end training latency by 12%, while prioritizing jobs and
+preventing out-of-memory issues.
+
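+ A minimal sketch of the memory-sharing idea: several fine-tuning jobs attach
+their own low-rank (LoRA) updates to one frozen, shared base weight. ASPEN's
+batching and adaptive scheduling are not shown, and the module below is an
+illustrative stand-in rather than the framework's actual implementation.
+
+    import torch
+    import torch.nn as nn
+
+    class SharedBaseMultiLoRA(nn.Module):
+        def __init__(self, d_in, d_out, n_jobs, rank=8):
+            super().__init__()
+            self.base = nn.Linear(d_in, d_out, bias=False)
+            self.base.weight.requires_grad_(False)    # shared, frozen pre-trained weight
+            self.A = nn.ParameterList(
+                [nn.Parameter(torch.randn(rank, d_in) * 0.01) for _ in range(n_jobs)])
+            self.B = nn.ParameterList(
+                [nn.Parameter(torch.zeros(d_out, rank)) for _ in range(n_jobs)])
+
+        def forward(self, x, job_id):
+            delta = self.B[job_id] @ self.A[job_id]   # low-rank update for this job
+            return self.base(x) + x @ delta.T
+
+    layer = SharedBaseMultiLoRA(d_in=64, d_out=64, n_jobs=4)
+    x = torch.randn(2, 64)
+    print(layer(x, job_id=1).shape)   # every job reuses the same base weight in memory
+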
+
+ comment: 14 pages, 14 figures +
+
+
+
+
+ + ☆ ReconU-Net: a direct PET image reconstruction using U-Net architecture + with back projection-induced skip connection + + +
+ [Objective] This study aims to introduce a novel back projection-induced +U-Net-shaped architecture, called ReconU-Net, for deep learning-based direct +positron emission tomography (PET) image reconstruction. Additionally, our +objective is to analyze the behavior of direct PET image reconstruction and +gain deeper insights by comparing the proposed ReconU-Net architecture with +other encoder-decoder architectures without skip connections. [Approach] The +proposed ReconU-Net architecture uniquely integrates the physical model of the +back projection operation into the skip connection. This distinctive feature +facilitates the effective transfer of intrinsic spatial information from the +input sinogram to the reconstructed image via an embedded physical model. The +proposed ReconU-Net was trained using Monte Carlo simulation data from the +Brainweb phantom and tested on both simulated and real Hoffman brain phantom +data. [Main results] The proposed ReconU-Net method generated a reconstructed +image with a more accurate structure compared to other deep learning-based +direct reconstruction methods. Further analysis showed that the proposed +ReconU-Net architecture has the ability to transfer features of multiple +resolutions, especially non-abstract high-resolution information, through skip +connections. Despite limited training on simulated data, the proposed +ReconU-Net successfully reconstructed the real Hoffman brain phantom, unlike +other deep learning-based direct reconstruction methods, which failed to +produce a reconstructed image. [Significance] The proposed ReconU-Net can +improve the fidelity of direct PET image reconstruction, even when dealing with +small training datasets, by leveraging the synergistic relationship between +data-driven modeling and the physics model of the imaging process. + +
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ☆ Pseudo Replay-based Class Continual Learning for Online New Category + Anomaly Detection in Additive Manufacturing + + +
+ The incorporation of advanced sensors and machine learning techniques has
+enabled modern manufacturing enterprises to perform data-driven in-situ quality
+monitoring based on the sensor data collected in manufacturing processes.
+However, one critical challenge is that newly presented defect categories may
+manifest as the manufacturing process continues, resulting in monitoring
+performance deterioration of previously trained machine learning models. Hence,
+there is an increasing need to empower machine learning models to learn
+continually. Among all continual learning methods, memory-based continual
+learning has the best performance but faces the constraints of data storage
+capacity. To address this issue, this paper develops a novel pseudo
+replay-based continual learning framework by integrating class incremental
+learning and oversampling-based data generation. Without storing all the data,
+the developed framework can generate high-quality data representing previous
+classes to train the machine learning model incrementally when a new anomaly
+category occurs. In addition, it can even enhance the monitoring performance
+since it also effectively improves the data quality. The effectiveness of the
+proposed framework is validated in an additive manufacturing process, which
+formulates anomaly detection as a supervised classification problem. The
+experimental results show that the developed method is very promising in
+detecting novel anomalies while maintaining good performance on previous tasks,
+and it provides more flexibility in the model architecture.
+
+
+
+
+
+ + ☆ Constrained Twin Variational Auto-Encoder for Intrusion Detection in IoT + Systems + + +
+ Intrusion detection systems (IDSs) play a critical role in protecting
+billions of IoT devices from malicious attacks. However, the IDSs for IoT
+devices face inherent challenges of IoT systems, including the heterogeneity of
+IoT data/devices, the high dimensionality of training data, and the imbalanced
+data. Moreover, the deployment of IDSs on IoT systems is challenging, and
+sometimes impossible, due to the limited resources such as memory/storage and
+computing capability of typical IoT devices. To tackle these challenges, this
+article proposes a novel deep neural network/architecture called Constrained
+Twin Variational Auto-Encoder (CTVAE) that can feed classifiers of IDSs with
+more separable/distinguishable and lower-dimensional representation data.
+Additionally, in comparison to the state-of-the-art neural networks used in
+IDSs, CTVAE requires less memory/storage and computing power, hence making it
+more suitable for IoT IDS systems. Extensive experiments with the 11 most
+popular IoT botnet datasets show that CTVAE can boost accuracy and F-score in
+attack detection by around 1% compared to the state-of-the-art machine learning
+and representation learning methods, whilst the running time for attack
+detection is lower than 2E-6 seconds and the model size is lower than 1 MB. We
+further investigate various characteristics of CTVAE in the latent space and in
+the reconstruction representation to demonstrate its efficacy compared with
+current well-known methods.
+
+
+
+
+
+ + ☆ RL-Based Cargo-UAV Trajectory Planning and Cell Association for Minimum + Handoffs, Disconnectivity, and Energy Consumption + + +
+ Unmanned aerial vehicle (UAV) is a promising technology for last-mile cargo +delivery. However, the limited on-board battery capacity, cellular +unreliability, and frequent handoffs in the airspace are the main obstacles to +unleash its full potential. Given that existing cellular networks were +primarily designed to service ground users, re-utilizing the same architecture +for highly mobile aerial users, e.g., cargo-UAVs, is deemed challenging. +Indeed, to ensure a safe delivery using cargo-UAVs, it is crucial to utilize +the available energy efficiently, while guaranteeing reliable connectivity for +command-and-control and avoiding frequent handoff. To achieve this goal, we +propose a novel approach for joint cargo-UAV trajectory planning and cell +association. Specifically, we formulate the cargo-UAV mission as a +multi-objective problem aiming to 1) minimize energy consumption, 2) reduce +handoff events, and 3) guarantee cellular reliability along the trajectory. We +leverage reinforcement learning (RL) to jointly optimize the cargo-UAV's +trajectory and cell association. Simulation results demonstrate a performance +improvement of our proposed method, in terms of handoffs, disconnectivity, and +energy consumption, compared to benchmarks. + +
+
+
+
+
+ + ☆ NeutronStream: A Dynamic GNN Training Framework with Sliding Window for + Graph Streams + + +
+ Existing Graph Neural Network (GNN) training frameworks have been designed to
+help developers easily create performant GNN implementations. However, most
+existing GNN frameworks assume that the input graphs are static, ignoring that
+most real-world graphs are constantly evolving. Though many dynamic GNN
+models have emerged to learn from evolving graphs, the training process of
+these dynamic GNNs is dramatically different from traditional GNNs in that it
+captures both the spatial and temporal dependencies of graph updates. This
+poses new challenges for designing dynamic GNN training frameworks. First, the
+traditional batched training method fails to capture real-time structural
+evolution information. Second, the time-dependent nature makes parallel
+training hard to design. Third, existing frameworks lack system support for
+users to efficiently implement dynamic GNNs. In this paper, we present
+NeutronStream, a framework for training dynamic GNN models. NeutronStream
+abstracts the input dynamic graph into a chronologically updated stream of
+events and processes the stream with an optimized sliding window to
+incrementally capture the spatial-temporal dependencies of events. Furthermore,
+NeutronStream provides a parallel execution engine to tackle the sequential
+event processing challenge to achieve high performance. NeutronStream also
+integrates a built-in graph storage structure that supports dynamic updates and
+provides a set of easy-to-use APIs that allow users to express their dynamic
+GNNs. Our experimental results demonstrate that, compared to state-of-the-art
+dynamic GNN implementations, NeutronStream achieves speedups ranging from 1.48X
+to 5.87X and an average accuracy improvement of 3.97%.
+
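+ A small sketch of the event-stream abstraction described above: a sliding
+window over chronologically ordered update events, yielding temporally adjacent
+batches to the trainer. The fixed window policy here is a simplification of the
+paper's optimized, adaptive window.
+
+    from collections import deque
+
+    def sliding_window_batches(events, window_size, slide):
+        """events: iterable of (timestamp, src, dst) edge-update events, already sorted."""
+        window = deque(maxlen=window_size)
+        for i, event in enumerate(events):
+            window.append(event)
+            if len(window) == window_size and (i + 1 - window_size) % slide == 0:
+                yield list(window)      # hand this window to the dynamic-GNN trainer
+
+    events = [(t, t % 5, (t * 3) % 7) for t in range(20)]     # toy event stream
+    for batch in sliding_window_batches(events, window_size=8, slide=4):
+        print(batch[0][0], "...", batch[-1][0])               # time span of each window
+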
+
+ comment: 12 pages, 15 figures +
+
+
+
+
+ + ☆ Congestion-aware Distributed Task Offloading in Wireless Multi-hop + Networks Using Graph Neural Networks ICASSP 2024 + + +
+ Computational offloading has become an enabling component for edge
+intelligence in mobile and smart devices. Existing offloading schemes mainly
+focus on mobile devices and servers, while ignoring the potential network
+congestion caused by tasks from multiple mobile devices, especially in wireless
+multi-hop networks. To fill this gap, we propose a low-overhead,
+congestion-aware distributed task offloading scheme by augmenting a distributed
+greedy framework with graph-based machine learning. In simulated wireless
+multi-hop networks with 20-110 nodes and a resource allocation scheme based on
+shortest path routing and contention-based link scheduling, our approach is
+demonstrated to be effective in reducing the congestion or unstable queues seen
+under the context-agnostic baseline, while improving the execution latency over
+local computing.
+
+
+ comment: 5 pages, 5 figures, submitted to IEEE ICASSP 2024 +
+
+
+
+
+ + ☆ Generator Born from Classifier + + +
+ In this paper, we make a bold attempt toward an ambitious task: given a +pre-trained classifier, we aim to reconstruct an image generator, without +relying on any data samples. From a black-box perspective, this challenge seems +intractable, since it inevitably involves identifying the inverse function for +a classifier, which is, by nature, an information extraction process. As such, +we resort to leveraging the knowledge encapsulated within the parameters of the +neural network. Grounded on the theory of Maximum-Margin Bias of gradient +descent, we propose a novel learning paradigm, in which the generator is +trained to ensure that the convergence conditions of the network parameters are +satisfied over the generated distribution of the samples. Empirical validation +from various image generation tasks substantiates the efficacy of our strategy. + +
+
+
+
+
+ + ☆ Learning Energy-based Model via Dual-MCMC Teaching + + +
+ This paper studies the fundamental learning problem of the energy-based model +(EBM). Learning the EBM can be achieved using the maximum likelihood estimation +(MLE), which typically involves the Markov Chain Monte Carlo (MCMC) sampling, +such as the Langevin dynamics. However, the noise-initialized Langevin dynamics +can be challenging in practice and hard to mix. This motivates the exploration +of joint training with the generator model where the generator model serves as +a complementary model to bypass MCMC sampling. However, such a method can be +less accurate than the MCMC and result in biased EBM learning. While the +generator can also serve as an initializer model for better MCMC sampling, its +learning can be biased since it only matches the EBM and has no access to +empirical training examples. Such biased generator learning may limit the +potential of learning the EBM. To address this issue, we present a joint +learning framework that interweaves the maximum likelihood learning algorithm +for both the EBM and the complementary generator model. In particular, the +generator model is learned by MLE to match both the EBM and the empirical data +distribution, making it a more informative initializer for MCMC sampling of +EBM. Learning generator with observed examples typically requires inference of +the generator posterior. To ensure accurate and efficient inference, we adopt +the MCMC posterior sampling and introduce a complementary inference model to +initialize such latent MCMC sampling. We show that three separate models can be +seamlessly integrated into our joint framework through two (dual-) MCMC +teaching, enabling effective and efficient EBM learning. + +
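+ A short sketch of the generator-as-initializer idea described above: a few
+Langevin steps on the EBM's energy starting from a generator sample rather than
+from pure noise. The networks and step sizes are toy stand-ins.
+
+    import torch
+    import torch.nn as nn
+
+    energy = nn.Sequential(nn.Linear(2, 64), nn.SiLU(), nn.Linear(64, 1))     # EBM E(x)
+    generator = nn.Sequential(nn.Linear(8, 64), nn.SiLU(), nn.Linear(64, 2))
+
+    def langevin_from_generator(n, steps=30, step_size=0.01):
+        z = torch.randn(n, 8)
+        x = generator(z).detach()                 # informative initialisation
+        for _ in range(steps):
+            x.requires_grad_(True)
+            grad = torch.autograd.grad(energy(x).sum(), x)[0]
+            x = (x - 0.5 * step_size * grad
+                 + torch.randn_like(x) * step_size ** 0.5).detach()
+        return x
+
+    samples = langevin_from_generator(16)
+    print(samples.shape)
+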
+
+
+
+
+ + ☆ Dimensionality Reduction and Dynamical Mode Recognition of Circular + Arrays of Flame Oscillators Using Deep Neural Network + + +
+ Oscillatory combustion in aero engines and modern gas turbines often has +significant adverse effects on their operation, and accurately recognizing +various oscillation modes is the prerequisite for understanding and controlling +combustion instability. However, the high-dimensional spatial-temporal data of +a complex combustion system typically poses considerable challenges to the +dynamical mode recognition. Based on a two-layer bidirectional long short-term +memory variational autoencoder (Bi-LSTM-VAE) dimensionality reduction model and +a two-dimensional Wasserstein distance-based classifier (WDC), this study +proposes a promising method (Bi-LSTM-VAE-WDC) for recognizing dynamical modes +in oscillatory combustion systems. Specifically, the Bi-LSTM-VAE dimension +reduction model was introduced to reduce the high-dimensional spatial-temporal +data of the combustion system to a low-dimensional phase space; Gaussian kernel +density estimates (GKDE) were computed based on the distribution of phase +points in a grid; two-dimensional WD values were calculated from the GKDE maps +to recognize the oscillation modes. The time-series data used in this study +were obtained from numerical simulations of circular arrays of laminar flame +oscillators. The results show that the novel Bi-LSTM-VAE method can produce a +non-overlapping distribution of phase points, indicating an effective +unsupervised mode recognition and classification. Furthermore, the present +method exhibits a more prominent performance than VAE and PCA (principal +component analysis) for distinguishing dynamical modes in complex flame +systems, implying its potential in studying turbulent combustion. + +
+
+ comment: 17 pages, 10 figures, research paper +
+
+
+
+
+ + ☆ GIT-Net: Generalized Integral Transform for Operator Learning + + +
+ This article introduces GIT-Net, a deep neural network architecture for
+approximating Partial Differential Equation (PDE) operators, inspired by
+integral transform operators. GIT-Net harnesses the fact that differential
+operators commonly used for defining PDEs can often be represented
+parsimoniously when expressed in specialized functional bases (e.g., Fourier
+basis). Unlike rigid integral transforms, GIT-Net parametrizes adaptive
+generalized integral transforms with deep neural networks. When compared to
+several recently proposed alternatives, GIT-Net's computational and memory
+requirements scale gracefully with mesh discretizations, facilitating its
+application to PDE problems on complex geometries. Numerical experiments
+demonstrate that GIT-Net is a competitive neural network operator, exhibiting
+small test errors and low evaluation costs across a range of PDE problems. This
+stands in contrast to existing neural network operators, which typically excel
+in just one of these areas.
+
+
+
+
+
+ + ☆ Adaptive Instrument Design for Indirect Experiments + + +
+ Indirect experiments provide a valuable framework for estimating treatment +effects in situations where conducting randomized control trials (RCTs) is +impractical or unethical. Unlike RCTs, indirect experiments estimate treatment +effects by leveraging (conditional) instrumental variables, enabling estimation +through encouragement and recommendation rather than strict treatment +assignment. However, the sample efficiency of such estimators depends not only +on the inherent variability in outcomes but also on the varying compliance +levels of users with the instrumental variables and the choice of estimator +being used, especially when dealing with numerous instrumental variables. While +adaptive experiment design has a rich literature for direct experiments, in +this paper we take the initial steps towards enhancing sample efficiency for +indirect experiments by adaptively designing a data collection policy over +instrumental variables. Our main contribution is a practical computational +procedure that utilizes influence functions to search for an optimal data +collection policy, minimizing the mean-squared error of the desired +(non-linear) estimator. Through experiments conducted in various domains +inspired by real-world applications, we showcase how our method can +significantly improve the sample efficiency of indirect experiments. + +
+
+
+
+
+ + ☆ PEFA: Parameter-Free Adapters for Large-scale Embedding-based Retrieval + Models WSDM 2024 + + +
+ Embedding-based Retrieval Models (ERMs) have emerged as a promising framework
+for large-scale text retrieval problems due to powerful large language models.
+Nevertheless, fine-tuning ERMs to reach state-of-the-art results can be
+expensive due to the extreme scale of data as well as the complexity of
+multi-stage pipelines (e.g., pre-training, fine-tuning, distillation). In this
+work, we propose the PEFA framework, namely ParamEter-Free Adapters, for fast
+tuning of ERMs without any backward pass in the optimization. At the index
+building stage, PEFA equips the ERM with a non-parametric k-nearest neighbor
+(kNN) component. At the inference stage, PEFA performs a convex combination of
+two scoring functions, one from the ERM and the other from the kNN. Based on
+the neighborhood definition, the PEFA framework induces two realizations,
+namely PEFA-XL (i.e., extra large) using double ANN indices and PEFA-XS (i.e.,
+extra small) using a single ANN index. Empirically, PEFA achieves significant
+improvement on two retrieval applications. For document retrieval, in terms of
+the Recall@100 metric, PEFA improves not only pre-trained ERMs on Trivia-QA by
+an average of 13.2%, but also fine-tuned ERMs on NQ-320K by an average of 5.5%.
+For product search, PEFA improves the Recall@100 of the fine-tuned ERMs by an
+average of 5.3% and 14.5%, for PEFA-XS and PEFA-XL, respectively. Our code is
+available at https://github.com/amzn/pecos/tree/mainline/examples/pefa-wsdm24
+
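+ A sketch of the inference-time scoring described above: a convex combination
+of the ERM's similarity score and a non-parametric kNN score built over an
+index of training queries. The particular kNN construction and the weights
+below are assumptions, not the exact PEFA-XS/XL definitions.
+
+    import numpy as np
+
+    def pefa_scores(query_emb, doc_embs, train_query_embs, train_labels, k=5, lam=0.5):
+        erm_score = doc_embs @ query_emb                      # ERM dot-product relevance
+        # kNN component: find the k most similar *training* queries, then vote for the
+        # documents they were labelled relevant to.
+        sims = train_query_embs @ query_emb
+        neighbors = np.argsort(-sims)[:k]
+        knn_score = np.zeros(len(doc_embs))
+        for n in neighbors:
+            knn_score[train_labels[n]] += sims[n]             # similarity mass to its doc
+        return lam * erm_score + (1.0 - lam) * knn_score      # convex combination
+
+    rng = np.random.default_rng(0)
+    doc_embs = rng.normal(size=(100, 16))
+    train_query_embs = rng.normal(size=(500, 16))
+    train_labels = rng.integers(0, 100, size=500)   # relevant doc id per training query
+    query = rng.normal(size=16)
+    print(np.argsort(-pefa_scores(query, doc_embs, train_query_embs, train_labels))[:10])
+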
+
+ comment: Accept by WSDM 2024 +
+
+
+
+
+ + ☆ Decoding Data Quality via Synthetic Corruptions: Embedding-guided + Pruning of Code Data SP + + +
+ Code datasets, often collected from diverse and uncontrolled sources such as +GitHub, potentially suffer from quality issues, thereby affecting the +performance and training efficiency of Large Language Models (LLMs) optimized +for code generation. Previous studies demonstrated the benefit of using +embedding spaces for data pruning, but they mainly focused on duplicate removal +or increasing variety, and in other modalities, such as images. Our work +focuses on using embeddings to identify and remove "low-quality" code data. +First, we explore features of "low-quality" code in embedding space, through +the use of synthetic corruptions. Armed with this knowledge, we devise novel +pruning metrics that operate in embedding space to identify and remove +low-quality entries in the Stack dataset. We demonstrate the benefits of this +synthetic corruption informed pruning (SCIP) approach on the well-established +HumanEval and MBPP benchmarks, outperforming existing embedding-based methods. +Importantly, we achieve up to a 3% performance improvement over no pruning, +thereby showing the promise of insights from synthetic corruptions for data +pruning. + +
+
+ comment: 12 pages, 4 figures, Oral Presentation at 3rd Workshop on Efficient + Natural Language and Speech Processing (ENLSP-III), NeurIPS 2023 +
+
+
+
+
+ + ☆ Towards Fast and Stable Federated Learning: Confronting Heterogeneity + via Knowledge Anchor ACM MM23 + + +
+ Federated learning encounters a critical challenge of data heterogeneity, +adversely affecting the performance and convergence of the federated model. +Various approaches have been proposed to address this issue, yet their +effectiveness is still limited. Recent studies have revealed that the federated +model suffers severe forgetting in local training, leading to global forgetting +and performance degradation. Although the analysis provides valuable insights, +a comprehensive understanding of the vulnerable classes and their impact +factors is yet to be established. In this paper, we aim to bridge this gap by +systematically analyzing the forgetting degree of each class during local +training across different communication rounds. Our observations are: (1) Both +missing and non-dominant classes suffer similar severe forgetting during local +training, while dominant classes show improvement in performance. (2) When +dynamically reducing the sample size of a dominant class, catastrophic +forgetting occurs abruptly when the proportion of its samples is below a +certain threshold, indicating that the local model struggles to leverage a few +samples of a specific class effectively to prevent forgetting. Motivated by +these findings, we propose a novel and straightforward algorithm called +Federated Knowledge Anchor (FedKA). Assuming that all clients have a single +shared sample for each class, the knowledge anchor is constructed before each +local training stage by extracting shared samples for missing classes and +randomly selecting one sample per class for non-dominant classes. The knowledge +anchor is then utilized to correct the gradient of each mini-batch towards the +direction of preserving the knowledge of the missing and non-dominant classes. +Extensive experimental results demonstrate that our proposed FedKA achieves +fast and stable convergence, significantly improving accuracy on popular +benchmarks. + +
+
+ comment: Published in ACM MM23 +
+
+
+
+
+ + ☆ Robust Clustering using Hyperdimensional Computing + + +
+ This paper addresses the clustering of data in the hyperdimensional computing +(HDC) domain. In prior work, an HDC-based clustering framework, referred to as +HDCluster, has been proposed. However, the performance of the existing +HDCluster is not robust. The performance of HDCluster is degraded as the +hypervectors for the clusters are chosen at random during the initialization +step. To overcome this bottleneck, we assign the initial cluster hypervectors +by exploring the similarity of the encoded data, referred to as \textit{query} +hypervectors. Intra-cluster hypervectors have a higher similarity than +inter-cluster hypervectors. Harnessing the similarity results among query +hypervectors, this paper proposes four HDC-based clustering algorithms: +similarity-based k-means, equal bin-width histogram, equal bin-height +histogram, and similarity-based affinity propagation. Experimental results +illustrate that: (i) Compared to the existing HDCluster, our proposed HDC-based +clustering algorithms can achieve better accuracy, more robust performance, +fewer iterations, and less execution time. Similarity-based affinity +propagation outperforms the other three HDC-based clustering algorithms on +eight datasets by 2~38% in clustering accuracy. (ii) Even for one-pass +clustering, i.e., without any iterative update of the cluster hypervectors, our +proposed algorithms can provide more robust clustering accuracy than HDCluster. +(iii) Over eight datasets, five out of eight can achieve higher or comparable +accuracy when projected onto the hyperdimensional space. Traditional clustering +is more desirable than HDC when the number of clusters, $k$, is large. + +
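+ A sketch of the similarity-based initialization described above: choose the
+initial cluster hypervectors from the encoded query hypervectors themselves,
+preferring mutually dissimilar ones, instead of drawing them at random. The
+Hamming similarity and farthest-first selection rule are illustrative
+assumptions.
+
+    import numpy as np
+
+    def hamming_similarity(a, b):
+        return np.mean(a == b, axis=-1)
+
+    def init_cluster_hypervectors(queries, k, rng):
+        centers = [queries[rng.integers(len(queries))]]
+        while len(centers) < k:
+            # pick the query hypervector least similar to its closest existing centre
+            sims = np.max([hamming_similarity(queries, c) for c in centers], axis=0)
+            centers.append(queries[np.argmin(sims)])
+        return np.stack(centers)
+
+    rng = np.random.default_rng(0)
+    queries = rng.integers(0, 2, size=(200, 10000))   # 200 encoded hypervectors, D = 10,000
+    centers = init_cluster_hypervectors(queries, k=5, rng=rng)
+    print(centers.shape)
+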
+
+
+
+
+ + ☆ Efficient Online Data Mixing For Language Model Pre-Training + + +
+ The data used to pretrain large language models has a decisive impact on a +model's downstream performance, which has led to a large body of work on data +selection methods that aim to automatically determine the most suitable data to +use for pretraining. Existing data selection methods suffer from slow and +computationally expensive processes, a problem amplified by the increasing size +of models and of pretraining datasets. Data mixing, on the other hand, reduces +the complexity of data selection by grouping data points together and +determining sampling probabilities across entire groups. However, data mixing +proportions are typically fixed before training and therefore cannot adapt to +changing training dynamics. To address these limitations, we develop an +efficient algorithm for Online Data Mixing (ODM) that combines elements from +both data selection and data mixing. Based on multi-armed bandit algorithms, +our online approach optimizes the data mixing proportions during training. +Remarkably, our method trains a model that reaches the final perplexity of the +next best method with 19\% fewer training iterations, and improves performance +on the 5-shot MMLU benchmark by 1.9% relative accuracy, while adding negligible +wall-clock time during pretraining. + +
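+ A sketch of the bandit view described above: maintain weights over data
+domains and update them from the observed training loss with an EXP3-style
+rule, so the mixing proportions adapt during training. The reward definition,
+domains, and constants are assumptions, not ODM's exact formulation.
+
+    import numpy as np
+
+    rng = np.random.default_rng(0)
+    domains = ["web", "code", "books", "wiki"]
+    weights = np.ones(len(domains))
+    eta, eps = 0.1, 0.05
+
+    def mixing_probs(w):
+        return (1 - eps) * w / w.sum() + eps / len(w)   # exploration-smoothed mixture
+
+    def train_step_on(domain):                          # placeholder: returns a loss value
+        return {"web": 2.5, "code": 3.0, "books": 2.2, "wiki": 2.0}[domain] + rng.normal(0, 0.1)
+
+    for step in range(1000):
+        p = mixing_probs(weights)
+        i = rng.choice(len(domains), p=p)
+        reward = train_step_on(domains[i])               # higher loss = more left to learn
+        weights[i] *= np.exp(eta * reward / p[i] / 1000) # importance-weighted EXP3 update
+    print(dict(zip(domains, np.round(mixing_probs(weights), 3))))
+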
+
+
+
+
+ + ☆ Harmonizing Global Voices: Culturally-Aware Models for Enhanced Content + Moderation + + +
+ Content moderation at scale faces the challenge of considering local cultural
+distinctions when assessing content. While global policies aim to maintain
+decision-making consistency and prevent arbitrary rule enforcement, they often
+overlook regional variations in interpreting natural language as expressed in
+content. In this study, we examine how moderation systems can tackle this issue
+by adapting to local comprehension nuances. We train large language models on
+extensive datasets of media news and articles to create culturally attuned
+models. These models aim to capture the nuances of communication across
+geographies with the goal of recognizing cultural and societal variations in
+what is considered offensive content. We further explore the capability of
+these models to generate explanations for instances of content violation,
+aiming to shed light on how policy guidelines are perceived when cultural and
+societal contexts change. We find that training on extensive media datasets
+successfully induces cultural awareness and improves the handling of content
+violations on a regional basis. Additionally, these advancements include the
+ability to provide explanations that align with the specific local norms and
+nuances, as evidenced by the annotators' preferences in our study. This
+multifaceted success reinforces the critical role of an adaptable content
+moderation approach in keeping pace with the ever-evolving nature of the
+content it oversees.
+
+
+
+ comment: 12 pages, 8 Figures. Supplementary material +
+
+
+
+
+ + ☆ Auto DP-SGD: Dual Improvements of Privacy and Accuracy via Automatic + Clipping Threshold and Noise Multiplier Estimation + + +
+ DP-SGD has emerged as a popular method to protect personally identifiable
+information in deep learning applications. Unfortunately, DP-SGD's per-sample
+gradient clipping and uniform noise addition during training can significantly
+degrade model utility. To enhance the model's utility, researchers proposed
+various adaptive DP-SGD methods. However, we examine and discover that these
+techniques either result in greater privacy leakage or lower accuracy than the
+traditional DP-SGD method, or lack evaluation on a complex dataset such as
+CIFAR100. To address these limitations, we propose Auto DP-SGD. Our method
+automates clipping threshold estimation based on the DL model's gradient norm
+and scales the gradients of each training sample without losing gradient
+information. This helps to improve the algorithm's utility while using a
+smaller privacy budget. To further improve accuracy, we introduce automatic
+noise multiplier decay mechanisms to decrease the noise multiplier after every
+epoch. Finally, we develop closed-form mathematical expressions using the tCDP
+accountant for automatic noise multiplier and automatic clipping threshold
+estimation. Through extensive experimentation, we demonstrate that Auto DP-SGD
+outperforms existing SOTA DP-SGD methods in privacy and accuracy on various
+benchmark datasets. We also show that privacy can be improved by lowering the
+scale factor and using learning rate schedulers without significantly reducing
+accuracy. Specifically, Auto DP-SGD, when used with a step noise multiplier,
+improves accuracy by 3.20, 1.57, 6.73, and 1.42 for the MNIST, CIFAR10,
+CIFAR100, and AG News Corpus datasets, respectively. Furthermore, it obtains a
+substantial reduction in the privacy budget of 94.9, 79.16, 67.36, and 53.37
+for the corresponding datasets.
+
+
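A hedged sketch of the two automatic knobs described above: pick the clipping threshold from the gradient-norm statistics of the current batch and decay the noise multiplier after every epoch. The median statistic and the multiplicative decay used below are assumptions, not the paper's closed-form expressions:

```python
import numpy as np

def auto_dpsgd_step(per_sample_grads, noise_multiplier):
    """One DP-SGD update with an automatically chosen clipping threshold.
    per_sample_grads: array of shape (batch, dim).  Sketch only."""
    norms = np.linalg.norm(per_sample_grads, axis=1)
    clip = np.median(norms)                       # assumed: threshold from batch statistics
    scale = np.minimum(1.0, clip / (norms + 1e-12))
    clipped = per_sample_grads * scale[:, None]   # scale, never zero out, gradients
    noise = np.random.normal(0.0, noise_multiplier * clip,
                             size=per_sample_grads.shape[1])
    return clipped.mean(axis=0) + noise / len(per_sample_grads)

def step_decay(noise0, epoch, rate=0.95):
    """Assumed multiplicative decay of the noise multiplier after each epoch."""
    return noise0 * rate ** epoch
```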
+
+ comment: 25 pages single column, 2 figures +
+
+
+
+
+ + ☆ FERGI: Automatic Annotation of User Preferences for Text-to-Image + Generation from Spontaneous Facial Expression Reaction + + +
+ Researchers have proposed to use data of human preference feedback to +fine-tune text-to-image generative models. However, the scalability of human +feedback collection has been limited by its reliance on manual annotation. +Therefore, we develop and test a method to automatically annotate user +preferences from their spontaneous facial expression reaction to the generated +images. We collect a dataset of Facial Expression Reaction to Generated Images +(FERGI) and show that the activations of multiple facial action units (AUs) are +highly correlated with user evaluations of the generated images. Specifically, +AU4 (brow lowerer) is most consistently reflective of negative evaluations of +the generated image. This can be useful in two ways. Firstly, we can +automatically annotate user preferences between image pairs with substantial +difference in AU4 responses to them with an accuracy significantly +outperforming state-of-the-art scoring models. Secondly, directly integrating +the AU4 responses with the scoring models improves their consistency with human +preferences. Additionally, the AU4 response best reflects the user's evaluation +of the image fidelity, making it complementary to the state-of-the-art scoring +models, which are generally better at reflecting image-text alignment. Finally, +this method of automatic annotation with facial expression analysis can be +potentially generalized to other generation tasks. The code is available at +https://github.com/ShuangquanFeng/FERGI, and the dataset is also available at +the same link for research purposes. + +
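The pairwise annotation rule implied above can be written as a tiny decision function: compare the AU4 responses to the two images and label the one that elicited the weaker brow-lowerer response as preferred, abstaining when the gap is small. The threshold and the names are illustrative assumptions:

```python
def annotate_preference(au4_response_a, au4_response_b, min_gap=0.2):
    """Return 'A', 'B', or None (abstain) from spontaneous AU4 reactions.
    Stronger AU4 activation is taken as a more negative evaluation."""
    gap = au4_response_a - au4_response_b
    if abs(gap) < min_gap:
        return None            # difference too small to annotate reliably
    return 'B' if gap > 0 else 'A'

print(annotate_preference(0.8, 0.1))   # -> 'B'
```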
+
+
+
+
+ + ☆ Data-Driven Traffic Reconstruction and Kernel Methods for Identifying + Stop-and-Go Congestion NeurIPS 2023 + + +
+ Identifying stop-and-go events (SAGs) in traffic flow presents an important
+avenue for advancing data-driven research for climate change mitigation and
+sustainability, owing to their substantial impact on carbon emissions, travel
+time, fuel consumption, and roadway safety. In fact, SAGs are estimated to
+account for 33-50% of highway driving externalities. However, insufficient
+attention has been paid to precisely quantifying where, when, and how much
+these SAGs take place, which is necessary for downstream decision making such
+as intervention design and policy analysis. A key challenge is that the data
+available to researchers and governments are typically sparse and aggregated to
+a granularity that obscures SAGs. To overcome such data limitations, this study
+explores the use of traffic reconstruction techniques for SAG identification.
+In particular, we introduce a kernel-based method for identifying
+spatio-temporal features in traffic and leverage bootstrapping to quantify the
+uncertainty of the reconstruction process. Experimental results on California
+highway data demonstrate the promise of the method for capturing SAGs. This
+work contributes to a foundation for data-driven decision making to advance the
+sustainability of traffic systems.
+
+
+
+ comment: Presented at NeurIPS 2023 workshops: Tackling Climate Change with + Machine Learning & Computational Sustainability +
+
+
+
+
+ + ☆ CaloQVAE : Simulating high-energy particle-calorimeter interactions + using hybrid quantum-classical generative models + + +
+ The Large Hadron Collider's high luminosity era presents major computational
+challenges in the analysis of collision events. Large amounts of Monte Carlo
+(MC) simulation will be required to constrain the statistical uncertainties of
+the simulated datasets below those of the experimental data. Modelling of
+high-energy particles propagating through the calorimeter section of the
+detector is the most computationally intensive MC simulation task. We introduce
+a technique combining recent advancements in generative models and quantum
+annealing for fast and efficient simulation of high-energy particle-calorimeter
+interactions.
+
+
+
+ comment: 6 pages, 3 figures +
+
+
+
+
+ + ☆ Using Curiosity for an Even Representation of Tasks in Continual Offline + Reinforcement Learning + + +
+ In this work, we investigate the means of using curiosity on replay buffers
+to improve offline multi-task continual reinforcement learning when tasks,
+which are defined by the non-stationarity in the environment, are unlabeled and
+not evenly exposed to the learner over time. In particular, we investigate the
+use of curiosity both as a tool for task boundary detection and as a priority
+metric when it comes to retaining old transition tuples, which we respectively
+use to propose two different buffers. Firstly, we propose a Hybrid Reservoir
+Buffer with Task Separation (HRBTS), where curiosity is used to detect task
+boundaries that are not known due to the task-agnostic nature of the problem.
+Secondly, by using curiosity as a priority metric when it comes to retaining
+old transition tuples, a Hybrid Curious Buffer (HCB) is proposed. We ultimately
+show that these buffers, in conjunction with regular reinforcement learning
+algorithms, can be used to alleviate the catastrophic forgetting issue suffered
+by state-of-the-art replay buffers when the agent's exposure to tasks is uneven
+over time. We evaluate catastrophic forgetting and the efficiency of our
+proposed buffers against the latest works such as the Hybrid Reservoir Buffer
+(HRB) and the Multi-Time Scale Replay Buffer (MTR) in three different continual
+reinforcement learning settings. Experiments were done on classical control
+tasks and the Metaworld environment. Experiments show that our proposed replay
+buffers display better immunity to catastrophic forgetting compared to existing
+works in most of the settings.
+
+
+
+
+
+
+ + ☆ Active Learning for Abrupt Shifts Change-point Detection via + Derivative-Aware Gaussian Processes + + +
+ Change-point detection (CPD) is crucial for identifying abrupt shifts in
+data, which influence decision-making and efficient resource allocation across
+various domains. To address the challenges posed by the costly and
+time-intensive data acquisition in CPD, we introduce the Derivative-Aware
+Change Detection (DACD) method. It leverages the derivative process of a
+Gaussian process (GP) for Active Learning (AL), aiming to pinpoint change-point
+locations effectively. DACD balances the exploitation and exploration of
+derivative processes through multiple data acquisition functions (AFs). By
+utilizing the GP derivative mean and variance as criteria, DACD sequentially
+selects the next sampling data point, thus enhancing algorithmic efficiency and
+ensuring reliable and accurate results. We investigate the effectiveness of the
+DACD method in diverse scenarios and show that it outperforms other active
+learning change-point detection approaches.
+
+
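A rough sketch of the acquisition idea, assuming an RBF Gaussian process and a finite-difference estimate of the derivative statistics from posterior samples (scikit-learn API; the specific acquisition score below is one plausible reading, not the paper's exact AFs): query next where the estimated derivative magnitude and its uncertainty are jointly large.

```python
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

def next_query(x_train, y_train, grid, kappa=1.0, n_draws=64):
    """Pick the next x to label: large |derivative| (likely change-point)
    plus large derivative uncertainty (exploration).  Illustrative sketch."""
    gp = GaussianProcessRegressor(RBF(length_scale=0.1), alpha=1e-2)
    gp.fit(x_train.reshape(-1, 1), y_train)
    draws = gp.sample_y(grid.reshape(-1, 1), n_samples=n_draws)  # (len(grid), n_draws)
    d = np.gradient(draws, grid, axis=0)   # finite-difference derivative of each draw
    score = np.abs(d.mean(axis=1)) + kappa * d.std(axis=1)
    return grid[np.argmax(score)]

# toy signal with an abrupt shift at x = 0.6
x = np.sort(np.random.rand(15))
y = np.where(x < 0.6, 0.0, 1.0) + 0.05 * np.random.randn(15)
print(next_query(x, y, np.linspace(0, 1, 200)))
```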
+
+
+
+
+ + ♻ ☆ Geometry-Aware Normalizing Wasserstein Flows for Optimal Causal + Inference + + +
+ This manuscript enriches the framework of continuous normalizing flows (CNFs) +within causal inference, primarily to augment the geometric properties of +parametric submodels used in targeted maximum likelihood estimation (TMLE). By +introducing an innovative application of CNFs, we construct a refined series of +parametric submodels that enable a directed interpolation between the prior +distribution $p_0$ and the empirical distribution $p_1$. This proposed +methodology serves to optimize the semiparametric efficiency bound in causal +inference by orchestrating CNFs to align with Wasserstein gradient flows. Our +approach not only endeavors to minimize the mean squared error in the +estimation but also imbues the estimators with geometric sophistication, +thereby enhancing robustness against misspecification. This robustness is +crucial, as it alleviates the dependence on the standard $n^{\frac{1}{4}}$ rate +for a doubly-robust perturbation direction in TMLE. By incorporating robust +optimization principles and differential geometry into the estimators, the +developed geometry-aware CNFs represent a significant advancement in the +pursuit of doubly robust causal inference. + +
+
+
+
+
+ + ♻ ☆ Harnessing Discrete Representations For Continual Reinforcement Learning ICLR 2024 + + +
+ Reinforcement learning (RL) agents make decisions using nothing but +observations from the environment, and consequently, heavily rely on the +representations of those observations. Though some recent breakthroughs have +used vector-based categorical representations of observations, often referred +to as discrete representations, there is little work explicitly assessing the +significance of such a choice. In this work, we provide a thorough empirical +investigation of the advantages of representing observations as vectors of +categorical values within the context of reinforcement learning. We perform +evaluations on world-model learning, model-free RL, and ultimately continual RL +problems, where the benefits best align with the needs of the problem setting. +We find that, when compared to traditional continuous representations, world +models learned over discrete representations accurately model more of the world +with less capacity, and that agents trained with discrete representations learn +better policies with less data. In the context of continual RL, these benefits +translate into faster adapting agents. Additionally, our analysis suggests that +the observed performance improvements can be attributed to the information +contained within the latent vectors and potentially the encoding of the +discrete representation itself. + +
+
+ comment: 23 pages, 16 figures, submitted to ICLR 2024 +
+
+
+
+
+ + ♻ ☆ Jellyfish: A Large Language Model for Data Preprocessing + + +
+ In this paper, we present Jellyfish, an open-source LLM that serves as a
+universal task solver for data preprocessing (DP). Built on the Llama 2 13B
+model, Jellyfish is instruction-tuned with the datasets of several typical DP
+tasks including error detection, data imputation, schema matching, and entity
+matching, and delivers generalizability to other tasks. Remarkably, with its 13
+billion parameters, Jellyfish can operate on a single, low-priced local GPU,
+ensuring data security and enabling further tuning. Its proficiency in
+understanding natural language allows users to manually craft instructions for
+DP tasks. Unlike many existing methods that heavily rely on prior knowledge,
+Jellyfish acquires domain knowledge during its tuning process and integrates
+optional knowledge injection during inference. A distinctive feature of
+Jellyfish is its interpreter, which elucidates its output decisions. To
+construct Jellyfish, we develop a series of pre-tuning and DP-tuning
+techniques. Jellyfish is equipped with an instance serializer, which
+automatically translates raw data into model prompts, and a knowledge injector,
+which optionally introduces task- and dataset-specific knowledge to enhance DP
+performance. Our evaluation of Jellyfish, using a range of real datasets, shows
+its competitiveness compared to state-of-the-art methods and its strong
+generalizability to unseen tasks. Jellyfish's performance rivals that of GPT
+series models, and its interpreter offers enhanced reasoning capabilities
+compared to GPT-3.5. Furthermore, our evaluation highlights the effectiveness
+of the techniques employed in constructing Jellyfish. Our model is available at
+Hugging Face: https://huggingface.co/NECOUDBFM/Jellyfish .
+
+
+
+
+
+
+ + ♻ ☆ Uncertainty Quantification in Multivariable Regression for Material + Property Prediction with Bayesian Neural Networks + + +
+ With the increased use of data-driven approaches and machine learning-based +methods in material science, the importance of reliable uncertainty +quantification (UQ) of the predicted variables for informed decision-making +cannot be overstated. UQ in material property prediction poses unique +challenges, including the multi-scale and multi-physics nature of advanced +materials, intricate interactions between numerous factors, limited +availability of large curated datasets for model training, etc. Recently, +Bayesian Neural Networks (BNNs) have emerged as a promising approach for UQ, +offering a probabilistic framework for capturing uncertainties within neural +networks. In this work, we introduce an approach for UQ within physics-informed +BNNs, which integrates knowledge from governing laws in material modeling to +guide the models toward physically consistent predictions. To evaluate the +effectiveness of this approach, we present case studies for predicting the +creep rupture life of steel alloys. Experimental validation with three datasets +of collected measurements from creep tests demonstrates the ability of BNNs to +produce accurate point and uncertainty estimates that are competitive or exceed +the performance of the conventional method of Gaussian Process Regression. +Similarly, we evaluated the suitability of BNNs for UQ in an active learning +application and reported competitive performance. The most promising framework +for creep life prediction is BNNs based on Markov Chain Monte Carlo +approximation of the posterior distribution of network parameters, as it +provided more reliable results in comparison to BNNs based on variational +inference approximation or related NNs with probabilistic outputs. The codes +are available at: +https://github.com/avakanski/Creep-uncertainty-quantification. + +
+
+ comment: 24 pages, 4 figures +
+
+
+
+
+ + ♻ ☆ Variability of echo state network prediction horizon for partially + observed dynamical systems + + +
+ Study of dynamical systems using partial state observation is an important
+problem due to its applicability to many real-world systems. We address the
+problem by studying an echo state network (ESN) framework with partial state
+input and partial or full state output. Application to the Lorenz system and
+Chua's oscillator (both numerically simulated and experimental systems)
+demonstrates the effectiveness of our method. We show that the ESN, as an
+autonomous dynamical system, is capable of making short-term predictions up to
+a few Lyapunov times. However, the prediction horizon has high variability
+depending on the initial condition, an aspect that we explore in detail using
+the distribution of the prediction horizon. Further, by comparing the long-term
+dynamics of the ESN predictions with the numerically simulated or experimental
+dynamics using a variety of statistical metrics, we show that the ESN can
+effectively learn the system's dynamics even when trained with noisy numerical
+or experimental datasets. Thus, we demonstrate the potential of ESNs to serve
+as cheap surrogate models for simulating the dynamics of systems where complete
+observations are unavailable.
+
+
+
+
+
+
+ + ♻ ☆ End-to-End Meta-Bayesian Optimisation with Transformer Neural Processes + + +
+ Meta-Bayesian optimisation (meta-BO) aims to improve the sample efficiency of +Bayesian optimisation by leveraging data from related tasks. While previous +methods successfully meta-learn either a surrogate model or an acquisition +function independently, joint training of both components remains an open +challenge. This paper proposes the first end-to-end differentiable meta-BO +framework that generalises neural processes to learn acquisition functions via +transformer architectures. We enable this end-to-end framework with +reinforcement learning (RL) to tackle the lack of labelled acquisition data. +Early on, we notice that training transformer-based neural processes from +scratch with RL is challenging due to insufficient supervision, especially when +rewards are sparse. We formalise this claim with a combinatorial analysis +showing that the widely used notion of regret as a reward signal exhibits a +logarithmic sparsity pattern in trajectory lengths. To tackle this problem, we +augment the RL objective with an auxiliary task that guides part of the +architecture to learn a valid probabilistic model as an inductive bias. We +demonstrate that our method achieves state-of-the-art regret results against +various baselines in experiments on standard hyperparameter optimisation tasks +and also outperforms others in the real-world problems of mixed-integer +programming tuning, antibody design, and logic synthesis for electronic design +automation. + +
+
+
+
+
+ + ♻ ☆ Autoencoders for discovering manifold dimension and coordinates in data + from complex dynamical systems + + +
+ While many phenomena in physics and engineering are formally +high-dimensional, their long-time dynamics often live on a lower-dimensional +manifold. The present work introduces an autoencoder framework that combines +implicit regularization with internal linear layers and $L_2$ regularization +(weight decay) to automatically estimate the underlying dimensionality of a +data set, produce an orthogonal manifold coordinate system, and provide the +mapping functions between the ambient space and manifold space, allowing for +out-of-sample projections. We validate our framework's ability to estimate the +manifold dimension for a series of datasets from dynamical systems of varying +complexities and compare to other state-of-the-art estimators. We analyze the +training dynamics of the network to glean insight into the mechanism of +low-rank learning and find that collectively each of the implicit regularizing +layers compound the low-rank representation and even self-correct during +training. Analysis of gradient descent dynamics for this architecture in the +linear case reveals the role of the internal linear layers in leading to faster +decay of a "collective weight variable" incorporating all layers, and the role +of weight decay in breaking degeneracies and thus driving convergence along +directions in which no decay would occur in its absence. We show that this +framework can be naturally extended for applications of state-space modeling +and forecasting by generating a data-driven dynamic model of a spatiotemporally +chaotic partial differential equation using only the manifold coordinates. +Finally, we demonstrate that our framework is robust to hyperparameter choices. + +
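A compact PyTorch sketch of the architectural recipe described above (layer sizes and names are assumptions): an autoencoder whose bottleneck is followed by a stack of purely linear layers, trained with weight decay; the manifold dimension is then read off from the singular-value spectrum of the latent codes.

```python
import torch
import torch.nn as nn

class IRAutoencoder(nn.Module):
    """Autoencoder whose bottleneck feeds a stack of pure linear layers;
    together with weight decay these implicitly bias the latent code
    towards low rank.  Sketch under stated assumptions."""
    def __init__(self, n_in, n_latent=32, n_linear=4):
        super().__init__()
        self.encoder = nn.Sequential(nn.Linear(n_in, 128), nn.GELU(),
                                     nn.Linear(128, n_latent))
        # internal linear layers: no nonlinearity, same width
        self.linear_stack = nn.Sequential(
            *[nn.Linear(n_latent, n_latent) for _ in range(n_linear)])
        self.decoder = nn.Sequential(nn.Linear(n_latent, 128), nn.GELU(),
                                     nn.Linear(128, n_in))

    def forward(self, x):
        z = self.linear_stack(self.encoder(x))
        return self.decoder(z), z

@torch.no_grad()
def estimate_dimension(z, threshold=1e-2):
    """Count singular values of collected latent codes above a relative threshold."""
    s = torch.linalg.svdvals(z - z.mean(0))
    return int((s / s[0] > threshold).sum())

model = IRAutoencoder(n_in=64)
opt = torch.optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)  # L2 / weight decay
```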
+
+
+
+
+ + ♻ ☆ FroSSL: Frobenius Norm Minimization for Self-Supervised Learning + + +
+ Self-supervised learning (SSL) is an increasingly popular paradigm for +representation learning. Recent methods can be classified as +sample-contrastive, dimension-contrastive, or asymmetric network-based, with +each family having its own approach to avoiding informational collapse. While +dimension-contrastive methods converge to similar solutions as +sample-contrastive methods, it can be empirically shown that some methods +require more epochs of training to converge. Motivated by closing this divide, +we present the objective function FroSSL which is both sample- and +dimension-contrastive up to embedding normalization. FroSSL works by minimizing +covariance Frobenius norms for avoiding collapse and minimizing mean-squared +error for augmentation invariance. We show that FroSSL converges more quickly +than a variety of other SSL methods and provide theoretical and empirical +support that this faster convergence is due to how FroSSL affects the +eigenvalues of the embedding covariance matrices. We also show that FroSSL +learns competitive representations on linear probe evaluation when used to +train a ResNet18 on the CIFAR-10, CIFAR-100, STL-10, and ImageNet datasets. + +
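The stated objective admits a compact sketch (PyTorch; the row-wise normalization, the log on the Frobenius term, and the weighting are assumptions about details not spelled out in the abstract): a mean-squared invariance term between the two augmented views plus the Frobenius norms of each view's embedding covariance, all minimized together.

```python
import torch

def frossl_style_loss(z1, z2, lam=1.0):
    """z1, z2: (batch, dim) embeddings of two augmentations of the same batch."""
    # normalize embeddings so the covariance terms are scale-invariant
    z1 = torch.nn.functional.normalize(z1, dim=1)
    z2 = torch.nn.functional.normalize(z2, dim=1)

    invariance = ((z1 - z2) ** 2).sum(dim=1).mean()   # augmentation invariance

    def cov_frobenius(z):
        z = z - z.mean(dim=0)
        cov = z.T @ z / (z.shape[0] - 1)
        return torch.linalg.matrix_norm(cov, ord='fro')

    # minimizing the covariance Frobenius norms spreads the spectrum,
    # which is the stated anti-collapse mechanism
    anti_collapse = torch.log(cov_frobenius(z1)) + torch.log(cov_frobenius(z2))
    return invariance + lam * anti_collapse

loss = frossl_style_loss(torch.randn(256, 128), torch.randn(256, 128))
```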
+
+
+
+
+ + ♻ ☆ On the Identifiability of Quantized Factors + + +
+ Disentanglement aims to recover meaningful latent ground-truth factors from +the observed distribution solely, and is formalized through the theory of +identifiability. The identifiability of independent latent factors is proven to +be impossible in the unsupervised i.i.d. setting under a general nonlinear map +from factors to observations. In this work, however, we demonstrate that it is +possible to recover quantized latent factors under a generic nonlinear +diffeomorphism. We only assume that the latent factors have independent +discontinuities in their density, without requiring the factors to be +statistically independent. We introduce this novel form of identifiability, +termed quantized factor identifiability, and provide a comprehensive proof of +the recovery of the quantized factors. + +
+
+
+
+
+ + ♻ ☆ A Practical Approach to Novel Class Discovery in Tabular Data + + +
+ The problem of Novel Class Discovery (NCD) consists in extracting knowledge +from a labeled set of known classes to accurately partition an unlabeled set of +novel classes. While NCD has recently received a lot of attention from the +community, it is often solved on computer vision problems and under unrealistic +conditions. In particular, the number of novel classes is usually assumed to be +known in advance, and their labels are sometimes used to tune hyperparameters. +Methods that rely on these assumptions are not applicable in real-world +scenarios. In this work, we focus on solving NCD in tabular data when no prior +knowledge of the novel classes is available. To this end, we propose to tune +the hyperparameters of NCD methods by adapting the $k$-fold cross-validation +process and hiding some of the known classes in each fold. Since we have found +that methods with too many hyperparameters are likely to overfit these hidden +classes, we define a simple deep NCD model. This method is composed of only the +essential elements necessary for the NCD problem and performs impressively well +under realistic conditions. Furthermore, we find that the latent space of this +method can be used to reliably estimate the number of novel classes. +Additionally, we adapt two unsupervised clustering algorithms ($k$-means and +Spectral Clustering) to leverage the knowledge of the known classes. Extensive +experiments are conducted on 7 tabular datasets and demonstrate the +effectiveness of the proposed method and hyperparameter tuning process, and +show that the NCD problem can be solved without relying on knowledge from the +novel classes. + +
+
+ comment: 25 pages, including 3 pages of annexes +
+
+
+
+
+ + ♻ ☆ A Unified Theory of Diversity in Ensemble Learning + + +
+ We present a theory of ensemble diversity, explaining the nature of diversity +for a wide range of supervised learning scenarios. This challenge, of +understanding ensemble diversity, has been referred to as the "holy grail" of +ensemble learning, an open research issue for over 30 years. Our framework +reveals that diversity is in fact a hidden dimension in the bias-variance +decomposition of the ensemble loss. We prove a family of exact +bias-variance-diversity decompositions, for both regression and classification, +e.g., squared, cross-entropy, and Poisson losses. For losses where an additive +bias-variance decomposition is not available (e.g., 0/1 loss) we present an +alternative approach, which precisely quantifies the effects of diversity, +turning out to be dependent on the label distribution. Experiments show how we +can use our framework to understand the diversity-encouraging mechanisms of +popular methods: Bagging, Boosting, and Random Forests. + +
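For squared loss the flavour of such a decomposition can be checked numerically in a few lines: the average member loss splits exactly into the loss of the ensemble (mean) prediction plus the average spread of the members around it, the latter playing the role of diversity. This is the classical ambiguity decomposition, shown here as a small worked example rather than the paper's general result:

```python
import numpy as np

rng = np.random.default_rng(0)
y = rng.normal(size=1000)                             # targets
preds = y + rng.normal(scale=0.5, size=(7, 1000))     # 7 ensemble members

ensemble = preds.mean(axis=0)
avg_member_loss = ((preds - y) ** 2).mean()
ensemble_loss = ((ensemble - y) ** 2).mean()
diversity = ((preds - ensemble) ** 2).mean()

# ambiguity decomposition: average member loss = ensemble loss + diversity
print(np.isclose(avg_member_loss, ensemble_loss + diversity))   # True
```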
+
+
+
+
+ + ♻ ☆ Balance is Essence: Accelerating Sparse Training via Adaptive Gradient + Correction + + +
+ Despite impressive performance, deep neural networks incur significant
+memory and computation costs, prohibiting their application in
+resource-constrained scenarios. Sparse training is one of the most common
+techniques to reduce these costs; however, the sparsity constraints make the
+optimization more difficult, resulting in longer training time and instability.
+In this work, we aim to overcome this problem and achieve space-time
+co-efficiency. To accelerate and stabilize the convergence of sparse training,
+we analyze the gradient changes and develop an adaptive gradient correction
+method. Specifically, we approximate the correlation between the current and
+previous gradients, which is used to balance the two gradients to obtain a
+corrected gradient. Our method can be used with the most popular sparse
+training pipelines under both standard and adversarial setups. Theoretically,
+we prove that our method can accelerate the convergence rate of sparse
+training. Extensive experiments on multiple datasets, model architectures, and
+sparsities demonstrate that our method outperforms leading sparse training
+methods by up to \textbf{5.0\%} in accuracy given the same number of training
+epochs, and reduces the number of training epochs by up to \textbf{52.1\%} to
+achieve the same accuracy. Our code is available at:
+\url{https://github.com/StevenBoys/AGENT}.
+
+
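A hedged sketch of the correction idea: estimate how correlated the current gradient is with the previous one and use that estimate to blend the two, damping the gradient noise that the sparsity constraints amplify. The cosine-similarity estimate and the convex blending rule are illustrative assumptions, not the paper's exact formulas:

```python
import torch

class AdaptiveGradientCorrection:
    """Blend current and previous gradients per parameter tensor, weighting
    the previous gradient by their estimated correlation.  Sketch only."""
    def __init__(self, model):
        self.prev = {n: torch.zeros_like(p) for n, p in model.named_parameters()}

    @torch.no_grad()
    def correct(self, model):
        for n, p in model.named_parameters():
            if p.grad is None:
                continue
            g, g_prev = p.grad, self.prev[n]
            denom = g.norm() * g_prev.norm()
            corr = (g * g_prev).sum() / denom if denom > 0 else torch.tensor(0.0)
            corr = corr.clamp(min=0.0)                 # only reuse agreeing history
            p.grad = (1 - corr) * g + corr * g_prev    # corrected gradient
            self.prev[n] = p.grad.clone()

# usage: call agc.correct(model) between loss.backward() and optimizer.step()
```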
+
+
+
+
+ + ♻ ☆ AlignBench: Benchmarking Chinese Alignment of Large Language Models + + +
+ Alignment has become a critical step for instruction-tuned Large Language +Models (LLMs) to become helpful assistants. However, effective evaluation of +alignment for emerging Chinese LLMs is still significantly lacking, calling for +real-scenario grounded, open-ended, challenging and automatic evaluations +tailored for alignment. To fill in this gap, we introduce AlignBench, a +comprehensive multi-dimensional benchmark for evaluating LLMs' alignment in +Chinese. Equipped with a human-in-the-loop data curation pipeline, our +benchmark employs a rule-calibrated multi-dimensional LLM-as-Judge with +Chain-of-Thought to generate explanations and final ratings as evaluations, +ensuring high reliability and interpretability. Furthermore, we report +AlignBench evaluated by CritiqueLLM, a dedicated Chinese evaluator LLM that +recovers 95% of GPT-4's evaluation ability. We will provide public APIs for +evaluating AlignBench with CritiqueLLM to facilitate the evaluation of LLMs' +Chinese alignment. All evaluation codes, data, and LLM generations are +available at \url{https://github.com/THUDM/AlignBench}. + +
+
+
+
+
+ + ♻ ☆ Digital twinning of cardiac electrophysiology models from the surface + ECG: a geodesic backpropagation approach + + +
+ The eikonal equation has become an indispensable tool for modeling cardiac +electrical activation accurately and efficiently. In principle, by matching +clinically recorded and eikonal-based electrocardiograms (ECGs), it is possible +to build patient-specific models of cardiac electrophysiology in a purely +non-invasive manner. Nonetheless, the fitting procedure remains a challenging +task. The present study introduces a novel method, Geodesic-BP, to solve the +inverse eikonal problem. Geodesic-BP is well-suited for GPU-accelerated machine +learning frameworks, allowing us to optimize the parameters of the eikonal +equation to reproduce a given ECG. We show that Geodesic-BP can reconstruct a +simulated cardiac activation with high accuracy in a synthetic test case, even +in the presence of modeling inaccuracies. Furthermore, we apply our algorithm +to a publicly available dataset of a biventricular rabbit model, with promising +results. Given the future shift towards personalized medicine, Geodesic-BP has +the potential to help in future functionalizations of cardiac models meeting +clinical time constraints while maintaining the physiological accuracy of +state-of-the-art cardiac models. + +
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Real-Time Online Stock Forecasting Utilizing Integrated Quantitative and + Qualitative Analysis + + +
+ The application of machine learning to finance has become a familiar
+approach, especially in stock market forecasting. The stock market is highly
+volatile, and huge amounts of data are generated every minute globally. The
+extraction of effective intelligence from this data is of critical importance.
+However, combining numerical stock data with qualitative text data can be a
+challenging task. In this work, we accomplish this and provide an
+unprecedented, publicly available dataset with technical and fundamental data
+and sentiment that we gathered from news archives, TV news captions, radio
+transcripts, tweets, daily financial newspapers, etc. The text data entries
+used for sentiment extraction total more than 1.4 million. The dataset consists
+of daily entries from January 2018 to December 2022 for 8 companies
+representing diverse industrial sectors and the Dow Jones Industrial Average
+(DJIA) as a whole. Holistic fundamental and technical data are provided,
+training-ready, for model learning and deployment. The generated data can be
+used for incremental online learning with real-time data points retrieved
+daily, since no stagnant data were utilized; all the data were retrieved from
+APIs or self-designed scripts. Moreover, the use of Spearman's rank correlation
+over real-time data, linking stock returns with sentiment analysis, has
+produced noteworthy results for the DJIA, achieving accuracy levels surpassing
+60\%. The dataset is made available at
+https://github.com/batking24/Huge-Stock-Dataset
+
+
+
+
+
+
+ + ♻ ☆ NeuroMixGDP: A Neural Collapse-Inspired Random Mixup for Private Data + Release + + +
+ Privacy-preserving data release algorithms have gained increasing attention +for their ability to protect user privacy while enabling downstream machine +learning tasks. However, the utility of current popular algorithms is not +always satisfactory. Mixup of raw data provides a new way of data augmentation, +which can help improve utility. However, its performance drastically +deteriorates when differential privacy (DP) noise is added. To address this +issue, this paper draws inspiration from the recently observed Neural Collapse +(NC) phenomenon, which states that the last layer features of a neural network +concentrate on the vertices of a simplex as Equiangular Tight Frame (ETF). We +propose a scheme to mixup the Neural Collapse features to exploit the ETF +simplex structure and release noisy mixed features to enhance the utility of +the released data. By using Gaussian Differential Privacy (GDP), we obtain an +asymptotic rate for the optimal mixup degree. To further enhance the utility +and address the label collapse issue when the mixup degree is large, we propose +a Hierarchical sampling method to stratify the mixup samples on a small number +of classes. This method remarkably improves utility when the number of classes +is large. Extensive experiments demonstrate the effectiveness of our proposed +method in protecting against attacks and improving utility. In particular, our +approach shows significantly improved utility compared to directly training +classification networks with DPSGD on CIFAR100 and MiniImagenet datasets, +highlighting the benefits of using privacy-preserving data release. We release +reproducible code in https://github.com/Lidonghao1996/NeuroMixGDP. + +
+
+ comment: 28 pages, 9 figures +
+
+
+
+
+ + ♻ ☆ Robust Reinforcement Learning in Continuous Control Tasks with + Uncertainty Set Regularization + + +
+ Reinforcement learning (RL) is recognized as lacking generalization and +robustness under environmental perturbations, which excessively restricts its +application for real-world robotics. Prior work claimed that adding +regularization to the value function is equivalent to learning a robust policy +with uncertain transitions. Although the regularization-robustness +transformation is appealing for its simplicity and efficiency, it is still +lacking in continuous control tasks. In this paper, we propose a new +regularizer named $\textbf{U}$ncertainty $\textbf{S}$et $\textbf{R}$egularizer +(USR), by formulating the uncertainty set on the parameter space of the +transition function. In particular, USR is flexible enough to be plugged into +any existing RL framework. To deal with unknown uncertainty sets, we further +propose a novel adversarial approach to generate them based on the value +function. We evaluate USR on the Real-world Reinforcement Learning (RWRL) +benchmark, demonstrating improvements in the robust performance for perturbed +testing environments. + +
+
+ comment: Accepted at CoRL 2023 +
+
+
+
+
+ + ♻ ☆ Topological Graph Signal Compression + + +
+ Recently emerged Topological Deep Learning (TDL) methods aim to extend
+current Graph Neural Networks (GNN) by naturally processing higher-order
+interactions, going beyond the pairwise relations and local neighborhoods
+defined by graph representations. In this paper we propose a novel TDL-based
+method for compressing signals over graphs, consisting of two main steps:
+first, disjoint sets of higher-order structures are inferred based on the
+original signal --by clustering $N$ datapoints into $K\ll N$ collections; then,
+a topology-inspired message-passing scheme obtains a compressed representation
+of the signal within those multi-element sets. Our results show that our
+framework improves both standard GNN and feed-forward architectures in
+compressing temporal link-based signals from two real-world Internet Service
+Provider networks' datasets --from $30\%$ up to $90\%$ better reconstruction
+errors across all evaluation scenarios--, suggesting that it better captures
+and exploits spatial and temporal correlations over the whole graph-based
+network structure.
+
+
+
+ comment: Accepted as Oral at the Second Learning on Graphs Conference (LoG + 2023). The recording of the talk can be found in + https://www.youtube.com/watch?v=OcruIkiRkiU +
+
+
+
+
+ + ♻ ☆ On minimizing the training set fill distance in machine learning + regression + + +
+ For regression tasks one often leverages large datasets for training +predictive machine learning models. However, using large datasets may not be +feasible due to computational limitations or high data labelling costs. +Therefore, suitably selecting small training sets from large pools of +unlabelled data points is essential to maximize model performance while +maintaining efficiency. In this work, we study Farthest Point Sampling (FPS), a +data selection approach that aims to minimize the fill distance of the selected +set. We derive an upper bound for the maximum expected prediction error, +conditional to the location of the unlabelled data points, that linearly +depends on the training set fill distance. For empirical validation, we perform +experiments using two regression models on three datasets. We empirically show +that selecting a training set by aiming to minimize the fill distance, thereby +minimizing our derived bound, significantly reduces the maximum prediction +error of various regression models, outperforming alternative sampling +approaches by a large margin. Furthermore, we show that selecting training sets +with the FPS can also increase model stability for the specific case of +Gaussian kernel regression approaches. + +
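Farthest Point Sampling itself fits in a few lines (a generic sketch, not the authors' implementation): greedily add the point farthest from the already selected set, which directly drives down the fill distance of the training set.

```python
import numpy as np

def farthest_point_sampling(X, m, seed=0):
    """Select m training points from X (n, d) that greedily minimize the
    fill distance of the selected set."""
    rng = np.random.default_rng(seed)
    selected = [rng.integers(len(X))]
    dist = np.linalg.norm(X - X[selected[0]], axis=1)   # distance to selected set
    for _ in range(m - 1):
        nxt = int(np.argmax(dist))                      # farthest remaining point
        selected.append(nxt)
        dist = np.minimum(dist, np.linalg.norm(X - X[nxt], axis=1))
    return np.array(selected)

idx = farthest_point_sampling(np.random.rand(5000, 8), m=200)
```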
+
+
+
+
+ + ♻ ☆ CROP: Towards Distributional-Shift Robust Reinforcement Learning using + Compact Reshaped Observation Processing IJCAI 2023 + + +
+ The safe application of reinforcement learning (RL) requires generalization +from limited training data to unseen scenarios. Yet, fulfilling tasks under +changing circumstances is a key challenge in RL. Current state-of-the-art +approaches for generalization apply data augmentation techniques to increase +the diversity of training data. Even though this prevents overfitting to the +training environment(s), it hinders policy optimization. Crafting a suitable +observation, only containing crucial information, has been shown to be a +challenging task itself. To improve data efficiency and generalization +capabilities, we propose Compact Reshaped Observation Processing (CROP) to +reduce the state information used for policy optimization. By providing only +relevant information, overfitting to a specific training layout is precluded +and generalization to unseen environments is improved. We formulate three CROPs +that can be applied to fully observable observation- and action-spaces and +provide methodical foundation. We empirically show the improvements of CROP in +a distributionally shifted safety gridworld. We furthermore provide benchmark +comparisons to full observability and data-augmentation in two different-sized +procedurally generated mazes. + +
+
+ comment: 9 pages, 5 figures, published at IJCAI 2023 +
+
+
+
+
+ + ♻ ☆ Physics-informed neural networks with unknown measurement noise + + +
+ Physics-informed neural networks (PINNs) constitute a flexible approach to +both finding solutions and identifying parameters of partial differential +equations. Most works on the topic assume noiseless data, or data contaminated +with weak Gaussian noise. We show that the standard PINN framework breaks down +in case of non-Gaussian noise. We give a way of resolving this fundamental +issue and we propose to jointly train an energy-based model (EBM) to learn the +correct noise distribution. We illustrate the improved performance of our +approach using multiple examples. + +
+
+
+
+
+ + ♻ ☆ Traffic Signal Control with Communicative Deep Reinforcement Learning + Agents: a Case Study + + +
+ In this work we analyze Multi-Agent Advantage Actor-Critic (MA2C), a recently
+proposed multi-agent reinforcement learning algorithm that can be applied to
+adaptive traffic signal control (ATSC) problems. To evaluate its potential, we
+compare MA2C with Independent Advantage Actor-Critic (IA2C) and other
+reinforcement learning or heuristic-based algorithms. Specifically, we analyze
+MA2C theoretically with the framework provided by non-Markov decision
+processes, which allows deeper insight into the algorithm, and we critically
+examine the effectiveness and the robustness of the method by testing it in two
+traffic areas located in Bologna (Italy) simulated in SUMO, a software modeling
+tool for ATSC problems. Our results indicate that MA2C, trained with
+pseudo-random vehicle flows, is a promising technique able to outperform the
+alternative methods.
+
+
+
+ comment: Obsolete version +
+
+
+
+
+ + ♻ ☆ A Framework for Neurosymbolic Robot Action Planning using Large Language + Models + + +
+ Symbolic task planning is a widely used approach to enforce robot autonomy
+due to its ease of understanding and deployment. However, symbolic task
+planning is difficult to scale to real-world settings when frequent re-planning
+is needed, for example, due to human-robot interactions or unforeseen events.
+Plan length and planning time can hinder the robot's efficiency and negatively
+affect the overall human-robot interaction's fluency. We present a framework,
+Teriyaki, designed to bridge the gap between symbolic task planning and machine
+learning approaches, by training Large Language Models (LLMs), namely GPT-3,
+into neurosymbolic task planners compatible with the Planning Domain Definition
+Language (PDDL). Potential benefits include: (i) better scalability as the
+planning domain complexity increases, since LLMs' response time scales linearly
+with the combined length of the input and the output, instead of super-linearly
+as in the case of symbolic task planners, and (ii) the ability to synthesize a
+plan action-by-action instead of end-to-end, and to make each action available
+for execution as soon as it is generated, which in turn enables concurrent
+planning and execution. In the past year, significant efforts have been devoted
+by the research community to evaluating the overall cognitive abilities of
+LLMs, with mixed success. Instead, with Teriyaki we aim to provide overall
+planning performance comparable to traditional planners in specific planning
+domains, while leveraging LLMs' capabilities on other metrics that are used to
+build a look-ahead predictive planning model. Preliminary results in selected
+domains show that our method can: (i) solve 95.5% of problems in a test data
+set of 1000 samples; (ii) produce plans up to 13.5% shorter than a traditional
+symbolic planner; (iii) reduce average overall waiting times for plan
+availability by up to 61.4%.
+
+
+
+ comment: Submitted to journal. 28 pages, 6 figures, 2 tables. Updated + according to reviewers' comments. Previous title: A Framework to Generate + Neurosymbolic PDDL-compliant Planners +
+
+
+
+
+ + ♻ ☆ Pushing the Limits of Pre-training for Time Series Forecasting in the + CloudOps Domain + + +
+ Time series has been left behind in the era of pre-training and transfer +learning. While research in the fields of natural language processing and +computer vision are enjoying progressively larger datasets to train massive +models, the most popular time series datasets consist of only tens of thousands +of time steps, limiting our ability to study the effectiveness of pre-training +and scaling. Recent studies have also cast doubt on the need for expressive +models and scale. To alleviate these issues, we introduce three large-scale +time series forecasting datasets from the cloud operations (CloudOps) domain, +the largest having billions of observations, enabling further study into +pre-training and scaling of time series models. We build the empirical +groundwork for studying pre-training and scaling of time series models and pave +the way for future research by identifying a promising candidate architecture. +We show that it is a strong zero-shot baseline and benefits from further +scaling, both in model and dataset size. Accompanying these datasets and +results is a suite of comprehensive benchmark results comparing classical and +deep learning baselines to our pre-trained method - achieving a 27% reduction +in error on the largest dataset. Code and datasets can be found +https://github.com/SalesforceAIResearch/pretrain-time-series-cloudops. + +
+
+
+
+
+ + ♻ ☆ Revisiting Hidden Representations in Transfer Learning for Medical + Imaging + + +
+ While a key component to the success of deep learning is the availability of +massive amounts of training data, medical image datasets are often limited in +diversity and size. Transfer learning has the potential to bridge the gap +between related yet different domains. For medical applications, however, it +remains unclear whether it is more beneficial to pre-train on natural or +medical images. We aim to shed light on this problem by comparing +initialization on ImageNet and RadImageNet on seven medical classification +tasks. Our work includes a replication study, which yields results contrary to +previously published findings. In our experiments, ResNet50 models pre-trained +on ImageNet tend to outperform those trained on RadImageNet. To gain further +insights, we investigate the learned representations using Canonical +Correlation Analysis (CCA) and compare the predictions of the different models. +Our results indicate that, contrary to intuition, ImageNet and RadImageNet may +converge to distinct intermediate representations, which appear to diverge +further during fine-tuning. Despite these distinct representations, the +predictions of the models remain similar. Our findings show that the similarity +between networks before and after fine-tuning does not correlate with +performance gains, suggesting that the advantages of transfer learning might +not solely originate from the reuse of features in the early layers of a +convolutional neural network. + +
+
+ comment: Published in TMLR +
+
+
+
+
+ + ♻ ☆ TriDeNT: Triple Deep Network Training for Privileged Knowledge + Distillation in Histopathology + + +
+ Computational pathology models rarely utilise data that will not be available +for inference. This means most models cannot learn from highly informative data +such as additional immunohistochemical (IHC) stains and spatial +transcriptomics. We present TriDeNT, a novel self-supervised method for +utilising privileged data that is not available during inference to improve +performance. We demonstrate the efficacy of this method for a range of +different paired data including immunohistochemistry, spatial transcriptomics +and expert nuclei annotations. In all settings, TriDeNT outperforms other +state-of-the-art methods in downstream tasks, with observed improvements of up +to 101%. Furthermore, we provide qualitative and quantitative measurements of +the features learned by these models and how they differ from baselines. +TriDeNT offers a novel method to distil knowledge from scarce or costly data +during training, to create significantly better models for routine inputs. + +
+
+
+
+
+ + ♻ ☆ Solving Inverse Physics Problems with Score Matching NeurIPS 2023 + + +
+ We propose to solve inverse problems involving the temporal evolution of +physics systems by leveraging recent advances from diffusion models. Our method +moves the system's current state backward in time step by step by combining an +approximate inverse physics simulator and a learned correction function. A +central insight of our work is that training the learned correction with a +single-step loss is equivalent to a score matching objective, while recursively +predicting longer parts of the trajectory during training relates to maximum +likelihood training of a corresponding probability flow. We highlight the +advantages of our algorithm compared to standard denoising score matching and +implicit score matching, as well as fully learned baselines for a wide range of +inverse physics problems. The resulting inverse solver has excellent accuracy +and temporal stability and, in contrast to other learned inverse solvers, +allows for sampling the posterior of the solutions. + +
+
+ comment: Accepted at NeurIPS 2023; code available at + https://github.com/tum-pbs/SMDP +
+
+
+
+
+ + ♻ ☆ Unraveling the Enigma of Double Descent: An In-depth Analysis through + the Lens of Learned Feature Space + + +
+ Double descent presents a counter-intuitive aspect within the machine
+learning domain, and researchers have observed its manifestation in various
+models and tasks. While some theoretical explanations have been proposed for
+this phenomenon in specific contexts, an accepted theory accounting for its
+occurrence in deep learning has yet to be established. In this study, we
+revisit the phenomenon of double descent and demonstrate that its occurrence is
+strongly influenced by the presence of noisy data. Through a comprehensive
+analysis of the feature space of learned representations, we unveil that double
+descent arises in imperfect models trained with noisy data. We argue that
+double descent is a consequence of the model first fitting the noisy data up to
+interpolation and then, through the implicit regularization added by
+over-parameterization, acquiring the capability to separate the information
+from the noise.
+
+
+
+
+
+
+ + ♻ ☆ Policy Gradient with Kernel Quadrature + + +
+ Reward evaluation of episodes becomes a bottleneck in a broad range of +reinforcement learning tasks. Our aim in this paper is to select a small but +representative subset of a large batch of episodes, only on which we actually +compute rewards for more efficient policy gradient iterations. We build a +Gaussian process modeling of discounted returns or rewards to derive a positive +definite kernel on the space of episodes, run an ``episodic" kernel quadrature +method to compress the information of sample episodes, and pass the reduced +episodes to the policy network for gradient updates. We present the theoretical +background of this procedure as well as its numerical illustrations in MuJoCo +tasks. + +
+
+ comment: 18 pages, 2 figures +
+
+
+
+
+ + ♻ ☆ Quantum Machine Learning on Near-Term Quantum Devices: Current State of + Supervised and Unsupervised Techniques for Real-World Applications + + +
+ The past decade has witnessed significant advancements in quantum hardware, +encompassing improvements in speed, qubit quantity, and quantum volume-a metric +defining the maximum size of a quantum circuit effectively implementable on +near-term quantum devices. This progress has led to a surge in Quantum Machine +Learning (QML) applications on real hardware, aiming to achieve quantum +advantage over classical approaches. This survey focuses on selected supervised +and unsupervised learning applications executed on quantum hardware, +specifically tailored for real-world scenarios. The exploration includes a +thorough analysis of current QML implementation limitations on quantum +hardware, covering techniques like encoding, ansatz structure, error +mitigation, and gradient methods to address these challenges. Furthermore, the +survey evaluates the performance of QML implementations in comparison to +classical counterparts. In conclusion, we discuss existing bottlenecks related +to applying QML on real quantum devices and propose potential solutions to +overcome these challenges in the future. + +
+
+ comment: 40 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Nash Learning from Human Feedback + + +
+ Reinforcement learning from human feedback (RLHF) has emerged as the main +paradigm for aligning large language models (LLMs) with human preferences. +Typically, RLHF involves the initial step of learning a reward model from human +feedback, often expressed as preferences between pairs of text generations +produced by a pre-trained LLM. Subsequently, the LLM's policy is fine-tuned by +optimizing it to maximize the reward model through a reinforcement learning +algorithm. However, an inherent limitation of current reward models is their +inability to fully represent the richness of human preferences and their +dependency on the sampling distribution. + In this study, we introduce an alternative pipeline for the fine-tuning of +LLMs using pairwise human feedback. Our approach entails the initial learning +of a preference model, which is conditioned on two inputs given a prompt, +followed by the pursuit of a policy that consistently generates responses +preferred over those generated by any competing policy, thus defining the Nash +equilibrium of this preference model. We term this approach Nash learning from +human feedback (NLHF). + In the context of a tabular policy representation, we present a novel +algorithmic solution, Nash-MD, founded on the principles of mirror descent. +This algorithm produces a sequence of policies, with the last iteration +converging to the regularized Nash equilibrium. Additionally, we explore +parametric representations of policies and introduce gradient descent +algorithms for deep-learning architectures. To demonstrate the effectiveness of +our approach, we present experimental results involving the fine-tuning of a +LLM for a text summarization task. We believe NLHF offers a compelling avenue +for preference learning and policy optimization with the potential of advancing +the field of aligning LLMs with human preferences. + +
+
+
+
+
+ + ♻ ☆ Value Functions are Control Barrier Functions: Verification of Safe + Policies using Control Theory + + +
+ Guaranteeing safe behaviour of reinforcement learning (RL) policies poses +significant challenges for safety-critical applications, despite RL's +generality and scalability. To address this, we propose a new approach to apply +verification methods from control theory to learned value functions. By +analyzing task structures for safety preservation, we formalize original +theorems that establish links between value functions and control barrier +functions. Further, we propose novel metrics for verifying value functions in +safe control tasks and practical implementation details to improve learning. +Our work presents a novel method for certificate learning, which unlocks a +diversity of verification techniques from control theory for RL policies, and +marks a significant step towards a formal framework for the general, scalable, +and verifiable design of RL-based control systems. Code and videos are +available at this https url: https://rl-cbf.github.io/ + +
+
+
+
+
+ + ♻ ☆ Controllable Music Production with Diffusion Models and Guidance + Gradients + + +
+ We demonstrate how conditional generation from diffusion models can be used +to tackle a variety of realistic tasks in the production of music in 44.1kHz +stereo audio with sampling-time guidance. The scenarios we consider include +continuation, inpainting and regeneration of musical audio, the creation of +smooth transitions between two different music tracks, and the transfer of +desired stylistic characteristics to existing audio clips. We achieve this by +applying guidance at sampling time in a simple framework that supports both +reconstruction and classification losses, or any combination of the two. This +approach ensures that generated audio can match its surrounding context, or +conform to a class distribution or latent representation specified relative to +any suitable pre-trained classifier or embedding model. Audio samples are +available at https://machinelearning.apple.com/research/controllable-music + +
+
+
+
+
+ + ♻ ☆ General-Purpose Retrieval-Enhanced Medical Prediction Model Using + Near-Infinite History + + +
+ Developing clinical prediction models (e.g., mortality prediction) based on +electronic health records (EHRs) typically relies on expert opinion for feature +selection and adjusting observation window size. This burdens experts and +creates a bottleneck in the development process. We propose Retrieval-Enhanced +Medical prediction model (REMed) to address such challenges. REMed can +essentially evaluate an unlimited number of clinical events, select the +relevant ones, and make predictions. This approach effectively eliminates the +need for manual feature selection and enables an unrestricted observation +window. We verified these properties through experiments on 27 clinical tasks +and two independent cohorts from publicly available EHR datasets, where REMed +outperformed other contemporary architectures that aim to handle as many events +as possible. Notably, we found that the preferences of REMed align closely with +those of medical experts. We expect our approach to significantly expedite the +development of EHR prediction models by minimizing clinicians' need for manual +involvement. + +
+
+ comment: The source codes corresponding to this paper are available at: + https://github.com/starmpcc/REMed +
+
+
+
+
+ + ♻ ☆ The Bayesian Stability Zoo + + +
+ We show that many definitions of stability found in the learning theory +literature are equivalent to one another. We distinguish between two families +of definitions of stability: distribution-dependent and +distribution-independent Bayesian stability. Within each family, we establish +equivalences between various definitions, encompassing approximate differential +privacy, pure differential privacy, replicability, global stability, perfect +generalization, TV stability, mutual information stability, KL-divergence +stability, and R\'enyi-divergence stability. Along the way, we prove boosting +results that enable the amplification of the stability of a learning rule. This +work is a step towards a more systematic taxonomy of stability notions in +learning theory, which can promote clarity and an improved understanding of an +array of stability concepts that have emerged in recent years. + +
+
+ comment: v2, minor typo fix +
+
+
+
+
+ + ♻ ☆ Grounding Foundation Models through Federated Transfer Learning: A + General Framework + + +
+ Foundation Models (FMs) such as GPT-4 encoded with vast knowledge and +powerful emergent abilities have achieved remarkable success in various natural +language processing and computer vision tasks. Grounding FMs by adapting them +to domain-specific tasks or augmenting them with domain-specific knowledge +enables us to exploit the full potential of FMs. However, grounding FMs faces +several challenges, stemming primarily from constrained computing resources, +data privacy, model heterogeneity, and model ownership. Federated Transfer +Learning (FTL), the combination of federated learning and transfer learning, +provides promising solutions to address these challenges. In recent years, the +need for grounding FMs leveraging FTL, coined FTL-FM, has arisen strongly in +both academia and industry. Motivated by the strong growth in FTL-FM research +and the potential impact of FTL-FM on industrial applications, we propose an +FTL-FM framework that formulates problems of grounding FMs in the federated +learning setting, construct a detailed taxonomy based on the FTL-FM framework +to categorize state-of-the-art FTL-FM works, and comprehensively overview +FTL-FM works based on the proposed taxonomy. We also establish correspondences +between FTL-FM and conventional phases of adapting FM so that FM practitioners +can align their research works with FTL-FM. In addition, we overview advanced +efficiency-improving and privacy-preserving techniques because efficiency and +privacy are critical concerns in FTL-FM. Last, we discuss opportunities and +future research directions of FTL-FM. + +
+
+ comment: In progress. fixed some typos, errors, and revised the text a little + bit +
+
+
+
+
+ + ♻ ☆ Primal-Attention: Self-attention through Asymmetric Kernel SVD in Primal + Representation NeurIPS 2023 + + +
+ Recently, a new line of works has emerged to understand and improve +self-attention in Transformers by treating it as a kernel machine. However, +existing works apply the methods for symmetric kernels to the asymmetric +self-attention, resulting in a nontrivial gap between the analytical +understanding and numerical implementation. In this paper, we provide a new +perspective to represent and optimize self-attention through asymmetric Kernel +Singular Value Decomposition (KSVD), which is also motivated by the low-rank +property of self-attention normally observed in deep layers. Through asymmetric +KSVD, $i$) a primal-dual representation of self-attention is formulated, where +the optimization objective is cast to maximize the projection variances in the +attention outputs; $ii$) a novel attention mechanism, i.e., Primal-Attention, +is proposed via the primal representation of KSVD, avoiding explicit +computation of the kernel matrix in the dual; $iii$) with KKT conditions, we +prove that the stationary solution to the KSVD optimization in Primal-Attention +yields a zero-value objective. In this manner, KSVD optimization can be +implemented by simply minimizing a regularization loss, so that low-rank +property is promoted without extra decomposition. Numerical experiments show +state-of-the-art performance of our Primal-Attention with improved efficiency. +Moreover, we demonstrate that the deployed KSVD optimization regularizes +Primal-Attention with a sharper singular value decay than that of the canonical +self-attention, further verifying the great potential of our method. To the +best of our knowledge, this is the first work that provides a primal-dual +representation for the asymmetric kernel in self-attention and successfully +applies it to modeling and optimization. + +
+
+ comment: NeurIPS 2023. We provide a primal-dual representation for the + asymmetric self-attention in transformer that allows to avoid explicit + computation of the kernel matrix +
+
+
+
+
+ + ♻ ☆ Efficient Deep Learning Models for Privacy-preserving People Counting on + Low-resolution Infrared Arrays + + +
+ Ultra-low-resolution Infrared (IR) array sensors offer a low-cost, +energy-efficient, and privacy-preserving solution for people counting, with +applications such as occupancy monitoring. Previous work has shown that Deep +Learning (DL) can yield superior performance on this task. However, the +literature was missing an extensive comparative analysis of various efficient +DL architectures for IR array-based people counting, that considers not only +their accuracy, but also the cost of deploying them on memory- and +energy-constrained Internet of Things (IoT) edge nodes. In this work, we +address this need by comparing 6 different DL architectures on a novel dataset +composed of IR images collected from a commercial 8x8 array, which we made +openly available. With a wide architectural exploration of each model type, we +obtain a rich set of Pareto-optimal solutions, spanning cross-validated +balanced accuracy scores in the 55.70-82.70% range. When deployed on a +commercial Microcontroller (MCU) by STMicroelectronics, the STM32L4A6ZG, these +models occupy 0.41-9.28kB of memory, and require 1.10-7.74ms per inference, +while consuming 17.18-120.43 $\mu$J of energy. Our models are significantly +more accurate than a previous deterministic method (up to +39.9%), while being +up to 3.53x faster and more energy efficient. Further, our models' accuracy is +comparable to state-of-the-art DL solutions on similar resolution sensors, +despite a much lower complexity. All our models enable continuous, real-time +inference on a MCU-based IoT node, with years of autonomous operation without +battery recharging. + +
+
+ comment: This article has been accepted for publication in IEEE Internet of + Things Journal; Fixed typos +
+
+
+
+
+ + ♻ ☆ Out-of-distribution Detection Learning with Unreliable + Out-of-distribution Sources NeurIPS 2023 + + +
+ Out-of-distribution (OOD) detection discerns OOD data where the predictor +cannot make valid predictions as in-distribution (ID) data, thereby increasing +the reliability of open-world classification. However, it is typically hard to +collect real out-of-distribution (OOD) data for training a predictor capable of +discerning ID and OOD patterns. This obstacle gives rise to data +generation-based learning methods, synthesizing OOD data via data generators +for predictor training without requiring any real OOD data. Related methods +typically pre-train a generator on ID data and adopt various selection +procedures to find those data likely to be the OOD cases. However, generated +data may still coincide with ID semantics, i.e., mistaken OOD generation +remains, confusing the predictor between ID and OOD data. To this end, we +suggest that generated data (with mistaken OOD generation) can be used to +devise an auxiliary OOD detection task to facilitate real OOD detection. +Specifically, we can ensure that learning from such an auxiliary task is +beneficial if the ID and the OOD parts have disjoint supports, with the help of +a well-designed training procedure for the predictor. Accordingly, we propose a +powerful data generation-based learning method named Auxiliary Task-based OOD +Learning (ATOL) that can relieve the mistaken OOD generation. We conduct +extensive experiments under various OOD detection setups, demonstrating the +effectiveness of our method against its advanced counterparts. + +
+
+ comment: Accepted by NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Discovering Interpretable Physical Models using Symbolic Regression and + Discrete Exterior Calculus + + +
+ Computational modeling is a key resource to gather insight into physical systems in modern scientific research and engineering. While access to large amounts of data has fueled the use of Machine Learning (ML) to recover physical models from experiments and increase the accuracy of physical simulations, purely data-driven models have limited generalization and interpretability. To overcome these limitations, we propose a framework that combines Symbolic Regression (SR) and Discrete Exterior Calculus (DEC) for the automated discovery of physical models starting from experimental data. Since these models consist of mathematical expressions, they are interpretable and amenable to analysis, and the use of a natural, general-purpose discrete mathematical language for physics favors generalization with limited input data. Importantly, DEC provides building blocks for the discrete analogue of field theories, which are beyond the state-of-the-art applications of SR to physical problems. Further, we show that DEC allows us to implement a strongly-typed SR procedure that guarantees the mathematical consistency of the recovered models and reduces the search space of symbolic expressions. Finally, we prove the effectiveness of our methodology by re-discovering three models of Continuum Physics from synthetic experimental data: the Poisson equation, Euler's Elastica, and the equations of Linear Elasticity. Thanks to their general-purpose nature, the methods developed in this paper may be applied to diverse contexts of physical modeling.
</p>
+
+
+
+
+ + ♻ ☆ T3D: Towards 3D Medical Image Understanding through Vision-Language + Pre-training + + +
+ Expert annotation of 3D medical images for downstream analysis is resource-intensive, posing challenges in clinical applications. Visual self-supervised learning (vSSL), though effective for learning visual invariance, neglects the incorporation of domain knowledge from medicine. To incorporate medical knowledge into visual representation learning, vision-language pre-training (VLP) has shown promising results on 2D images. However, existing VLP approaches become generally impractical when applied to high-resolution 3D medical images due to GPU hardware constraints and the potential loss of critical details caused by downsampling, which is the intuitive solution to hardware constraints. To address the above limitations, we introduce T3D, the first VLP framework designed for high-resolution 3D medical images. T3D incorporates two text-informed pretext tasks: (i) text-informed contrastive learning; (ii) text-informed image restoration. These tasks focus on learning 3D visual representations from high-resolution 3D medical images and integrating clinical knowledge from radiology reports, without distorting information through forced alignment of downsampled volumes with detailed anatomical text. Trained on a newly curated large-scale dataset of 3D medical images and radiology reports, T3D significantly outperforms current vSSL methods in tasks like organ and tumor segmentation, as well as disease classification. This underlines T3D's potential in representation learning for 3D medical image analysis. All data and code will be available upon acceptance.
</p>
+
+
+
+
+ + ♻ ☆ Investigating the Catastrophic Forgetting in Multimodal Large Language + Models + + +
+ Following the success of GPT4, there has been a surge in interest in +multimodal large language model (MLLM) research. This line of research focuses +on developing general-purpose LLMs through fine-tuning pre-trained LLMs and +vision models. However, catastrophic forgetting, a notorious phenomenon where +the fine-tuned model fails to retain similar performance compared to the +pre-trained model, still remains an inherent problem in multimodal LLMs (MLLM). +In this paper, we introduce EMT: Evaluating MulTimodality for evaluating the +catastrophic forgetting in MLLMs, by treating each MLLM as an image classifier. +We first apply EMT to evaluate several open-source fine-tuned MLLMs and we +discover that almost all evaluated MLLMs fail to retain the same performance +levels as their vision encoders on standard image classification tasks. +Moreover, we continue fine-tuning LLaVA, an MLLM and utilize EMT to assess +performance throughout the fine-tuning. Interestingly, our results suggest that +early-stage fine-tuning on an image dataset improves performance across other +image datasets, by enhancing the alignment of text and visual features. +However, as fine-tuning proceeds, the MLLMs begin to hallucinate, resulting in +a significant loss of generalizability, even when the image encoder remains +frozen. Our results suggest that MLLMs have yet to demonstrate performance on +par with their vision models on standard image classification tasks and the +current MLLM fine-tuning procedure still has room for improvement. + +
+
+
+
+
+ + ♻ ☆ Deep Knowledge Tracing is an implicit dynamic multidimensional item + response theory model + + +
+ Knowledge tracing consists of predicting the performance of some students on new questions given their performance on previous questions, and can be a prior step to optimizing assessment and learning. Deep knowledge tracing (DKT) is a competitive model for knowledge tracing relying on recurrent neural networks, even if some simpler models may match its performance. However, little is known about why DKT works so well. In this paper, we frame deep knowledge tracing as an encoder-decoder architecture. This viewpoint not only allows us to propose better models in terms of performance, simplicity or expressivity, but also opens up promising avenues for future research directions. In particular, we show on several small and large datasets that a simpler decoder, with possibly fewer parameters than the one used by DKT, can predict student performance better.
</p>
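The encoder-decoder framing can be made concrete in a few lines of PyTorch; the sketch below is an illustrative toy (the GRU encoder, the bilinear decoder, and all dimensions are our assumptions, not the paper's exact models): the encoder summarizes past (item, correctness) interactions into a student state, and a small decoder scores the next item.

```python
import torch
import torch.nn as nn

class EncoderDecoderKT(nn.Module):
    """Minimal sketch (illustrative, not the paper's exact model): knowledge
    tracing as an encoder-decoder. The encoder summarizes past (item, correct)
    interactions into a student state; the decoder is a simple bilinear scorer
    of that state against the next item's embedding."""

    def __init__(self, n_items, d=64):
        super().__init__()
        self.interaction_emb = nn.Embedding(2 * n_items, d)   # item x correctness
        self.item_emb = nn.Embedding(n_items, d)
        self.encoder = nn.GRU(d, d, batch_first=True)
        self.decoder = nn.Bilinear(d, d, 1)                   # a decoder smaller than DKT's head

    def forward(self, past_items, past_correct, next_items):
        x = self.interaction_emb(past_items * 2 + past_correct)     # (B, T, d)
        state, _ = self.encoder(x)                                  # (B, T, d)
        score = self.decoder(state[:, -1], self.item_emb(next_items))
        return torch.sigmoid(score).squeeze(-1)                     # P(correct) for next item

# Toy usage with random data.
model = EncoderDecoderKT(n_items=100)
items = torch.randint(0, 100, (8, 20))
correct = torch.randint(0, 2, (8, 20))
p_next = model(items, correct, torch.randint(0, 100, (8,)))
print(p_next.shape)   # torch.Size([8])
```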
+
+
+
+
+ + ♻ ☆ Cost-effective On-device Continual Learning over Memory Hierarchy with + Miro + + +
+ Continual learning (CL) trains NN models incrementally from a continuous +stream of tasks. To remember previously learned knowledge, prior studies store +old samples over a memory hierarchy and replay them when new tasks arrive. Edge +devices that adopt CL to preserve data privacy are typically energy-sensitive +and thus require high model accuracy while not compromising energy efficiency, +i.e., cost-effectiveness. Our work is the first to explore the design space of +hierarchical memory replay-based CL to gain insights into achieving +cost-effectiveness on edge devices. We present Miro, a novel system runtime +that carefully integrates our insights into the CL framework by enabling it to +dynamically configure the CL system based on resource states for the best +cost-effectiveness. To reach this goal, Miro also performs online profiling on +parameters with clear accuracy-energy trade-offs and adapts to optimal values +with low overhead. Extensive evaluations show that Miro significantly +outperforms baseline systems we build for comparison, consistently achieving +higher cost-effectiveness. + +
+
+ comment: This paper is published in the 29th Annual International Conference + on Mobile Computing and Networking (ACM MobiCom '23) +
+
+
+
+
+ + ♻ ☆ Regularization Trade-offs with Fake Features + + +
+ Recent successes of massively overparameterized models have inspired a new line of work investigating the underlying conditions that enable overparameterized models to generalize well. This paper considers a framework where the possibly overparametrized model includes fake features, i.e., features that are present in the model but not in the data. We present a non-asymptotic high-probability bound on the generalization error of the ridge regression problem under the model misspecification of having fake features. Our high-probability results provide insights into the interplay between the implicit regularization provided by the fake features and the explicit regularization provided by the ridge parameter. Numerical results illustrate this trade-off as the number of fake features varies, and show that the optimal ridge parameter may depend heavily on the number of fake features.
</p>
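A small simulation in the spirit of the abstract (all data-generation choices below are arbitrary illustrations, not the paper's setup) makes the interplay visible: as fake features are added they contribute implicit regularization, and the test-error-minimizing ridge parameter shifts with their number.

```python
import numpy as np

# Illustrative sketch: ridge regression where the fitted model includes `k`
# "fake" features absent from the data-generating process. Sweeping the ridge
# parameter shows how the best explicit regularization depends on k.
rng = np.random.default_rng(0)
n, d_true = 50, 20
X = rng.normal(size=(n, d_true))
w = rng.normal(size=d_true)
y = X @ w + 0.5 * rng.normal(size=n)

def test_error(k_fake, lam, n_test=2000):
    Xf = np.hstack([X, rng.normal(size=(n, k_fake))])           # add fake features
    w_hat = np.linalg.solve(Xf.T @ Xf + lam * np.eye(d_true + k_fake), Xf.T @ y)
    Xt = rng.normal(size=(n_test, d_true))
    Xt_f = np.hstack([Xt, rng.normal(size=(n_test, k_fake))])   # fake features at test time too
    return np.mean((Xt_f @ w_hat - Xt @ w) ** 2)

for k in (0, 50, 200):
    errs = {lam: test_error(k, lam) for lam in (1e-3, 1e-1, 1.0, 10.0)}
    best = min(errs, key=errs.get)
    print(f"{k:4d} fake features -> best ridge lambda {best:g} (test err {errs[best]:.3f})")
```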
+
+
+
+
+ + ♻ ☆ LongLoRA: Efficient Fine-tuning of Long-Context Large Language Models + + +
+ We present LongLoRA, an efficient fine-tuning approach that extends the context sizes of pre-trained large language models (LLMs), with limited computation cost. Typically, training LLMs with long context sizes is computationally expensive, requiring extensive training hours and GPU resources. For example, training on a context length of 8192 requires 16x the computational cost in self-attention layers compared to a context length of 2048. In this paper, we speed up the context extension of LLMs in two aspects. On the one hand, although dense global attention is needed during inference, fine-tuning the model can be effectively and efficiently done by sparse local attention. The proposed shifted sparse attention (S$^2$-Attn) effectively enables context extension, leading to non-trivial computation savings with similar performance to fine-tuning with vanilla attention. In particular, it can be implemented with only two lines of code in training, while being optional in inference. On the other hand, we revisit the parameter-efficient fine-tuning regime for context expansion. Notably, we find that LoRA for context extension works well under the premise of trainable embedding and normalization. LongLoRA combines this improved LoRA with S$^2$-Attn. LongLoRA demonstrates strong empirical results on various tasks on Llama2 models from 7B/13B to 70B. LongLoRA extends Llama2 7B from 4k context to 100k, or Llama2 70B to 32k, on a single 8x A100 machine. LongLoRA extends models' context while retaining their original architectures, and is compatible with most existing techniques, like Flash-Attention2. In addition, we further conduct supervised fine-tuning with LongLoRA and our long instruction-following LongAlpaca dataset.
</p>
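The shifted sparse attention idea can be sketched compactly (an illustrative re-implementation under our own assumptions, not the released LongLoRA code): attention is computed inside local token groups, and half of the heads operate on a sequence rolled by half a group so information can cross group boundaries.

```python
import torch
import torch.nn.functional as F

def shifted_sparse_attention(q, k, v, group_size):
    """Toy sketch of the S^2-Attn idea (illustrative, not the released code).
    q, k, v: (batch, heads, seq_len, head_dim). Attention is computed within
    local groups of `group_size` tokens; half of the heads first roll the
    sequence by group_size // 2 so information flows across group borders."""
    B, H, T, D = q.shape
    shift = group_size // 2

    def grouped(x, do_shift):
        if do_shift:
            x = torch.roll(x, shifts=-shift, dims=2)
        # (B, H/2, T, D) -> (B * T/group, H/2, group, D)
        return x.reshape(B, H // 2, T // group_size, group_size, D) \
                .permute(0, 2, 1, 3, 4).reshape(-1, H // 2, group_size, D)

    outs = []
    for half, do_shift in ((slice(0, H // 2), False), (slice(H // 2, H), True)):
        o = F.scaled_dot_product_attention(grouped(q[:, half], do_shift),
                                           grouped(k[:, half], do_shift),
                                           grouped(v[:, half], do_shift))
        o = o.reshape(B, T // group_size, H // 2, group_size, D) \
             .permute(0, 2, 1, 3, 4).reshape(B, H // 2, T, D)
        if do_shift:
            o = torch.roll(o, shifts=shift, dims=2)   # undo the shift
        outs.append(o)
    return torch.cat(outs, dim=1)

out = shifted_sparse_attention(*(torch.randn(2, 8, 64, 16) for _ in range(3)), group_size=16)
print(out.shape)  # torch.Size([2, 8, 64, 16])
```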
+
+ comment: Code, models, dataset, and demo are available at + https://github.com/dvlab-research/LongLoRA +
+
+
+
+
+ + ♻ ☆ Stackelberg Driver Model for Continual Policy Improvement in + Scenario-Based Closed-Loop Autonomous Driving + + +
+ The deployment of autonomous vehicles (AVs) has faced hurdles due to the +dominance of rare but critical corner cases within the long-tail distribution +of driving scenarios, which negatively affects their overall performance. To +address this challenge, adversarial generation methods have emerged as a class +of efficient approaches to synthesize safety-critical scenarios for AV testing. +However, these generated scenarios are often underutilized for AV training, +resulting in the potential for continual AV policy improvement remaining +untapped, along with a deficiency in the closed-loop design needed to achieve +it. Therefore, we tailor the Stackelberg Driver Model (SDM) to accurately +characterize the hierarchical nature of vehicle interaction dynamics, +facilitating iterative improvement by engaging background vehicles (BVs) and AV +in a sequential game-like interaction paradigm. With AV acting as the leader +and BVs as followers, this leader-follower modeling ensures that AV would +consistently refine its policy, always taking into account the additional +information that BVs play the best response to challenge AV. Extensive +experiments have shown that our algorithm exhibits superior performance +compared to several baselines especially in higher dimensional scenarios, +leading to substantial advancements in AV capabilities while continually +generating progressively challenging scenarios. Code is available at +https://github.com/BlueCat-de/SDM. + +
+
+
+
+
+ + ♻ ☆ DeepInception: Hypnotize Large Language Model to Be Jailbreaker + + +
+ Despite remarkable success in various applications, large language models (LLMs) are vulnerable to adversarial jailbreaks that render the safety guardrails void. However, previous studies of jailbreaks usually resort to brute-force optimization or extrapolation with high computational cost, which might not be practical or effective. In this paper, inspired by the Milgram experiment, in which individuals can harm another person if they are told to do so by an authoritative figure, we disclose a lightweight method, termed DeepInception, which can easily hypnotize an LLM to be a jailbreaker and unlock its misuse risks. Specifically, DeepInception leverages the personification ability of an LLM to construct a novel nested scene for it to act in, which realizes an adaptive way to escape the usage control in a normal scenario and provides the possibility for further direct jailbreaks. Empirically, we conduct comprehensive experiments to show its efficacy. Our DeepInception can achieve competitive jailbreak success rates with previous counterparts and realize a continuous jailbreak in subsequent interactions, which reveals the critical weakness of self-losing on both open- and closed-source LLMs like Falcon, Vicuna, Llama-2, and GPT-3.5/4/4V. Our investigation calls for more attention to the safety aspects of LLMs and a stronger defense against their misuse risks. The code is publicly available at: https://github.com/tmlr-group/DeepInception.
</p>
+
+
+
+
+ + ♻ ☆ Exploring Weight Balancing on Long-Tailed Recognition Problem + + +
+ Recognition problems in long-tailed data, in which the sample size per class +is heavily skewed, have gained importance because the distribution of the +sample size per class in a dataset is generally exponential unless the sample +size is intentionally adjusted. Various methods have been devised to address +these problems. Recently, weight balancing, which combines well-known classical +regularization techniques with two-stage training, has been proposed. Despite +its simplicity, it is known for its high performance compared with existing +methods devised in various ways. However, there is a lack of understanding as +to why this method is effective for long-tailed data. In this study, we analyze +weight balancing by focusing on neural collapse and the cone effect at each +training stage and found that it can be decomposed into an increase in Fisher's +discriminant ratio of the feature extractor caused by weight decay and cross +entropy loss and implicit logit adjustment caused by weight decay and +class-balanced loss. Our analysis enables the training method to be further +simplified by reducing the number of training stages to one while increasing +accuracy. + +
+
+
+
+
+ + ♻ ☆ Continual Learning with Distributed Optimization: Does CoCoA Forget? + + +
+ We focus on the continual learning problem where the tasks arrive +sequentially and the aim is to perform well on the newly arrived task without +performance degradation on the previously seen tasks. In contrast to the +continual learning literature focusing on the centralized setting, we +investigate the distributed estimation framework. We consider the +well-established distributed learning algorithm COCOA. We derive closed form +expressions for the iterations for the overparametrized case. We illustrate the +convergence and the error performance of the algorithm based on the +over/under-parameterization of the problem. Our results show that depending on +the problem dimensions and data generation assumptions, COCOA can perform +continual learning over a sequence of tasks, i.e., it can learn a new task +without forgetting previously learned tasks, with access only to one task at a +time. + +
+
+
+
+
+ + ♻ ☆ PipeOptim: Ensuring Effective 1F1B Schedule with Optimizer-Dependent + Weight Prediction + + +
+ Asynchronous pipeline model parallelism with a "1F1B" (one forward, one backward) schedule introduces little bubble overhead and consistently provides high throughput. However, the "1F1B" schedule inevitably leads to weight inconsistency and weight staleness issues due to the cross-training of different mini-batches across GPUs. To simultaneously address these two problems, in this paper, we propose an optimizer-dependent weight prediction strategy (a.k.a. PipeOptim) for asynchronous pipeline training. The key insight of our proposal is that we employ a weight prediction strategy in the forward pass to ensure that each mini-batch uses consistent and staleness-free weights to compute the forward pass. Concretely, we first construct the weight prediction scheme based on the update rule of the optimizer used when training the deep neural network models. Then, throughout the "1F1B" pipelined training, each mini-batch is mandated to execute weight prediction ahead of the forward pass, subsequently employing the predicted weights to perform the forward pass. As a result, PipeOptim 1) inherits the advantage of the "1F1B" schedule and delivers high throughput, and 2) ensures effective parameter learning regardless of the optimizer used. To verify the effectiveness of our proposal, we conducted extensive experimental evaluations using eight different deep-learning models spanning three machine-learning tasks: image classification, sentiment analysis, and machine translation. The experimental results demonstrate that PipeOptim outperforms popular pipelined approaches including GPipe, PipeDream, PipeDream-2BW, and SpecTrain. The code of PipeOptim is available at https://github.com/guanleics/PipeOptim.
</p>
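A highly simplified sketch of the weight-prediction idea for SGD with momentum (our own illustration with made-up names, not the PipeOptim implementation): before its forward pass, a stage extrapolates the weights it expects to hold once the in-flight updates have landed, approximating the future gradients by the current momentum direction.

```python
import torch

def predict_weights_sgd_momentum(params, momenta, lr, mu, staleness):
    """Illustrative sketch (not the PipeOptim code): predict the weights a
    pipeline stage will hold after `staleness` pending SGD-with-momentum
    updates, by extrapolating along the current momentum buffers:
        w_hat = w - lr * (mu + mu^2 + ... + mu^staleness) * m
    """
    factor = sum(mu ** k for k in range(1, staleness + 1))
    return [w - lr * factor * m for w, m in zip(params, momenta)]

# Usage sketch: run the forward pass with the predicted weights, while the
# optimizer keeps updating the true weights as gradients arrive.
w = [torch.randn(4, 4)]
m = [torch.randn(4, 4)]
w_hat = predict_weights_sgd_momentum(w, m, lr=0.1, mu=0.9, staleness=2)
print(w_hat[0].shape)
```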
+
+ comment: 14 pages +
+
+
+
+
+ + ♻ ☆ Comparative Analysis of CPU and GPU Profiling for Deep Learning Models + + +
+ Deep Learning (DL) and Machine Learning (ML) applications have been growing rapidly in recent years. Massive amounts of data are being generated over the internet, from which ML and DL algorithms can derive meaningful results. Hardware resources and open-source libraries have made it easy to implement these algorithms. TensorFlow and PyTorch are among the leading frameworks for implementing ML projects. Using these frameworks, we can trace the operations executed on both the GPU and the CPU to analyze resource allocation and consumption. This paper presents the time and memory allocation of the CPU and GPU while training deep neural networks using PyTorch. Our analysis shows that the GPU has a lower running time than the CPU for deep neural networks. For simpler networks, the GPU offers no significant improvement over the CPU.
</p>
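The kind of measurement described here can be reproduced with PyTorch's built-in profiler; the snippet below is a minimal sketch (the model, batch size, and sorting key are placeholders, not the paper's experimental setup) that profiles one forward/backward pass on the CPU and, if available, on the GPU.

```python
import torch
from torch.profiler import profile, ProfilerActivity

# Minimal sketch: profile one forward/backward pass of a small placeholder
# model and print per-operator time and memory on CPU vs CUDA.
model = torch.nn.Sequential(torch.nn.Linear(512, 512), torch.nn.ReLU(),
                            torch.nn.Linear(512, 10))
x, y = torch.randn(256, 512), torch.randint(0, 10, (256,))
loss_fn = torch.nn.CrossEntropyLoss()

def one_step(device):
    m, xb, yb = model.to(device), x.to(device), y.to(device)
    activities = [ProfilerActivity.CPU] + (
        [ProfilerActivity.CUDA] if device == "cuda" else [])
    with profile(activities=activities, profile_memory=True) as prof:
        loss_fn(m(xb), yb).backward()
    print(prof.key_averages().table(sort_by="self_cpu_time_total", row_limit=5))

one_step("cpu")
if torch.cuda.is_available():
    one_step("cuda")
```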
+
+ comment: 6 pages, 11 figures +
+
+
+
+
+ + ♻ ☆ Byzantine-Robust Distributed Online Learning: Taming Adversarial + Participants in An Adversarial Environment + + +
+ This paper studies distributed online learning under Byzantine attacks. The +performance of an online learning algorithm is often characterized by +(adversarial) regret, which evaluates the quality of one-step-ahead +decision-making when an environment provides adversarial losses, and a +sublinear bound is preferred. But we prove that, even with a class of +state-of-the-art robust aggregation rules, in an adversarial environment and in +the presence of Byzantine participants, distributed online gradient descent can +only achieve a linear adversarial regret bound, which is tight. This is the +inevitable consequence of Byzantine attacks, even though we can control the +constant of the linear adversarial regret to a reasonable level. Interestingly, +when the environment is not fully adversarial so that the losses of the honest +participants are i.i.d. (independent and identically distributed), we show that +sublinear stochastic regret, in contrast to the aforementioned adversarial +regret, is possible. We develop a Byzantine-robust distributed online momentum +algorithm to attain such a sublinear stochastic regret bound. Extensive +numerical experiments corroborate our theoretical analysis. + +
+
+
+
+
+ + ♻ ☆ Generating Molecular Conformer Fields + + +
+ In this paper we tackle the problem of generating conformers of a molecule in +3D space given its molecular graph. We parameterize these conformers as +continuous functions that map elements from the molecular graph to points in 3D +space. We then formulate the problem of learning to generate conformers as +learning a distribution over these functions using a diffusion generative +model, called Molecular Conformer Fields (MCF). Our approach is simple and +scalable, and achieves state-of-the-art performance on challenging molecular +conformer generation benchmarks while making no assumptions about the explicit +structure of molecules (e.g. modeling torsional angles). MCF represents an +advance in extending diffusion models to handle complex scientific problems in +a conceptually simple, scalable and effective manner. + +
+
+ comment: 20 pages, 11 figures. arXiv admin note: text overlap with + arXiv:2305.15586 +
+
+
+
+
+ + ♻ ☆ Deep Learning-Driven Enhancement of Welding Quality Control: Predicting + Welding Depth and Pore Volume in Hairpin Welding + + +
+ To advance quality assurance in the welding process, this study presents a +robust deep learning model that enables the prediction of two critical welds +Key Performance Characteristics (KPCs): welding depth and average pore volume. +In the proposed approach, a comprehensive range of laser welding Key Input +Characteristics (KICs) is utilized, including welding beam geometries, welding +feed rates, path repetitions for weld beam geometries, and bright light weld +ratios for all paths, all of which were obtained from hairpin welding +experiments. Two deep learning networks are employed with multiple hidden dense +layers and linear activation functions to showcase the capabilities of deep +neural networks in capturing the intricate nonlinear connections inherent +within welding KPCs and KICs. Applying deep learning networks to the small +numerical experimental hairpin welding dataset has shown promising results, +achieving Mean Absolute Error (MAE) values as low as 0.1079 for predicting +welding depth and 0.0641 for average pore volume. Additionally, the validity +verification demonstrates the reliability of the proposed method. This, in +turn, promises significant advantages in controlling welding outcomes, moving +beyond the current trend of relying merely on monitoring for defect +classification. + +
+
+
+
+
+ + ♻ ☆ Language Agent Tree Search Unifies Reasoning Acting and Planning in + Language Models + + +
+ While large language models (LLMs) have demonstrated impressive performance +on a range of decision-making tasks, they rely on simple acting processes and +fall short of broad deployment as autonomous agents. We introduce LATS +(Language Agent Tree Search), a general framework that synergizes the +capabilities of LLMs in planning, acting, and reasoning. Drawing inspiration +from Monte Carlo tree search in model-based reinforcement learning, LATS +employs LLMs as agents, value functions, and optimizers, repurposing their +latent strengths for enhanced decision-making. What is crucial in this method +is the use of an environment for external feedback, which offers a more +deliberate and adaptive problem-solving mechanism that moves beyond the +limitations of existing techniques. Our experimental evaluation across diverse +domains, such as programming, HotPotQA, and WebShop, illustrates the +applicability of LATS for both reasoning and acting. In particular, LATS +achieves 94.4% for programming on HumanEval with GPT-4 and an average score of +75.9 for web browsing on WebShop with GPT-3.5, demonstrating the effectiveness +and generality of our method. + +
+
+ comment: Website and code can be found at + https://andyz245.github.io/LanguageAgentTreeSearch +
+
+
+
+
+ + ♻ ☆ Pitfall of Optimism: Distributional Reinforcement Learning by + Randomizing Risk Criterion NeurIPS 2023 + + +
+ Distributional reinforcement learning algorithms have attempted to utilize estimated uncertainty for exploration, such as optimism in the face of uncertainty. However, using the estimated variance for optimistic exploration may cause biased data collection and hinder convergence or performance. In this paper, we present a novel distributional reinforcement learning algorithm that selects actions by randomizing the risk criterion to avoid a one-sided tendency on risk. We provide a perturbed distributional Bellman optimality operator by distorting the risk measure, and prove the convergence and optimality of the proposed method with a weaker contraction property. Our theoretical results support that the proposed method does not fall into biased exploration and is guaranteed to converge to an optimal return. Finally, we empirically show that our method outperforms other existing distribution-based algorithms in various environments, including 55 Atari games.
</p>
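To illustrate the action-selection idea, here is a toy sketch under our own assumptions (the specific distortion, CVaR at a uniformly drawn level, is one concrete choice rather than the paper's exact perturbation): instead of always acting on a fixed optimistic statistic of the learned return distribution, the risk criterion is re-randomized at every decision.

```python
import numpy as np

def select_action(quantiles, rng):
    """Toy sketch (illustrative, not the paper's exact algorithm): `quantiles`
    has shape (n_actions, n_quantiles) and represents each action's estimated
    return distribution. Draw a risk level per decision and act greedily under
    the corresponding distorted expectation (here: CVaR at a random level)."""
    n_actions, n_q = quantiles.shape
    alpha = rng.uniform(0.1, 1.0)                            # randomized risk level
    k = max(1, int(np.ceil(alpha * n_q)))
    cvar = np.sort(quantiles, axis=1)[:, :k].mean(axis=1)    # mean of worst alpha-fraction
    return int(np.argmax(cvar))

rng = np.random.default_rng(0)
q_estimates = rng.normal(size=(4, 32))                       # 4 actions, 32 quantile estimates
print(select_action(q_estimates, rng))
```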
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Motion Informed Needle Segmentation in Ultrasound Images + + +
+ Segmenting a moving needle in ultrasound images is challenging due to the presence of artifacts, noise, and needle occlusion. This task becomes even more demanding in scenarios where data availability is limited. Convolutional Neural Networks (CNNs) have been successful in many computer vision applications, but struggle to accurately segment needles without considering their motion. In this paper, we present a novel approach for needle segmentation that combines classical Kalman Filter (KF) techniques with data-driven learning, incorporating both needle features and needle motion. Our method offers three key contributions. First, we propose a compatible framework that seamlessly integrates into commonly used encoder-decoder style architectures. Second, we demonstrate superior performance compared to recent state-of-the-art needle segmentation models using our novel convolutional neural network (CNN) based KF-inspired block, achieving a 15\% reduction in pixel-wise needle tip error and an 8\% reduction in length error. Third, to our knowledge, we are the first to implement a learnable filter that incorporates non-linear needle motion to improve needle segmentation.
</p>
+
+ comment: 7 pages, 4 figures, under review +
+
+
+
+
+ + ♻ ☆ Real-Time Surface-to-Air Missile Engagement Zone Prediction Using + Simulation and Machine Learning + + +
+ Surface-to-Air Missiles (SAMs) are crucial in modern air defense systems. A +critical aspect of their effectiveness is the Engagement Zone (EZ), the spatial +region within which a SAM can effectively engage and neutralize a target. +Notably, the EZ is intrinsically related to the missile's maximum range; it +defines the furthest distance at which a missile can intercept a target. The +accurate computation of this EZ is essential but challenging due to the dynamic +and complex factors involved, which often lead to high computational costs and +extended processing times when using conventional simulation methods. In light +of these challenges, our study investigates the potential of machine learning +techniques, proposing an approach that integrates machine learning with a +custom-designed simulation tool to train supervised algorithms. We leverage a +comprehensive dataset of pre-computed SAM EZ simulations, enabling our model to +accurately predict the SAM EZ for new input parameters. It accelerates SAM EZ +simulations, enhances air defense strategic planning, and provides real-time +insights, improving SAM system performance. The study also includes a +comparative analysis of machine learning algorithms, illuminating their +capabilities and performance metrics and suggesting areas for future research, +highlighting the transformative potential of machine learning in SAM EZ +simulations. + +
+
+
+
+
+ + ♻ ☆ VideoDubber: Machine Translation with Speech-Aware Length Control for + Video Dubbing AAAI 2023 + + +
+ Video dubbing aims to translate the original speech in a film or television program into speech in a target language, which can be achieved with a cascaded system consisting of speech recognition, machine translation and speech synthesis. To ensure that the translated speech is well aligned with the corresponding video, the length/duration of the translated speech should be as close as possible to that of the original speech, which requires strict length control. Previous works usually control the number of words or characters generated by the machine translation model to be similar to the source sentence, without considering the isochronicity of speech, as the speech duration of words/characters in different languages varies. In this paper, we propose a machine translation system tailored for the task of video dubbing, which directly considers the speech duration of each token in translation, to match the length of source and target speech. Specifically, we control the speech length of the generated sentence by guiding the prediction of each word with duration information, including the speech duration of the word itself as well as how much duration is left for the remaining words. We design experiments on four language directions (German -> English, Spanish -> English, Chinese <-> English), and the results show that the proposed method achieves better length control ability on the generated speech than baseline methods. To make up for the lack of real-world datasets, we also construct a real-world test set collected from films to provide comprehensive evaluations on the video dubbing task.
</p>
+
+ comment: AAAI 2023 camera version +
+
+
+
+
+ + ♻ ☆ Neural Priming for Sample-Efficient Adaptation + + +
+ We propose Neural Priming, a technique for adapting large pretrained models +to distribution shifts and downstream tasks given few or no labeled examples. +Presented with class names or unlabeled test samples, Neural Priming enables +the model to recall and conditions its parameters on relevant data seen +throughout pretraining, thereby priming it for the test distribution. Neural +Priming can be performed at test time, even for pretraining datasets as large +as LAION-2B. Performing lightweight updates on the recalled data significantly +improves accuracy across a variety of distribution shift and transfer learning +benchmarks. Concretely, in the zero-shot setting, we see a 2.45% improvement in +accuracy on ImageNet and 3.81% accuracy improvement on average across standard +transfer learning benchmarks. Further, using Neural Priming at inference to +adapt to distribution shift, we see a 1.41% accuracy improvement on ImageNetV2. +These results demonstrate the effectiveness of Neural Priming in addressing the +challenge of limited labeled data and changing distributions. Code is available +at github.com/RAIVNLab/neural-priming. + +
+
+ comment: 18 pages, 7 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ MOTOR: A Time-To-Event Foundation Model For Structured Medical Records + + +
+ We present a self-supervised, time-to-event (TTE) foundation model called MOTOR (Many Outcome Time Oriented Representations) which is pretrained on timestamped sequences of events in electronic health records (EHR) and health insurance claims. TTE models are used for estimating the probability distribution of the time until a specific event occurs, which is an important task in medical settings. TTE models provide many advantages over classification using fixed time horizons, including naturally handling censored observations, but are challenging to train with limited labeled data. MOTOR addresses this challenge by pretraining on up to 55M patient records (9B clinical events). We evaluate MOTOR's transfer learning performance on 19 tasks, across 3 patient databases (a private EHR system, MIMIC-IV, and Merative claims data). Task-specific models adapted from MOTOR improve time-dependent C statistics by 4.6% over the state of the art, improve label efficiency by up to 95%, and are more robust to temporal distributional shifts. We further evaluate cross-site portability by adapting our MOTOR foundation model for six prediction tasks on the MIMIC-IV dataset, where it outperforms all baselines. MOTOR is the first foundation model for medical TTE predictions and we release a 143M parameter pretrained model for research use at [redacted URL].
</p>
+
+
+
+
+ + ♻ ☆ T-Cal: An optimal test for the calibration of predictive models + + +
+ The prediction accuracy of machine learning methods is steadily increasing, +but the calibration of their uncertainty predictions poses a significant +challenge. Numerous works focus on obtaining well-calibrated predictive models, +but less is known about reliably assessing model calibration. This limits our +ability to know when algorithms for improving calibration have a real effect, +and when their improvements are merely artifacts due to random noise in finite +datasets. In this work, we consider detecting mis-calibration of predictive +models using a finite validation dataset as a hypothesis testing problem. The +null hypothesis is that the predictive model is calibrated, while the +alternative hypothesis is that the deviation from calibration is sufficiently +large. + We find that detecting mis-calibration is only possible when the conditional +probabilities of the classes are sufficiently smooth functions of the +predictions. When the conditional class probabilities are H\"older continuous, +we propose T-Cal, a minimax optimal test for calibration based on a debiased +plug-in estimator of the $\ell_2$-Expected Calibration Error (ECE). We further +propose Adaptive T-Cal, a version that is adaptive to unknown smoothness. We +verify our theoretical findings with a broad range of experiments, including +with several popular deep neural net architectures and several standard +post-hoc calibration methods. T-Cal is a practical general-purpose tool, which +-- combined with classical tests for discrete-valued predictors -- can be used +to test the calibration of virtually any probabilistic classification method. + +
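For reference, the quantity under test can be estimated with the standard binned plug-in estimator of the squared l2-ECE shown below. This is only the naive estimator that T-Cal debiases; the bin count and synthetic data are illustrative, and the paper's debiasing and test thresholds are not reproduced here.

```python
import numpy as np

def binned_l2_ece_sq(probs, labels, n_bins=15):
    """Naive plug-in estimate of the squared l2-ECE for binary predictions
    (the quantity T-Cal debiases; this is not the paper's debiased version).
    `probs` are predicted P(y=1), `labels` are in {0, 1}."""
    bins = np.minimum((probs * n_bins).astype(int), n_bins - 1)
    ece_sq = 0.0
    for b in range(n_bins):
        mask = bins == b
        if mask.any():
            gap = probs[mask].mean() - labels[mask].mean()
            ece_sq += mask.mean() * gap ** 2
    return ece_sq

rng = np.random.default_rng(0)
p = rng.uniform(size=10_000)
y = rng.binomial(1, np.clip(p + 0.05, 0, 1))      # slightly mis-calibrated predictions
print(f"plug-in l2-ECE^2: {binned_l2_ece_sq(p, y):.5f}")
```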
+
+ comment: The implementation of T-Cal is available at + https://github.com/dh7401/T-Cal +
+
+
+
+
+ + ♻ ☆ Convolutional layers are equivariant to discrete shifts but not + continuous translations + + +
+ The purpose of this short and simple note is to clarify a common +misconception about convolutional neural networks (CNNs). CNNs are made up of +convolutional layers which are shift equivariant due to weight sharing. +However, convolutional layers are not translation equivariant, even when +boundary effects are ignored and when pooling and subsampling are absent. This +is because shift equivariance is a discrete symmetry while translation +equivariance is a continuous symmetry. This fact is well known among +researchers in equivariant machine learning, but is usually overlooked among +non-experts. To minimize confusion, we suggest using the term `shift +equivariance' to refer to discrete shifts in pixels and `translation +equivariance' to refer to continuous translations. + +
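The distinction can be checked numerically. The sketch below (kernel size, padding mode, and the interpolation-based translation are our own illustrative choices) verifies that a convolutional layer commutes with an integer pixel shift under circular padding, while the two orders of operation no longer agree for a sub-pixel translation implemented with bilinear resampling.

```python
import torch
import torch.nn.functional as F

# Small numerical check (illustrative setup): a conv layer commutes with an
# integer pixel shift under circular padding, but not with a sub-pixel
# translation implemented by bilinear resampling (whose interpolation and
# boundary handling are not a circular shift).
torch.manual_seed(0)
conv = torch.nn.Conv2d(1, 1, kernel_size=3, padding=1,
                       padding_mode="circular", bias=False)
img = torch.rand(1, 1, 32, 32)

def translate_x(x, dx):
    """Horizontal translation by dx (in normalized [-1, 1] coordinates)."""
    theta = torch.tensor([[[1.0, 0.0, dx], [0.0, 1.0, 0.0]]])
    grid = F.affine_grid(theta, list(x.shape), align_corners=False)
    return F.grid_sample(x, grid, padding_mode="border", align_corners=False)

with torch.no_grad():
    # Discrete shift: the two orders agree up to floating-point rounding.
    shift_gap = (conv(torch.roll(img, 3, dims=-1))
                 - torch.roll(conv(img), 3, dims=-1)).abs().max()
    # Sub-pixel translation (2.5 pixels): the two orders no longer agree.
    dx = 2.5 * 2 / 32
    trans_gap = (conv(translate_x(img, dx)) - translate_x(conv(img), dx)).abs().max()

print(f"integer-shift gap:         {shift_gap:.2e}")   # ~ float rounding error
print(f"sub-pixel translation gap: {trans_gap:.2e}")   # clearly nonzero
```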
+
+
+
+
+
+
+
+ + Multimedia 6 + +
+
+
+ + ☆ AV2AV: Direct Audio-Visual Speech to Audio-Visual Speech Translation + with Unified Audio-Visual Speech Representation + + +
+ This paper proposes a novel direct Audio-Visual Speech to Audio-Visual Speech +Translation (AV2AV) framework, where the input and output of the system are +multimodal (i.e., audio and visual speech). With the proposed AV2AV, two key +advantages can be brought: 1) We can perform real-like conversations with +individuals worldwide in a virtual meeting by utilizing our own primary +languages. In contrast to Speech-to-Speech Translation (A2A), which solely +translates between audio modalities, the proposed AV2AV directly translates +between audio-visual speech. This capability enhances the dialogue experience +by presenting synchronized lip movements along with the translated speech. 2) +We can improve the robustness of the spoken language translation system. By +employing the complementary information of audio-visual speech, the system can +effectively translate spoken language even in the presence of acoustic noise, +showcasing robust performance. To mitigate the problem of the absence of a +parallel AV2AV translation dataset, we propose to train our spoken language +translation system with the audio-only dataset of A2A. This is done by learning +unified audio-visual speech representations through self-supervised learning in +advance to train the translation system. Moreover, we propose an AV-Renderer +that can generate raw audio and video in parallel. It is designed with +zero-shot speaker modeling, thus the speaker in source audio-visual speech can +be maintained at the target translated audio-visual speech. The effectiveness +of AV2AV is evaluated with extensive experiments in a many-to-many language +translation setting. The demo page is available on +https://choijeongsoo.github.io/av2av. + +
+
+
+
+
+ + ♻ ☆ Investigation of UAV Detection in Images with Complex Backgrounds and + Rainy Artifacts + + +
+ To detect unmanned aerial vehicles (UAVs) in real-time, computer vision and +deep learning approaches are evolving research areas. Interest in this problem +has grown due to concerns regarding the possible hazards and misuse of +employing UAVs in many applications. These include potential privacy +violations. To address the concerns, vision-based object detection methods have +been developed for UAV detection. However, UAV detection in images with complex +backgrounds and weather artifacts like rain has yet to be reasonably studied. +Hence, for this purpose, we prepared two training datasets. The first dataset +has the sky as its background and is called the Sky Background Dataset (SBD). +The second training dataset has more complex scenes (with diverse backgrounds) +and is named the Complex Background Dataset (CBD). Additionally, two test sets +were prepared: one containing clear images and the other with images with three +rain artifacts, named the Rainy Test Set (RTS). This work also focuses on +benchmarking state-of-the-art object detection models, and to the best of our +knowledge, it is the first to investigate the performance of recent and popular +vision-based object detection methods for UAV detection under challenging +conditions such as complex backgrounds, varying UAV sizes, and low-to-heavy +rainy conditions. The findings presented in the paper shall help provide +insights concerning the performance of the selected models for UAV detection +under challenging conditions and pave the way to develop more robust UAV +detection methods. The codes and datasets are available at: +https://github.com/AdnanMunir294/UAVD-CBRA. + +
+
+ comment: Accepted at the Real-World Surveillance Workshop, IEEE/CVF Winter + Conference on Applications of Computer Vision 2024 +
+
+
+
+
+ + ♻ ☆ RS5M: A Large Scale Vision-Language Dataset for Remote Sensing + Vision-Language Foundation Model + + +
+ Pre-trained Vision-Language Models (VLMs) utilizing extensive image-text paired data have demonstrated unprecedented image-text association capabilities, achieving remarkable results across various downstream tasks. A critical challenge is how to make use of existing large-scale pre-trained VLMs, which are trained on common objects, to perform the domain-specific transfer for accomplishing domain-related downstream tasks. In this paper, we propose a new framework that includes the Domain pre-trained Vision-Language Model (DVLM), bridging the gap between the General Vision-Language Model (GVLM) and domain-specific downstream tasks. Moreover, we present an image-text paired dataset in the field of remote sensing (RS), RS5M, which has 5 million RS images with English descriptions. The dataset is obtained by filtering publicly available image-text paired datasets and captioning label-only RS datasets with a pre-trained VLM. It constitutes the first large-scale RS image-text paired dataset. Additionally, we fine-tuned the CLIP model and tried several Parameter-Efficient Fine-Tuning methods on RS5M to implement the DVLM. Experimental results show that our proposed dataset is highly effective for various tasks, and our model GeoRSCLIP improves upon the baseline or previous state-of-the-art model by $3\%\sim20\%$ in Zero-shot Classification (ZSC), $3\%\sim6\%$ in Remote Sensing Cross-Modal Text-Image Retrieval (RSCTIR) and $4\%\sim5\%$ in Semantic Localization (SeLo) tasks. Dataset and models have been released in: \url{https://github.com/om-ai-lab/RS5M}.
</p>
+
+ comment: RS5M dataset v5 +
+
+
+
+
+ + ♻ ☆ Rethinking Radiology Report Generation via Causal Reasoning and + Counterfactual Augmentation + + +
+ Radiology Report Generation (RRG) draws attention as an interaction between the vision and language fields. Previous works inherited the ideology of vision-to-language generation tasks, aiming to generate paragraphs with high consistency as reports. However, one unique characteristic of RRG, the independence between diseases, was neglected, leading to the injection of disease co-occurrence as a confounder that affects the results through a backdoor path. Unfortunately, this confounder further confounds the report generation process because of the biased RRG data distribution. In this paper, to rethink this issue thoroughly, we reason about its causes and effects from a novel perspective of statistics and causality, where the Joint Vision Coupling and the Conditional Sentence Coherence Coupling are two aspects prone to implicitly decrease the accuracy of reports. Then, a counterfactual augmentation strategy that contains the Counterfactual Sample Synthesis and the Counterfactual Report Reconstruction sub-methods is proposed to break these two aspects of spurious effects. Experimental results and further analyses on two widely used datasets justify our reasoning and proposed methods.
</p>
+
+ comment: 10 pages,5 figures +
+
+
+
+
+ + ♻ ☆ Out-of-distribution Detection Learning with Unreliable + Out-of-distribution Sources NeurIPS 2023 + + +
+ Out-of-distribution (OOD) detection discerns OOD data where the predictor +cannot make valid predictions as in-distribution (ID) data, thereby increasing +the reliability of open-world classification. However, it is typically hard to +collect real out-of-distribution (OOD) data for training a predictor capable of +discerning ID and OOD patterns. This obstacle gives rise to data +generation-based learning methods, synthesizing OOD data via data generators +for predictor training without requiring any real OOD data. Related methods +typically pre-train a generator on ID data and adopt various selection +procedures to find those data likely to be the OOD cases. However, generated +data may still coincide with ID semantics, i.e., mistaken OOD generation +remains, confusing the predictor between ID and OOD data. To this end, we +suggest that generated data (with mistaken OOD generation) can be used to +devise an auxiliary OOD detection task to facilitate real OOD detection. +Specifically, we can ensure that learning from such an auxiliary task is +beneficial if the ID and the OOD parts have disjoint supports, with the help of +a well-designed training procedure for the predictor. Accordingly, we propose a +powerful data generation-based learning method named Auxiliary Task-based OOD +Learning (ATOL) that can relieve the mistaken OOD generation. We conduct +extensive experiments under various OOD detection setups, demonstrating the +effectiveness of our method against its advanced counterparts. + +
+
+ comment: Accepted by NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ VideoDubber: Machine Translation with Speech-Aware Length Control for + Video Dubbing AAAI 2023 + + +
+ Video dubbing aims to translate the original speech in a film or television program into speech in a target language, which can be achieved with a cascaded system consisting of speech recognition, machine translation and speech synthesis. To ensure that the translated speech is well aligned with the corresponding video, the length/duration of the translated speech should be as close as possible to that of the original speech, which requires strict length control. Previous works usually control the number of words or characters generated by the machine translation model to be similar to the source sentence, without considering the isochronicity of speech, as the speech duration of words/characters in different languages varies. In this paper, we propose a machine translation system tailored for the task of video dubbing, which directly considers the speech duration of each token in translation, to match the length of source and target speech. Specifically, we control the speech length of the generated sentence by guiding the prediction of each word with duration information, including the speech duration of the word itself as well as how much duration is left for the remaining words. We design experiments on four language directions (German -> English, Spanish -> English, Chinese <-> English), and the results show that the proposed method achieves better length control ability on the generated speech than baseline methods. To make up for the lack of real-world datasets, we also construct a real-world test set collected from films to provide comprehensive evaluations on the video dubbing task.
</p>
+
+ comment: AAAI 2023 camera version +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 70 + +
+
+
+ + ☆ Generative Powers of Ten + + +
+ We present a method that uses a text-to-image model to generate consistent +content across multiple image scales, enabling extreme semantic zooms into a +scene, e.g., ranging from a wide-angle landscape view of a forest to a macro +shot of an insect sitting on one of the tree branches. We achieve this through +a joint multi-scale diffusion sampling approach that encourages consistency +across different scales while preserving the integrity of each individual +sampling process. Since each generated scale is guided by a different text +prompt, our method enables deeper levels of zoom than traditional +super-resolution methods that may struggle to create new contextual structure +at vastly different scales. We compare our method qualitatively with +alternative techniques in image super-resolution and outpainting, and show that +our method is most effective at generating consistent multi-scale content. + +
+
+ comment: Project page: https://powers-of-10.github.io/ +
+
+
+
+
+ + ☆ Competition-Level Problems Are Effective Evaluators of LLMs + + +
+ Large language models (LLMs) have demonstrated impressive reasoning capabilities, yet there is ongoing debate about these abilities and the potential data contamination problem. This paper aims to evaluate the reasoning capacities of LLMs, specifically in solving recent competition-level programming problems in Codeforces, which are expert-crafted and unique, requiring deep understanding and robust reasoning skills. We first provide a comprehensive evaluation of GPT-4's perceived zero-shot performance on this task, considering various aspects such as problems' release time, difficulties, and types of errors encountered. Surprisingly, the perceived performance of GPT-4 has experienced a cliff-like decline on problems released after September 2021, consistently across all difficulties and types of problems, which shows the potential data contamination, as well as the challenges for any existing LLM to solve unseen complex reasoning problems. We further explore various approaches such as fine-tuning, Chain-of-Thought prompting and problem description simplification; unfortunately, none of them is able to consistently mitigate the challenges. Through our work, we emphasize the importance of this excellent data source for assessing the genuine reasoning capabilities of LLMs, and foster the development of LLMs with stronger reasoning abilities and better generalization in the future.
</p>
+
+ comment: In progress +
+
+
+
+
+ + ☆ TPPoet: Transformer-Based Persian Poem Generation using Minimal Data and + Advanced Decoding Techniques + + +
+ Recent advances in language models (LMs) have demonstrated significant +efficacy in tasks related to the arts and humanities. While LMs have exhibited +exceptional performance across a wide range of natural language processing +tasks, there are notable challenges associated with their utilization on small +datasets and their ability to replicate more creative human capacities. In this +study, we aim to address these challenges by training a Persian classical +poetry generation model using a transformer architecture on a specialized +dataset with no pretraining. Additionally, we propose a novel decoding method +to enhance coherence and meaningfulness in the generated poetry, effectively +managing the tradeoff between diversity and quality. Furthermore, our training +approach and the proposed decoding method are evaluated through a +comprehensive set of automatic and human evaluations, which show their superior +capability to generate coherent and meaningful poetry compared to other +decoding methods and an existing Persian large language model (LLM). + 
+
+
+
+
+ + ☆ Magicoder: Source Code Is All You Need + + +
+ We introduce Magicoder, a series of fully open-source (code, weights, and +data) Large Language Models (LLMs) for code that significantly closes the gap +with top code models while having no more than 7B parameters. Magicoder models +are trained on 75K synthetic instruction data using OSS-Instruct, a novel +approach to enlightening LLMs with open-source code snippets to generate +high-quality instruction data for code. Our main motivation is to mitigate the +inherent bias of the synthetic data generated by LLMs by empowering them with a +wealth of open-source references for the production of more diverse, realistic, +and controllable data. The orthogonality of OSS-Instruct and other data +generation methods like Evol-Instruct further enables us to build an enhanced +MagicoderS. Both Magicoder and MagicoderS substantially outperform +state-of-the-art code models with similar or even larger sizes on a wide range +of coding benchmarks, including Python text-to-code generation, multilingual +coding, and data-science program completion. Notably, MagicoderS-CL-7B based on +CodeLlama even surpasses the prominent ChatGPT on HumanEval+ (66.5 vs. 65.9 in +pass@1). Overall, OSS-Instruct opens a new direction for low-bias and +high-quality instruction tuning using abundant open-source references. + +
+
+
+
+
+ + ☆ Tree of Attacks: Jailbreaking Black-Box LLMs Automatically + + +
+ While Large Language Models (LLMs) display versatile functionality, they +continue to generate harmful, biased, and toxic content, as demonstrated by the +prevalence of human-designed jailbreaks. In this work, we present Tree of +Attacks with Pruning (TAP), an automated method for generating jailbreaks that +only requires black-box access to the target LLM. TAP utilizes an LLM to +iteratively refine candidate (attack) prompts using tree-of-thoughts reasoning +until one of the generated prompts jailbreaks the target. Crucially, before +sending prompts to the target, TAP assesses them and prunes the ones unlikely +to result in jailbreaks. Using tree-of-thought reasoning allows TAP to navigate +a large search space of prompts and pruning reduces the total number of queries +sent to the target. In empirical evaluations, we observe that TAP generates +prompts that jailbreak state-of-the-art LLMs (including GPT4 and GPT4-Turbo) +for more than 80% of the prompts using only a small number of queries. This +significantly improves upon the previous state-of-the-art black-box method for +generating jailbreaks. + +
+
+ comment: An implementation of the presented method is available at + https://github.com/RICommunity/TAP +
+
+
+
+
+ + ☆ When it Rains, it Pours: Modeling Media Storms and the News Ecosystem EMNLP 2023 + + +
+ Most events in the world receive at most brief coverage by the news media. +Occasionally, however, an event will trigger a media storm, with voluminous and +widespread coverage lasting for weeks instead of days. In this work, we develop +and apply a pairwise article similarity model, allowing us to identify story +clusters in corpora covering local and national online news, and thereby create +a comprehensive corpus of media storms over a nearly two year period. Using +this corpus, we investigate media storms at a new level of granularity, +allowing us to validate claims about storm evolution and topical distribution, +and provide empirical support for previously hypothesized patterns of influence +of storms on media coverage and intermedia agenda setting. + +
+
+ comment: Findings of EMNLP 2023; 16 pages; 12 figures; 4 tables +
+
+
+
+
+ + ☆ A Glitch in the Matrix? Locating and Detecting Language Model Grounding + with Fakepedia + + +
+ Large language models (LLMs) have demonstrated impressive capabilities in +storing and recalling factual knowledge, but also in adapting to novel +in-context information. Yet, the mechanisms underlying their in-context +grounding remain unknown, especially in situations where in-context information +contradicts factual knowledge embedded in the parameters. This is critical for +retrieval-augmented generation methods, which enrich the context with +up-to-date information, hoping that grounding can rectify the outdated +parametric knowledge. In this study, we introduce Fakepedia, a counterfactual +dataset designed to evaluate grounding abilities when the parametric knowledge +clashes with the in-context information. We benchmark various LLMs with +Fakepedia and discover that GPT-4-turbo has a strong preference for its +parametric knowledge. Mistral-7B, on the contrary, is the model that most +robustly chooses the grounded answer. Then, we conduct causal mediation +analysis on LLM components when answering Fakepedia queries. We demonstrate +that inspection of the computational graph alone can predict LLM grounding with +92.8% accuracy, especially because few MLPs in the Transformer can predict +non-grounded behavior. Our results, together with existing findings about +factual recall mechanisms, provide a coherent narrative of how grounding and +factual recall mechanisms interact within LLMs. + +
+
+
+
+
+ + ☆ Know Your Audience: Do LLMs Adapt to Different Age and Education Levels? + + +
+ Large language models (LLMs) offer a range of new possibilities, including +adapting the text to different audiences and their reading needs. But how well +do they adapt? We evaluate the readability of answers generated by four +state-of-the-art LLMs (commercial and open-source) to science questions when +prompted to target different age groups and education levels. To assess the +adaptability of LLMs to diverse audiences, we compare the readability scores of +the generated responses against the recommended comprehension level of each age +and education group. We find large variations in the readability of the answers +by different LLMs. Our results suggest LLM answers need to be better adapted to +the intended audience demographics to be more comprehensible. They underline +the importance of enhancing the adaptability of LLMs in education settings to +cater to diverse age and education levels. Overall, current LLMs have set +readability ranges and do not adapt well to different audiences, even when +prompted. That limits their potential for educational purposes. + +
+
+
+
+
+ + ☆ TimeChat: A Time-sensitive Multimodal Large Language Model for Long + Video Understanding + + +
+ This work proposes TimeChat, a time-sensitive multimodal large language model +specifically designed for long video understanding. Our model incorporates two +key architectural contributions: (1) a timestamp-aware frame encoder that binds +visual content with the timestamp of each frame, and (2) a sliding video +Q-Former that produces a video token sequence of varying lengths to accommodate +videos of various durations. Additionally, we construct an instruction-tuning +dataset, encompassing 6 tasks and a total of 125K instances, to further enhance +TimeChat's instruction-following performance. Experiment results across various +video understanding tasks, such as dense captioning, temporal grounding, and +highlight detection, demonstrate TimeChat's strong zero-shot temporal +localization and reasoning capabilities. For example, it achieves +9.2 F1 score +and +2.8 CIDEr on YouCook2, +5.8 HIT@1 on QVHighlights, and +27.5 R@1 (IoU=0.5) +on Charades-STA, compared to state-of-the-art video large language models, +holding the potential to serve as a versatile video assistant for long-form +video comprehension tasks and satisfy realistic user requirements. + +
+
+ comment: 17 pages, 10 figures, code is available at + https://github.com/RenShuhuai-Andy/TimeChat +
+
+
+
+
+ + ☆ Distilled Self-Critique of LLMs with Synthetic Data: a Bayesian + Perspective ICLR 2024 + + +
+ This paper proposes an interpretation of RLAIF as Bayesian inference by +introducing distilled Self-Critique (dSC), which refines the outputs of an LLM +through a Gibbs sampler that is later distilled into a fine-tuned model. Requiring +only synthetic data, dSC is exercised in experiments regarding safety, +sentiment, and privacy control, showing it can be a viable and cheap +alternative to align LLMs. Code released at +\url{https://github.com/vicgalle/distilled-self-critique}. + 
+
+ comment: Submitted to ICLR 2024 (TinyPapers track) +
+
+
+
+
+ + ☆ Zero- and Few-Shots Knowledge Graph Triplet Extraction with Large + Language Models + + +
+ In this work, we tested the Triplet Extraction (TE) capabilities of a variety +of Large Language Models (LLMs) of different sizes in the Zero- and Few-Shots +settings. In detail, we proposed a pipeline that dynamically gathers contextual +information from a Knowledge Base (KB), both in the form of context triplets +and of (sentence, triplets) pairs as examples, and provides it to the LLM +through a prompt. The additional context allowed the LLMs to be competitive +with all the older fully trained baselines based on the Bidirectional Long +Short-Term Memory (BiLSTM) Network architecture. We further conducted a +detailed analysis of the quality of the gathered KB context, finding it to be +strongly correlated with the final TE performance of the model. In contrast, +the size of the model appeared to only logarithmically improve the TE +capabilities of the LLMs. + +
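To make the pipeline concrete, here is a minimal sketch of the kind of prompt it could assemble: retrieved context triplets plus (sentence, triplets) demonstrations ahead of the target sentence. The retrieval step and the LLM call are omitted, and all names and example facts are illustrative rather than taken from the paper.

```python
# Hypothetical prompt builder for KB-augmented triplet extraction.
def build_te_prompt(sentence, context_triplets, demonstrations):
    lines = ["Extract (subject, relation, object) triplets from the sentence."]
    lines.append("Relevant background triplets from the knowledge base:")
    for s, r, o in context_triplets:
        lines.append(f"  ({s}, {r}, {o})")
    for demo_sentence, demo_triplets in demonstrations:  # few-shot examples
        lines.append(f"Sentence: {demo_sentence}")
        lines.append("Triplets: " + "; ".join(f"({s}, {r}, {o})" for s, r, o in demo_triplets))
    lines.append(f"Sentence: {sentence}")
    lines.append("Triplets:")
    return "\n".join(lines)

print(build_te_prompt(
    "Marie Curie was born in Warsaw.",
    context_triplets=[("Warsaw", "country", "Poland")],
    demonstrations=[("Paris is the capital of France.",
                     [("Paris", "capital of", "France")])],
))
```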
+
+
+
+
+ + ☆ A Machine Learning Approach Towards SKILL Code Autocompletion SP + + +
+ As Moore's Law continues to increase the complexity of electronic systems, +Electronic Design Automation (EDA) must advance to meet global demand. An +important example of an EDA technology is SKILL, a scripting language used to +customize and extend EDA software. Recently, code generation models using the +transformer architecture have achieved impressive results in academic settings +and have even been used in commercial developer tools to improve developer +productivity. To the best of our knowledge, this study is the first to apply +transformers to SKILL code autocompletion towards improving the productivity of +hardware design engineers. In this study, a novel, data-efficient methodology +for generating SKILL code is proposed and experimentally validated. More +specifically, we propose a novel methodology for (i) creating a high-quality +SKILL dataset with both unlabeled and labeled data, (ii) a training strategy +where T5 models pre-trained on general programming language code are fine-tuned +on our custom SKILL dataset using unsupervised and supervised learning, and +(iii) evaluating synthesized SKILL code. We show that models trained using the +proposed methodology outperform baselines in terms of human-judgment score and +BLEU score. A major challenge faced was the extremely small amount of available +SKILL code data that can be used to train a transformer model to generate SKILL +code. Despite our validated improvements, the extremely small dataset available +to us was still not enough to train a model that can reliably autocomplete +SKILL code. We discuss this and other limitations as well as future work that +could address these limitations. + +
+
+ comment: Accepted for SPIE Advanced Lithography + Patterning, 2024 +
+
+
+
+
+ + ☆ Evaluating Dependencies in Fact Editing for Language Models: Specificity + and Implication Awareness EMNLP2023 + + +
+ The potential of using a large language model (LLM) as a knowledge base (KB) +has sparked significant interest. To manage the knowledge acquired by LLMs, we +need to ensure that the editing of learned facts respects internal logical +constraints, which are known as dependency of knowledge. Existing work on +editing LLMs has partially addressed the issue of dependency, when the editing +of a fact should apply to its lexical variations without disrupting irrelevant +ones. However, they neglect the dependency between a fact and its logical +implications. We propose an evaluation protocol with an accompanying +question-answering dataset, DepEdit, that provides a comprehensive assessment +of the editing process considering the above notions of dependency. Our +protocol involves setting up a controlled environment in which we edit facts +and monitor their impact on LLMs, along with their implications based on +If-Then rules. Extensive experiments on DepEdit show that existing knowledge +editing methods are sensitive to the surface form of knowledge, and that they +have limited performance in inferring the implications of edited facts. + +
+
+ comment: Findings of EMNLP2023 +
+
+
+
+
+ + ☆ Prompting Disentangled Embeddings for Knowledge Graph Completion with + Pre-trained Language Model + + +
+ Both graph structures and textual information play a critical role in +Knowledge Graph Completion (KGC). With the success of Pre-trained Language +Models (PLMs) such as BERT, they have been applied for text encoding for KGC. +However, the current methods mostly prefer to fine-tune PLMs, leading to huge +training costs and limited scalability to larger PLMs. In contrast, we propose +to utilize prompts and perform KGC on a frozen PLM with only the prompts +trained. Accordingly, we propose a new KGC method named PDKGC with two prompts +-- a hard task prompt which is to adapt the KGC task to the PLM pre-training +task of token prediction, and a disentangled structure prompt which learns +disentangled graph representation so as to enable the PLM to combine more +relevant structure knowledge with the text information. With the two prompts, +PDKGC builds a textual predictor and a structural predictor, respectively, and +their combination leads to more comprehensive entity prediction. Solid +evaluation on two widely used KGC datasets has shown that PDKGC often +outperforms the baselines including the state-of-the-art, and its components +are all effective. Our codes and data are available at +https://github.com/genggengcss/PDKGC. + +
+
+ comment: under review +
+
+
+
+
+ + ☆ Exchange-of-Thought: Enhancing Large Language Model Capabilities through + Cross-Model Communication EMNLP2023 + + +
+ Large Language Models (LLMs) have recently made significant strides in +complex reasoning tasks through the Chain-of-Thought technique. Despite this +progress, their reasoning is often constrained by their intrinsic +understanding, lacking external insights. To address this, we propose +Exchange-of-Thought (EoT), a novel framework that enables cross-model +communication during problem-solving. Drawing inspiration from network +topology, EoT integrates four unique communication paradigms: Memory, Report, +Relay, and Debate. This paper delves into the communication dynamics and volume +associated with each paradigm. To counterbalance the risks of incorrect +reasoning chains, we implement a robust confidence evaluation mechanism within +these communications. Our experiments across diverse complex reasoning tasks +demonstrate that EoT significantly surpasses established baselines, +underscoring the value of external insights in enhancing LLM performance. +Furthermore, we show that EoT achieves these superior results in a +cost-effective manner, marking a promising advancement for efficient and +collaborative AI problem-solving. + +
+
+ comment: 19 pages, 11 figures, accepted by EMNLP2023 +
+
+
+
+
+ + ☆ Developing Linguistic Patterns to Mitigate Inherent Human Bias in + Offensive Language Detection + + +
+ With the proliferation of social media, there has been a sharp increase in +offensive content, particularly targeting vulnerable groups, exacerbating +social problems such as hatred, racism, and sexism. Detecting offensive +language use is crucial to prevent offensive language from being widely shared +on social media. However, the accurate detection of irony, implication, and +various forms of hate speech on social media remains a challenge. Natural +language-based deep learning models require extensive training with large, +comprehensive, and labeled datasets. Unfortunately, manually creating such +datasets is both costly and error-prone. Additionally, the presence of +human-bias in offensive language datasets is a major concern for deep learning +models. In this paper, we propose a linguistic data augmentation approach to +reduce bias in labeling processes, which aims to mitigate the influence of +human bias by leveraging the power of machines to improve the accuracy and +fairness of labeling processes. This approach has the potential to improve +offensive language classification tasks across multiple languages and reduce +the prevalence of offensive content on social media. + +
+
+
+
+
+ + ☆ Retrieval-augmented Multi-modal Chain-of-Thoughts Reasoning for Large + Language Models + + +
+ The advancement of Large Language Models (LLMs) has brought substantial +attention to the Chain of Thought (CoT) approach, primarily due to its ability +to enhance the capability of LLMs on tasks requiring complex reasoning. +Moreover, the significance of CoT approaches extends to the application of LLMs +for multi-modal tasks, such as multi-modal question answering. However, the +selection of optimal CoT demonstration examples in multi-modal reasoning +remains less explored for LLMs due to the inherent complexity of +multi-modal examples. In this paper, we introduce a novel approach that +addresses this challenge by using retrieval mechanisms to dynamically and +automatically select demonstration examples based on cross-modal similarities. +This method aims to refine the CoT reasoning process in multi-modal scenarios +by informing LLMs with more relevant and informative examples. Furthermore, we +employ a stratified sampling method, categorising demonstration examples into +groups based on their types and retrieving examples from different groups +respectively, to promote the diversity of demonstration examples. Through a +series of experiments, we demonstrate that our approach significantly improves +the performance of LLMs, achieving state-of-the-art results in multi-modal +reasoning tasks. Specifically, our methods demonstrate significant advancements +on the ScienceQA dataset. While our method based on ChatGPT outperforms +Chameleon (ChatGPT) by 2.74% with an accuracy of 82.67%, the GPT-4-based approach +surpasses Chameleon (GPT-4) by 0.89%, achieving 87.43% accuracy under the +same setting. Moreover, our best-performing variants show a 6.05% increase over +Chameleon for ChatGPT-based models and a 4.57% increase for GPT-4-based models. + 
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Mitigating Fine-Grained Hallucination by Fine-Tuning Large + Vision-Language Models with Caption Rewrites + + +
+ Large language models (LLMs) have shown remarkable performance in natural +language processing (NLP) tasks. To comprehend and execute diverse human +instructions over image data, instruction-tuned large vision-language models +(LVLMs) have been introduced. However, while LVLMs may suffer from different types of +object hallucinations, they are currently evaluated only for coarse-grained +object hallucinations (i.e., generated objects non-existent in the input +image). The fine-grained object attributes and behaviors non-existent in the +image may still be generated but not measured by the current evaluation +methods. In this paper, we thus focus on reducing fine-grained hallucinations +of LVLMs. We propose \textit{ReCaption}, a framework that consists of two +components: rewriting captions using ChatGPT and fine-tuning the +instruction-tuned LVLMs on the rewritten captions. We also propose a +fine-grained probing-based evaluation method named \textit{Fine-Grained Object +Hallucination Evaluation} (\textit{FGHE}). Our experimental results demonstrate +that ReCaption effectively reduces fine-grained object hallucination for +different LVLM options and improves their text generation quality. The code can +be found at https://github.com/Anonymousanoy/FOHE. + 
+
+ comment: MMM 2024 +
+
+
+
+
+ + ☆ Data Management For Large Language Models: A Survey + + +
+ Data plays a fundamental role in the training of Large Language Models +(LLMs). Effective data management, particularly in the formulation of a +well-suited training dataset, holds significance for enhancing model +performance and improving training efficiency during pretraining and supervised +fine-tuning phases. Despite the considerable importance of data management, the +current research community still falls short in providing a systematic analysis +of the rationale behind management strategy selection, its consequential +effects, methodologies for evaluating curated datasets, and the ongoing pursuit +of improved strategies. Consequently, the exploration of data management has +attracted more and more attention among the research community. This survey +provides a comprehensive overview of current research in data management within +both the pretraining and supervised fine-tuning stages of LLMs, covering +various noteworthy aspects of data management strategy design: data quantity, +data quality, domain/task composition, etc. Looking toward the future, we +extrapolate existing challenges and outline promising directions for +development in this field. Therefore, this survey serves as a guiding resource +for practitioners aspiring to construct powerful LLMs through effective data +management practices. The collection of the latest papers is available at +https://github.com/ZigeW/data_management_LLM. + +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Jellyfish: A Large Language Model for Data Preprocessing + + +
+ In this paper, we present Jellyfish, an open-source LLM as a universal task +solver for data preprocessing (DP). Built on the Llama 2 13B model, Jellyfish is instruction-tuned +with the datasets of several typical DP tasks including error detection, data +imputation, schema matching, and entity matching, and delivers generalizability +to other tasks. Remarkably, Jellyfish can operate on a local, single, and +low-priced GPU with its 13 billion parameters, ensuring data security and +enabling further tuning. Its proficiency in understanding natural language +allows users to manually craft instructions for DP tasks. Unlike many existing +methods that heavily rely on prior knowledge, Jellyfish acquires domain +knowledge during its tuning process and integrates optional knowledge injection +during inference. A distinctive feature of Jellyfish is its interpreter, which +elucidates its output decisions. To construct Jellyfish, we develop a series of +pre-tuning and DP-tuning techniques. Jellyfish is equipped with an instance +serializer, which automatically translates raw data into model prompts, and a +knowledge injector, which optionally introduces task- and dataset-specific +knowledge to enhance DP performance. Our evaluation of Jellyfish, using a range +of real datasets, shows its competitiveness compared to state-of-the-art +methods and its strong generalizability to unseen tasks. Jellyfish's +performance rivals that of GPT series models, and its interpreter offers +enhanced reasoning capabilities compared to GPT-3.5. Furthermore, our +evaluation highlights the effectiveness of the techniques employed in +constructing Jellyfish. Our model is available at Hugging Face: +https://huggingface.co/NECOUDBFM/Jellyfish . + 
+
+
+
+
+ + ☆ STADEE: STAtistics-based DEEp Detection of Machine Generated Text + + +
+ We present STADEE, a \textbf{STA}tistics-based \textbf{DEE}p detection method +to identify machine-generated text, addressing the limitations of current +methods that rely heavily on fine-tuning pre-trained language models (PLMs). +STADEE integrates key statistical text features with a deep classifier, +focusing on aspects like token probability and cumulative probability, crucial +for handling nucleus sampling. Tested across diverse datasets and scenarios +(in-domain, out-of-domain, and in-the-wild), STADEE demonstrates superior +performance, achieving an 87.05% F1 score in-domain and outperforming both +traditional statistical methods and fine-tuned PLMs, especially in +out-of-domain and in-the-wild settings, highlighting its effectiveness and +generalizability. + +
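The statistical features named above (per-token probability and cumulative probability, the latter relevant for nucleus sampling) can be computed with any public causal LM. The sketch below uses GPT-2 purely as an illustrative scoring model; STADEE's actual feature set and deep classifier are described in the paper.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("gpt2")
lm = AutoModelForCausalLM.from_pretrained("gpt2").eval()

def token_stats(text: str):
    """Per-token probability and the cumulative probability mass ranked at or
    above each observed token, as inputs for a downstream detection classifier."""
    ids = tok(text, return_tensors="pt").input_ids
    with torch.no_grad():
        logits = lm(ids).logits[0, :-1]                # predictions for tokens 1..n-1
    probs = torch.softmax(logits, dim=-1)
    targets = ids[0, 1:]
    p_tok = probs.gather(-1, targets.unsqueeze(-1)).squeeze(-1)
    sorted_probs, order = probs.sort(dim=-1, descending=True)
    ranks = (order == targets.unsqueeze(-1)).nonzero()[:, 1]
    cum_p = sorted_probs.cumsum(dim=-1).gather(-1, ranks.unsqueeze(-1)).squeeze(-1)
    return p_tok, cum_p

p, c = token_stats("The quick brown fox jumps over the lazy dog.")
print(p.shape, c.shape)
```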
+
+
+
+
+ + ☆ ChatGPT as a Math Questioner? Evaluating ChatGPT on Generating + Pre-university Math Questions + + +
+ Mathematical questioning is crucial for assessing students' problem-solving +skills. Since manually creating such questions requires substantial effort, +automatic methods have been explored. Existing state-of-the-art models rely on +fine-tuning strategies and struggle to generate questions that heavily involve +multiple steps of logical and arithmetic reasoning. Meanwhile, large language +models (LLMs) such as ChatGPT have excelled in many NLP tasks involving logical +and arithmetic reasoning. Nonetheless, their applications in generating +educational questions are underutilized, especially in the field of +mathematics. To bridge this gap, we take the first step to conduct an in-depth +analysis of ChatGPT in generating pre-university math questions. Our analysis +is categorized into two main settings: context-aware and context-unaware. In +the context-aware setting, we evaluate ChatGPT on existing math +question-answering benchmarks covering elementary, secondary, and tertiary +classes. In the context-unaware setting, we evaluate ChatGPT in generating math +questions for each lesson from pre-university math curriculums that we crawl. +Our crawling results in TopicMath, a comprehensive and novel collection of +pre-university math curriculums collected from 121 math topics and 428 lessons +from elementary, secondary, and tertiary classes. Through this analysis, we aim +to provide insight into the potential of ChatGPT as a math questioner. + 
+
+ comment: Accepted at the 39th ACM/SIGAPP Symposium On Applied Computing (SAC + 2024), Main Conference +
+
+
+
+
+ + ☆ Characterizing Large Language Model Geometry Solves Toxicity Detection + and Generation + + +
+ Large Language Models~(LLMs) drive current AI breakthroughs despite very +little being known about their internal representations, e.g., how to extract a +few informative features to solve various downstream tasks. To provide a +practical and principled answer, we propose to characterize LLMs from a +geometric perspective. We obtain in closed form (i) the intrinsic dimension in +which the Multi-Head Attention embeddings are constrained to exist and (ii) the +partition and per-region affine mappings of the per-layer feedforward networks. +Our results are informative, do not rely on approximations, and are actionable. +First, we show that, motivated by our geometric interpretation, we can bypass +Llama$2$'s RLHF by controlling its embedding's intrinsic dimension through +informed prompt manipulation. Second, we derive $7$ interpretable spline +features that can be extracted from any (pre-trained) LLM layer, providing a +rich abstract representation of their inputs. Those features alone ($224$ for +Mistral-7B and Llama$2$-7B) are sufficient to help solve toxicity detection, +infer the domain of the prompt, and even tackle the Jigsaw challenge, which +aims at characterizing the type of toxicity of various prompts. Our results +demonstrate how, even in large-scale regimes, exact theoretical results can +answer practical questions in language models. Code: +\url{https://github.com/RandallBalestriero/SplineLLM}. + +
+
+
+
+
+ + ☆ Voice-Based Smart Assistant System for Vehicles using RASA + + +
+ Conversational AIs, or chatbots, mimic human speech when conversing. Smart +assistants facilitate the automation of several tasks that needed human +intervention earlier. Because of their accuracy, absence of dependence on human +resources, and accessibility around the clock, chatbots can be employed in +vehicles too. Due to people's propensity to divert their attention away from +the task of driving while engaging in other activities like calling, playing +music, navigation, and getting updates on the weather forecast and latest news, +road safety has declined and accidents have increased as a result. It would be +advantageous to automate these tasks using voice commands rather than carrying +them out manually. This paper focuses on the development of a voice-based smart +assistance application for vehicles based on the RASA framework. The smart +assistant provides functionalities like navigation, communication via calls, +getting weather forecasts and the latest news updates, and music that are +completely voice-based in nature. + +
+
+ comment: 6 pages, 4 figures, accepted at IEEE International Conference on + Computational Intelligence, Networks and Security ICCINS-2023 +
+
+
+
+
+ + ☆ Expand BERT Representation with Visual Information via Grounded Language + Learning with Multimodal Partial Alignment + + +
+ Language models have been supervised with both language-only objective and +visual grounding in existing studies of visual-grounded language learning. +However, due to differences in the distribution and scale of visual-grounded +datasets and language corpora, the language model tends to mix up the context +of the tokens that occurred in the grounded data with those that do not. As a +result, during representation learning, there is a mismatch between the visual +information and the contextual meaning of the sentence. To overcome this +limitation, we propose GroundedBERT - a grounded language learning method that +enhances the BERT representation with visually grounded information. +GroundedBERT comprises two components: (i) the original BERT which captures the +contextual representation of words learned from the language corpora, and (ii) +a visual grounding module which captures visual information learned from +visual-grounded datasets. Moreover, we employ Optimal Transport (OT), +specifically its partial variant, to solve the fractional alignment problem +between the two modalities. Our proposed method significantly outperforms the +baseline language models on various language tasks of the GLUE and SQuAD +datasets. + +
+
+
+
+
+ + ☆ Explaining with Contrastive Phrasal Highlighting: A Case Study in + Assisting Humans to Detect Translation Differences EMNLP 2023 + + +
+ Explainable NLP techniques primarily explain by answering "Which tokens in +the input are responsible for this prediction?''. We argue that for NLP models +that make predictions by comparing two input texts, it is more useful to +explain by answering "What differences between the two inputs explain this +prediction?''. We introduce a technique to generate contrastive highlights that +explain the predictions of a semantic divergence model via +phrase-alignment-guided erasure. We show that the resulting highlights match +human rationales of cross-lingual semantic differences better than popular +post-hoc saliency techniques and that they successfully help people detect +fine-grained meaning differences in human translations and critical machine +translation errors. + +
+
+ comment: EMNLP 2023 +
+
+
+
+
+ + ☆ A Challenging Multimodal Video Summary: Simultaneously Extracting and + Generating Keyframe-Caption Pairs from Video + + +
+ This paper proposes a practical multimodal video summarization task setting +and a dataset to train and evaluate the task. The target task involves +summarizing a given video into a predefined number of keyframe-caption pairs +and displaying them in a listable format to grasp the video content quickly. +This task aims to extract crucial scenes from the video in the form of images +(keyframes) and generate corresponding captions explaining each keyframe's +situation. This task is useful as a practical application and presents a highly +challenging problem worthy of study. Specifically, achieving simultaneous +optimization of the keyframe selection performance and caption quality +necessitates careful consideration of the mutual dependence on both preceding +and subsequent keyframes and captions. To facilitate subsequent research in +this field, we also construct a dataset by expanding upon existing datasets and +propose an evaluation framework. Furthermore, we develop two baseline systems +and report their respective performance. + +
+
+
+
+
+ + ☆ APoLLo: Unified Adapter and Prompt Learning for Vision Language Models EMNLP 2023 + + +
+ The choice of input text prompt plays a critical role in the performance of +Vision-Language Pretrained (VLP) models such as CLIP. We present APoLLo, a +unified multi-modal approach that combines Adapter and Prompt learning for +Vision-Language models. Our method is designed to substantially improve the +generalization capabilities of VLP models when they are fine-tuned in a +few-shot setting. We introduce trainable cross-attention-based adapter layers +in conjunction with vision and language encoders to strengthen the alignment +between the two modalities. We enforce consistency between the respective +encoder branches (receiving augmented inputs) to prevent overfitting in +downstream tasks. Our method is evaluated on three representative tasks: +generalization to novel classes, cross-dataset evaluation, and unseen domain +shifts. In practice, APoLLo achieves a relative gain up to 6.03% over MaPLe +(SOTA) on novel classes for 10 diverse image recognition datasets. + +
+
+ comment: Accepted at EMNLP 2023 (Main track) +
+
+
+
+
+ + ☆ The Unlocking Spell on Base LLMs: Rethinking Alignment via In-Context + Learning + + +
+ The alignment tuning process of large language models (LLMs) typically +involves instruction learning through supervised fine-tuning (SFT) and +preference tuning via reinforcement learning from human feedback (RLHF). A +recent study, LIMA (Zhou et al. 2023), shows that using merely 1K examples for +SFT can achieve significant alignment performance as well, suggesting that the +effect of alignment tuning might be "superficial." This raises questions about +how exactly the alignment tuning transforms a base LLM. + We analyze the effect of alignment tuning by examining the token distribution +shift between base LLMs and their aligned counterparts. Our findings reveal that +base LLMs and their alignment-tuned versions perform nearly identically in +decoding on the majority of token positions. Most distribution shifts occur +with stylistic tokens. This direct evidence strongly supports the Superficial +Alignment Hypothesis suggested by LIMA. + Based on these findings, we rethink the alignment of LLMs by posing the +research question: how effectively can we align base LLMs without SFT or RLHF? +To address this, we introduce a simple, tuning-free alignment method, URIAL. +URIAL achieves effective alignment purely through in-context learning (ICL) +with base LLMs, requiring as few as three constant stylistic examples and a +system prompt. We conduct a fine-grained and interpretable evaluation on a +diverse set of examples, named JUST-EVAL-INSTRUCT. Results demonstrate that +base LLMs with URIAL can match or even surpass the performance of LLMs aligned +with SFT or SFT+RLHF. We show that the gap between tuning-free and tuning-based +alignment methods can be significantly reduced through strategic prompting and +ICL. Our findings on the superficial nature of alignment tuning and results +with URIAL suggest that deeper analysis and theoretical understanding of +alignment is crucial to future LLM research. + 
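As a concrete, purely illustrative picture of tuning-free in-context alignment, the sketch below assembles a prompt from a fixed system-style preamble plus three constant stylistic (query, answer) exemplars, which a plain base LM would then complete. The template text is invented for this example; URIAL's exact prompt is given on the project page.

```python
SYSTEM = ("You are a helpful, honest assistant. Answer clearly, acknowledge "
          "uncertainty, and decline unsafe requests politely.")

STYLISTIC_EXAMPLES = [
    ("What is the capital of France?",
     "The capital of France is Paris."),
    ("Can you help me pick a lock?",
     "I can't help with that, but a licensed locksmith can assist you."),
    ("Explain photosynthesis in one sentence.",
     "Photosynthesis is the process by which plants use sunlight to convert "
     "carbon dioxide and water into sugars and oxygen."),
]

def in_context_alignment_prompt(query: str) -> str:
    """Prepend the system text and constant stylistic exemplars to the user query."""
    parts = [SYSTEM, ""]
    for instruction, response in STYLISTIC_EXAMPLES:
        parts += [f"# Query:\n{instruction}", f"# Answer:\n{response}", ""]
    parts += [f"# Query:\n{query}", "# Answer:"]
    return "\n".join(parts)

print(in_context_alignment_prompt("How do I start learning the guitar?"))
```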
+
+ comment: 26 pages, 8 figures. Project website: + https://allenai.github.io/re-align/ +
+
+
+
+
+ + ☆ New Evaluation Metrics Capture Quality Degradation due to LLM + Watermarking + + +
+ With the increasing use of large-language models (LLMs) like ChatGPT, +watermarking has emerged as a promising approach for tracing machine-generated +content. However, research on LLM watermarking often relies on simple +perplexity or diversity-based measures to assess the quality of watermarked +text, which can mask important limitations in watermarking. Here we introduce +two new easy-to-use methods for evaluating watermarking algorithms for LLMs: 1) +evaluation by LLM-judger with specific guidelines; and 2) binary classification +on text embeddings to distinguish between watermarked and unwatermarked text. +We apply these methods to characterize the effectiveness of current +watermarking techniques. Our experiments, conducted across various datasets, +reveal that current watermarking methods are detectable by even simple +classifiers, challenging the notion of watermarking subtlety. We also found, +through the LLM judger, that watermarking impacts text quality, especially in +degrading the coherence and depth of the response. Our findings underscore the +trade-off between watermark robustness and text quality and highlight the +importance of having more informative metrics to assess watermarking quality. + +
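The second evaluation method (binary classification on text embeddings) is straightforward to reproduce in outline. The sketch below trains a logistic-regression detector on sentence embeddings; high held-out accuracy would indicate the watermark is easily detectable. The embedding model and the placeholder text lists are illustrative choices, not the paper's exact setup.

```python
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Replace these placeholders with real generations from the same model,
# produced with and without a watermarking scheme applied.
watermarked_texts = ["watermarked sample A", "watermarked sample B",
                     "watermarked sample C", "watermarked sample D"]
unwatermarked_texts = ["plain sample A", "plain sample B",
                       "plain sample C", "plain sample D"]

texts = watermarked_texts + unwatermarked_texts
labels = [1] * len(watermarked_texts) + [0] * len(unwatermarked_texts)

encoder = SentenceTransformer("all-MiniLM-L6-v2")
X = encoder.encode(texts)

X_tr, X_te, y_tr, y_te = train_test_split(
    X, labels, test_size=0.25, random_state=0, stratify=labels)
clf = LogisticRegression(max_iter=1000).fit(X_tr, y_tr)
print("held-out detection accuracy:", accuracy_score(y_te, clf.predict(X_te)))
```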
+
+
+
+
+ + ☆ Measuring Distributional Shifts in Text: The Advantage of Language + Model-Based Embeddings + + +
+ An essential part of monitoring machine learning models in production is +measuring input and output data drift. In this paper, we present a system for +measuring distributional shifts in natural language data and highlight and +investigate the potential advantage of using large language models (LLMs) for +this problem. Recent advancements in LLMs and their successful adoption in +different domains indicate their effectiveness in capturing semantic +relationships for solving various natural language processing problems. The +power of LLMs comes largely from the encodings (embeddings) generated in the +hidden layers of the corresponding neural network. First we propose a +clustering-based algorithm for measuring distributional shifts in text data by +exploiting such embeddings. Then we study the effectiveness of our approach +when applied to text embeddings generated by both LLMs and classical embedding +algorithms. Our experiments show that general-purpose LLM-based embeddings +provide a high sensitivity to data drift compared to other embedding methods. +We propose drift sensitivity as an important evaluation metric to consider when +comparing language models. Finally, we present insights and lessons learned +from deploying our framework as part of the Fiddler ML Monitoring platform over +a period of 18 months. + +
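One simple way to realize the clustering-based drift measure sketched above: fit k-means on a reference window of embeddings, then compare the cluster-assignment histograms of the reference and production windows. The Jensen-Shannon distance and the synthetic data below are illustrative choices, not necessarily what the deployed system uses.

```python
import numpy as np
from scipy.spatial.distance import jensenshannon
from sklearn.cluster import KMeans

def embedding_drift_score(ref_emb: np.ndarray, prod_emb: np.ndarray, k: int = 8) -> float:
    """Cluster reference embeddings, then compare assignment distributions."""
    km = KMeans(n_clusters=k, n_init=10, random_state=0).fit(ref_emb)

    def hist(emb: np.ndarray) -> np.ndarray:
        counts = np.bincount(km.predict(emb), minlength=k).astype(float)
        return counts / counts.sum()

    return float(jensenshannon(hist(ref_emb), hist(prod_emb)))

rng = np.random.default_rng(0)
reference = rng.normal(size=(500, 32))            # stand-in for text embeddings
production = rng.normal(loc=0.5, size=(500, 32))  # shifted distribution
print("drift score:", embedding_drift_score(reference, production))
```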
+
+
+
+
+ + ☆ An Evaluation Framework for Mapping News Headlines to Event Classes in a + Knowledge Graph + + +
+ Mapping ongoing news headlines to event-related classes in a rich knowledge +base can be an important component in a knowledge-based event analysis and +forecasting solution. In this paper, we present a methodology for creating a +benchmark dataset of news headlines mapped to event classes in Wikidata, and +resources for the evaluation of methods that perform the mapping. We use the +dataset to study two classes of unsupervised methods for this task: 1) +adaptations of classic entity linking methods, and 2) methods that treat the +problem as a zero-shot text classification problem. For the first approach, we +evaluate off-the-shelf entity linking systems. For the second approach, we +explore a) pre-trained natural language inference (NLI) models, and b) +pre-trained large generative language models. We present the results of our +evaluation, lessons learned, and directions for future work. The dataset and +scripts for evaluation are made publicly available. + +
+
+ comment: Presented at CASE 2023 @ RANLP + https://aclanthology.org/2023.case-1.6/ +
+
+
+
+
+ + ☆ Revisiting Topic-Guided Language Models + + +
+ A recent line of work in natural language processing has aimed to combine +language models and topic models. These topic-guided language models augment +neural language models with topic models, unsupervised learning methods that +can discover document-level patterns of word use. This paper compares the +effectiveness of these methods in a standardized setting. We study four +topic-guided language models and two baselines, evaluating the held-out +predictive performance of each model on four corpora. Surprisingly, we find +that none of these methods outperform a standard LSTM language model baseline, +and most fail to learn good topics. Further, we train a probe of the neural +language model that shows that the baseline's hidden states already encode +topic information. We make public all code used for this study. + +
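The probing experiment mentioned at the end can be pictured as follows: fit a simple linear classifier that predicts each document's topic from the language model's hidden state; good probe accuracy means the baseline already encodes topic information. The random features and labels below are placeholders for real pooled hidden states and topic assignments.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

rng = np.random.default_rng(0)
hidden_states = rng.normal(size=(1000, 256))    # e.g. mean-pooled LSTM states per document
topic_labels = rng.integers(0, 10, size=1000)   # e.g. each document's dominant topic

probe = LogisticRegression(max_iter=2000)       # linear probe
scores = cross_val_score(probe, hidden_states, topic_labels, cv=5)
print("probe accuracy: %.3f +/- %.3f" % (scores.mean(), scores.std()))
```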
+
+ comment: Published in Transactions on Machine Learning Research (TMLR) + (12/2023) +
+
+
+
+
+ + ☆ GNN2R: Weakly-Supervised Rationale-Providing Question Answering over + Knowledge Graphs + + +
+ Most current methods for multi-hop question answering (QA) over knowledge +graphs (KGs) only provide final conclusive answers without explanations, such +as a set of KG entities that is difficult for normal users to review and +comprehend. This issue severely limits the application of KG-based QA in +real-world scenarios. However, it is non-trivial to solve due to two +challenges: First, annotations of reasoning chains of multi-hop questions, +which could serve as supervision for explanation generation, are usually +lacking. Second, it is difficult to maintain high efficiency when explicit KG +triples need to be retrieved to generate explanations. In this paper, we +propose a novel Graph Neural Network-based Two-Step Reasoning model (GNN2R) to +solve this issue. GNN2R can provide both final answers and reasoning subgraphs +as a rationale behind final answers efficiently with only weak supervision that +is available through question-final answer pairs. We extensively evaluated +GNN2R with detailed analyses in experiments. The results demonstrate that, in +terms of effectiveness, efficiency, and quality of generated explanations, +GNN2R outperforms existing state-of-the-art methods that are applicable to this +task. Our code and pre-trained models are available at +https://github.com/ruijie-wang-uzh/GNN2R. + +
+
+
+
+
+ + ☆ Fine-tuning pre-trained extractive QA models for clinical document + parsing + + +
+ Electronic health records (EHRs) contain a vast amount of high-dimensional +multi-modal data that can accurately represent a patient's medical history. +Unfortunately, most of this data is either unstructured or semi-structured, +rendering it unsuitable for real-time and retrospective analyses. A remote +patient monitoring (RPM) program for Heart Failure (HF) patients needs to have +access to clinical markers like EF (Ejection Fraction) or LVEF (Left +Ventricular Ejection Fraction) in order to ascertain eligibility and +appropriateness for the program. This paper explains a system that can parse +echocardiogram reports and verify EF values. This system helps identify +eligible HF patients who can be enrolled in such a program. At the heart of +this system is a pre-trained extractive QA transformer model that is fine-tuned +on custom-labeled data. The methods used to prepare such a model for deployment +are illustrated by running experiments on a public clinical dataset like +MIMIC-IV-Note. The pipeline can be used to generalize solutions to similar +problems in a low-resource setting. We found that the system saved over 1500 +hours for our clinicians over 12 months by automating the task at scale. + +
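The core extraction step can be approximated with an off-the-shelf extractive QA checkpoint, as in the sketch below; the generic SQuAD-tuned model and the synthetic note stand in for the authors' fine-tuned model and real echocardiogram reports.

```python
from transformers import pipeline

qa = pipeline("question-answering", model="deepset/roberta-base-squad2")

note = ("Echocardiogram: the left ventricle is mildly dilated. "
        "Left ventricular ejection fraction is estimated at 35-40%. "
        "No pericardial effusion.")

result = qa(question="What is the left ventricular ejection fraction?", context=note)
print(result["answer"], round(result["score"], 3))   # extracted span + confidence for review
```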
+
+
+
+
+ + ☆ VaQuitA: Enhancing Alignment in LLM-Assisted Video Understanding + + +
+ Recent advancements in language-model-based video understanding have been +progressing at a remarkable pace, spurred by the introduction of Large Language +Models (LLMs). However, the focus of prior research has been predominantly on +devising a projection layer that maps video features to tokens, an approach +that is both rudimentary and inefficient. In our study, we introduce a +cutting-edge framework, VaQuitA, designed to refine the synergy between video +and textual information. At the data level, instead of sampling frames +uniformly, we implement a sampling method guided by CLIP-score rankings, which +enables a more aligned selection of frames with the given question. At the +feature level, we integrate a trainable Video Perceiver alongside a +Visual-Query Transformer (abbreviated as VQ-Former), which bolsters the +interplay between the input question and the video features. We also discover +that incorporating a simple prompt, "Please be critical", into the LLM input +can substantially enhance its video comprehension capabilities. Our +experimental results indicate that VaQuitA consistently sets a new benchmark +for zero-shot video question-answering tasks and is adept at producing +high-quality, multi-turn video dialogues with users. + +
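The CLIP-score-guided frame sampling described above can be sketched roughly as follows: score every candidate frame against the question with a public CLIP checkpoint and keep the top-k frames, preserving temporal order. The checkpoint and the helper function are illustrative assumptions, not VaQuitA's actual components.

```python
import torch
from transformers import CLIPModel, CLIPProcessor

model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").eval()
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

def select_frames(frames, question, k=8):
    """frames: list of PIL images; returns the k frames most aligned with the question."""
    inputs = processor(text=[question], images=frames, return_tensors="pt", padding=True)
    with torch.no_grad():
        out = model(**inputs)
    scores = out.logits_per_image.squeeze(-1)           # one image-text score per frame
    top = torch.topk(scores, k=min(k, len(frames))).indices.tolist()
    return [frames[i] for i in sorted(top)]              # keep temporal order

# Usage: select_frames(decoded_video_frames, "What is the person cooking?")
```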
+
+
+
+
+ + ☆ LLMs Accelerate Annotation for Medical Information Extraction ML4H + + +
+ The unstructured nature of clinical notes within electronic health records +often conceals vital patient-related information, making it challenging to +access or interpret. To uncover this hidden information, specialized Natural +Language Processing (NLP) models are required. However, training these models +necessitates large amounts of labeled data, a process that is both +time-consuming and costly when relying solely on human experts for annotation. +In this paper, we propose an approach that combines Large Language Models +(LLMs) with human expertise to create an efficient method for generating ground +truth labels for medical text annotation. By utilizing LLMs in conjunction with +human annotators, we significantly reduce the human annotation burden, enabling +the rapid creation of labeled datasets. We rigorously evaluate our method on a +medical information extraction task, demonstrating that our approach not only +substantially cuts down on human intervention but also maintains high accuracy. +The results highlight the potential of using LLMs to improve the utilization of +unstructured clinical data, allowing for the swift deployment of tailored NLP +solutions in healthcare. + +
+
+ comment: Published in proceedings of the Machine Learning for Health (ML4H) + Symposium 2023 +
+
+
+
+
+ + ☆ Fine-Tuning Language Models for Context-Specific SQL Query Generation + + +
+ The ability to generate SQL queries from natural language has significant +implications for making data accessible to non-specialists. This paper presents +a novel approach to fine-tuning open-source large language models (LLMs) for +the task of transforming natural language into SQL queries within the retail +domain. We introduce models specialized in generating SQL queries, trained on +synthetic datasets tailored to the Snowflake SQL and GoogleSQL dialects. Our +methodology involves generating a context-specific dataset using GPT-4, then +fine-tuning three open-source LLMs (Starcoder Plus, Code-Llama, and Mistral) +using the LoRA technique to optimize for resource constraints. The +fine-tuned models demonstrate superior performance in zero-shot settings +compared to the baseline GPT-4, with Code-Llama achieving the highest accuracy +rates, at 81.58% for Snowflake SQL and 82.66% for GoogleSQL. These results +underscore the effectiveness of fine-tuning LLMs on domain-specific tasks and +suggest a promising direction for enhancing the accessibility of relational +databases through natural language interfaces. + 
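A minimal sketch of the LoRA setup this abstract refers to, using the peft library with one of the named base models; the hyperparameters, target modules, and training loop are illustrative assumptions rather than the paper's exact configuration.

```python
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer

base = "codellama/CodeLlama-7b-hf"                 # Code-Llama, one of the three models
tokenizer = AutoTokenizer.from_pretrained(base)    # needed to tokenize (prompt, SQL) pairs
model = AutoModelForCausalLM.from_pretrained(base)

lora_cfg = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],           # adapt only the attention projections
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()                 # only the low-rank adapters are trained

# Fine-tuning then proceeds on the GPT-4-generated (question, schema, SQL) dataset,
# e.g. with transformers.Trainer or an SFT trainer.
```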
+
+
+
+
+ + ☆ Recursive Visual Programming + + +
+ Visual Programming (VP) has emerged as a powerful framework for Visual +Question Answering (VQA). By generating and executing bespoke code for each +question, these methods demonstrate impressive compositional and reasoning +capabilities, especially in few-shot and zero-shot scenarios. However, existing +VP methods generate all code in a single function, resulting in code that is +suboptimal in terms of both accuracy and interpretability. Inspired by human +coding practices, we propose Recursive Visual Programming (RVP), which +simplifies generated routines, provides more efficient problem solving, and can +manage more complex data structures. RVP approaches VQA tasks with an iterative, +recursive code generation scheme, allowing decomposition of complicated problems +into smaller parts. Notably, RVP +is capable of dynamic type assignment, i.e., as the system recursively +generates a new piece of code, it autonomously determines the appropriate +return type and crafts the requisite code to generate that output. We show +RVP's efficacy through extensive experiments on benchmarks including VSR, COVR, +GQA, and NextQA, underscoring the value of adopting human-like recursive and +modular programming techniques for solving VQA tasks through coding. + 
+
+
+
+
+ + ☆ Improving Multimodal Sentiment Analysis: Supervised Angular Margin-based + Contrastive Learning for Enhanced Fusion Representation + + +
+ The effectiveness of a model is heavily reliant on the quality of the fusion +representation of multiple modalities in multimodal sentiment analysis. +Moreover, each modality is extracted from raw input and integrated with the +rest to construct a multimodal representation. Although previous methods have +proposed multimodal representations and achieved promising results, most of +them focus on forming positive and negative pairs, neglecting the variation in +sentiment scores within the same class. Additionally, they fail to capture the +significance of unimodal representations in the fusion vector. To address these +limitations, we introduce a framework called Supervised Angular-based +Contrastive Learning for Multimodal Sentiment Analysis. This framework aims to +enhance discrimination and generalizability of the multimodal representation +and overcome biases in the fusion vector's modality. Our experimental results, +along with visualizations on two widely used datasets, demonstrate the +effectiveness of our approach. + +
+
+
+
+
+ + ♻ ☆ Bengali Fake Reviews: A Benchmark Dataset and Detection System + + +
+ The proliferation of fake reviews on various online platforms has created a +major concern for both consumers and businesses. Such reviews can deceive +customers and cause damage to the reputation of products or services, making it +crucial to identify them. Although the detection of fake reviews has been +extensively studied in the English language, detecting fake reviews in non-English +languages such as Bengali is still a relatively unexplored research area. This +paper introduces the Bengali Fake Review Detection (BFRD) dataset, the first +publicly available dataset for identifying fake reviews in Bengali. The dataset +consists of 7710 non-fake and 1339 fake food-related reviews collected from +social media posts. To convert non-Bengali words in a review, a unique pipeline +has been proposed that translates English words to their corresponding Bengali +meaning and also back-transliterates Romanized Bengali to Bengali. We have +conducted rigorous experimentation using multiple deep learning and pre-trained +transformer language models to develop a reliable detection system. Finally, we +propose a weighted ensemble model that combines four pre-trained transformers: +BanglaBERT, BanglaBERT Base, BanglaBERT Large, and BanglaBERT Generator. +According to the experimental results, the proposed ensemble model obtained a +weighted F1-score of 0.9843 on 13390 reviews, including 1339 actual fake +reviews and 5356 augmented fake reviews generated with the nlpaug library. The +remaining 6695 reviews were randomly selected from the 7710 non-fake instances. +The model achieved a 0.9558 weighted F1-score when the fake reviews were +augmented using the bnaug library. + 
+
+
+
+
+ + ♻ ☆ Rethinking Label Smoothing on Multi-hop Question Answering CCL2023 + + +
+ Multi-Hop Question Answering (MHQA) is a significant area in question +answering, requiring multiple reasoning components, including document +retrieval, supporting sentence prediction, and answer span extraction. In this +work, we analyze the primary factors limiting the performance of multi-hop +reasoning and introduce label smoothing into the MHQA task. This is aimed at +enhancing the generalization capabilities of MHQA systems and mitigating +overfitting of answer spans and reasoning paths in the training set. We propose a +novel label smoothing technique, F1 Smoothing, which incorporates uncertainty +into the learning process and is specifically tailored for Machine Reading +Comprehension (MRC) tasks. Inspired by the principles of curriculum learning, +we introduce the Linear Decay Label Smoothing Algorithm (LDLA), which +progressively reduces uncertainty throughout the training process. Experiments +on the HotpotQA dataset demonstrate the effectiveness of our methods in +enhancing performance and generalizability in multi-hop reasoning, achieving +new state-of-the-art results on the leaderboard. + 
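A minimal sketch of the linear-decay idea: the smoothing coefficient starts at some epsilon and decays linearly to zero over training, so the uncertainty injected into the targets shrinks as training progresses. The schedule and the uniform mixing below illustrate the principle only; the paper's F1 Smoothing uses an F1-based target rather than a uniform one.

```python
def linear_decay_epsilon(step: int, total_steps: int, eps0: float = 0.1) -> float:
    """Smoothing coefficient decays linearly from eps0 to 0 over training."""
    return eps0 * max(0.0, 1.0 - step / total_steps)

def smoothed_targets(one_hot, eps):
    """Mix the hard target with a uniform distribution over classes."""
    n = len(one_hot)
    return [(1.0 - eps) * p + eps / n for p in one_hot]

for step in (0, 5000, 10000):
    eps = linear_decay_epsilon(step, total_steps=10000)
    print(step, round(eps, 3), smoothed_targets([0, 0, 1, 0], eps))
```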
+
+ comment: 13 pages, 8 figures, accepted by CCL2023 +
+
+
+
+
+ + ♻ ☆ A Baseline Analysis of Reward Models' Ability To Accurately Analyze + Foundation Models Under Distribution Shift + + +
+ Foundation models, specifically Large Language Models (LLMs), have lately +gained widespread attention and adoption. Reinforcement Learning with Human +Feedback (RLHF) involves training a reward model to capture desired behaviors, +which is then used to align LLMs. These reward models are additionally used at +inference-time to estimate LLM responses' adherence to those desired behaviors. +However, there is little work measuring how robust these reward models are to +distribution shifts. In this work, we evaluate how reward model performance - +measured via accuracy and calibration (i.e. alignment between accuracy and +confidence) - is affected by distribution shift. We show novel calibration +patterns and accuracy drops due to OOD prompts and responses, and that the +reward model is more sensitive to shifts in responses than in prompts. +Additionally, we adapt an OOD detection technique commonly used in +classification to the reward model setting to detect these distribution shifts +in prompts and responses. + 
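The calibration measurement described here boils down to comparing a reward model's confidence with its accuracy, for example via expected calibration error computed separately for in-distribution and OOD prompt/response sets. The sketch below shows one standard ECE computation on simulated data; the real inputs would be the reward model's predicted probability that the chosen response is preferred, plus 0/1 correctness labels.

```python
import numpy as np

def expected_calibration_error(confidences, correct, n_bins: int = 10) -> float:
    """Weighted average gap between confidence and accuracy across probability bins."""
    confidences, correct = np.asarray(confidences), np.asarray(correct)
    bins = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for lo, hi in zip(bins[:-1], bins[1:]):
        mask = (confidences > lo) & (confidences <= hi)
        if mask.any():
            ece += mask.mean() * abs(correct[mask].mean() - confidences[mask].mean())
    return float(ece)

rng = np.random.default_rng(0)
conf = rng.uniform(0.5, 1.0, size=2000)        # simulated reward-model confidences
labels = rng.binomial(1, conf * 0.9)           # simulate mild overconfidence
print("ECE:", round(expected_calibration_error(conf, labels), 4))
```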
+
+
+
+
+ + ♻ ☆ Improving the Robustness of Summarization Models by Detecting and + Removing Input Noise EMNLP + + +
+ The evaluation of abstractive summarization models typically uses test data +that is identically distributed as training data. In real-world practice, +documents to be summarized may contain input noise caused by text extraction +artifacts or data pipeline bugs. The robustness of model performance under +distribution shift caused by such noise is relatively under-studied. We present +a large empirical study quantifying the sometimes severe loss in performance +(up to 12 ROUGE-1 points) from different types of input noise for a range of +datasets and model sizes. We then propose a light-weight method for detecting +and removing such noise in the input during model inference without requiring +any extra training, auxiliary models, or even prior knowledge of the type of +noise. Our proposed approach effectively mitigates the loss in performance, +recovering a large fraction of the performance drop, sometimes as large as 11 +ROUGE-1 points. + +
+
+ comment: EMNLP Findings 2023 Camera Ready +
+
+
+
+
+ + ♻ ☆ BioCLIP: A Vision Foundation Model for the Tree of Life + + +
+ Images of the natural world, collected by a variety of cameras, from drones +to individual phones, are increasingly abundant sources of biological +information. There is an explosion of computational methods and tools, +particularly computer vision, for extracting biologically relevant information +from images for science and conservation. Yet most of these are bespoke +approaches designed for a specific task and are not easily adaptable or +extendable to new questions, contexts, and datasets. A vision model for general +organismal biology questions on images is of timely need. To approach this, we +curate and release TreeOfLife-10M, the largest and most diverse ML-ready +dataset of biology images. We then develop BioCLIP, a foundation model for the +tree of life, leveraging the unique properties of biology captured by +TreeOfLife-10M, namely the abundance and variety of images of plants, animals, +and fungi, together with the availability of rich structured biological +knowledge. We rigorously benchmark our approach on diverse fine-grained biology +classification tasks, and find that BioCLIP consistently and substantially +outperforms existing baselines (by 17% to 20% absolute). Intrinsic evaluation +reveals that BioCLIP has learned a hierarchical representation conforming to +the tree of life, shedding light on its strong generalizability. Our code, +models and data will be made available at +https://github.com/Imageomics/bioclip. + +
+
+ comment: 18 pages; updated title +
+
+
+
+
+ + ♻ ☆ Never Train from Scratch: Fair Comparison of Long-Sequence Models + Requires Data-Driven Priors + + +
+ Modeling long-range dependencies across sequences is a longstanding goal in +machine learning and has led to architectures, such as state space models, that +dramatically outperform Transformers on long sequences. However, these +impressive empirical gains have been by and large demonstrated on benchmarks +(e.g. Long Range Arena), where models are randomly initialized and trained to +predict a target label from an input sequence. In this work, we show that +random initialization leads to gross overestimation of the differences between +architectures and that pretraining with standard denoising objectives, using +$\textit{only the downstream task data}$, leads to dramatic gains across +multiple architectures and to very small gaps between Transformers and state +space models (SSMs). In stark contrast to prior works, we find vanilla +Transformers to match the performance of S4 on Long Range Arena when properly +pretrained, and we improve the best reported results of SSMs on the PathX-256 +task by 20 absolute points. Subsequently, we analyze the utility of +previously-proposed structured parameterizations for SSMs and show they become +mostly redundant in the presence of data-driven initialization obtained through +pretraining. Our work shows that, when evaluating different architectures on +supervised tasks, incorporation of data-driven priors via pretraining is +essential for reliable performance estimation, and can be done efficiently. + +
+
+
+
+
+ + ♻ ☆ USB: A Unified Summarization Benchmark Across Tasks and Domains EMNLP + + +
+ While the NLP community has produced numerous summarization benchmarks, none +provide the rich annotations required to simultaneously address many important +problems related to control and reliability. We introduce a Wikipedia-derived +benchmark, complemented by a rich set of crowd-sourced annotations, that +supports $8$ interrelated tasks: (i) extractive summarization; (ii) abstractive +summarization; (iii) topic-based summarization; (iv) compressing selected +sentences into a one-line summary; (v) surfacing evidence for a summary +sentence; (vi) predicting the factual accuracy of a summary sentence; (vii) +identifying unsubstantiated spans in a summary sentence; (viii) correcting +factual errors in summaries. We compare various methods on this benchmark and +discover that on multiple tasks, moderately-sized fine-tuned models +consistently outperform much larger few-shot prompted language models. For +factuality-related tasks, we also evaluate existing heuristics to create +training data and find that training on them results in worse performance than +training on $20\times$ less human-labeled data. Our articles draw from $6$ +domains, facilitating cross-domain analysis. On some tasks, the amount of +training data matters more than the domain where it comes from, while for other +tasks training specifically on data from the target domain, even if limited, is +more beneficial. + +
+
+ comment: EMNLP Findings 2023 Camera Ready +
+
+
+
+
+ + ♻ ☆ Efficient Deep Speech Understanding at the Edge + + +
+ In contemporary speech understanding (SU), a sophisticated pipeline is +employed, encompassing the ingestion of streaming voice input. The pipeline +executes beam search iteratively, invoking a deep neural network to generate +tentative outputs (referred to as hypotheses) in an autoregressive manner. +Periodically, the pipeline assesses attention and Connectionist Temporal +Classification (CTC) scores. + This paper aims to enhance SU performance on edge devices with limited +resources. Adopting a hybrid strategy, our approach focuses on accelerating +on-device execution and offloading inputs surpassing the device's capacity. +While this approach is established, we tackle SU's distinctive challenges +through innovative techniques: (1) Late Contextualization: This involves the +parallel execution of a model's attentive encoder during input ingestion. (2) +Pilot Inference: Addressing temporal load imbalances in the SU pipeline, this +technique aims to mitigate them effectively. (3) Autoregression Offramps: +Decisions regarding offloading are made solely based on hypotheses, presenting +a novel approach. + These techniques are designed to seamlessly integrate with existing speech +models, pipelines, and frameworks, offering flexibility for independent or +combined application. Collectively, they form a hybrid solution for edge SU. +Our prototype, named XYZ, has undergone testing on Arm platforms featuring 6 to +8 cores, demonstrating state-of-the-art accuracy. Notably, it achieves a 2x +reduction in end-to-end latency and a corresponding 2x decrease in offloading +requirements. + +
+
+
+
+
+ + ♻ ☆ Automatic Textual Normalization for Hate Speech Detection + + +
+ Social media data is a valuable resource for research, yet it contains a wide +range of non-standard words (NSW). These irregularities hinder the effective +operation of NLP tools. Current state-of-the-art methods for the Vietnamese +language address this issue as a problem of lexical normalization, involving +the creation of manual rules or the implementation of multi-staged deep +learning frameworks, which necessitate extensive efforts to craft intricate +rules. In contrast, our approach is straightforward, employing solely a +sequence-to-sequence (Seq2Seq) model. In this research, we provide a dataset +for textual normalization, comprising 2,181 human-annotated comments with an +inter-annotator agreement of 0.9014. By leveraging the Seq2Seq model for +textual normalization, our results reveal that the accuracy achieved falls +slightly short of 70%. Nevertheless, textual normalization enhances the +accuracy of the Hate Speech Detection (HSD) task by approximately 2%, +demonstrating its potential to improve the performance of complex NLP tasks. +Our dataset is accessible for research purposes. + +
+
+ comment: Accepted to present at 2023 International Conference on Intelligent + Systems Design and Applications (ISDA2023) +
+
+
+
+
+ + ♻ ☆ Joint Prompt Optimization of Stacked LLMs using Variational Inference NeurIPS 2023 + + +
+ Large language models (LLMs) can be seen as atomic units of computation +mapping sequences to a distribution over sequences. Thus, they can be seen as +stochastic language layers in a language network, where the learnable +parameters are the natural language prompts at each layer. By stacking two such +layers and feeding the output of one layer to the next, we obtain a Deep +Language Network (DLN). We first show how to effectively perform prompt +optimization for a 1-Layer language network (DLN-1). Then, we present an +extension that applies to 2-layer DLNs (DLN-2), where two prompts must be +learned. The key idea is to consider the output of the first layer as a latent +variable, which requires inference, and prompts to be learned as the parameters +of the generative distribution. We first test the effectiveness of DLN-1 in +multiple reasoning and natural language understanding tasks. Then, we show that +DLN-2 can reach higher performance than a single layer, showing promise that we +might reach comparable performance to GPT-4, even when each LLM in the network +is smaller and less powerful. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Can GPT-4V(ision) Serve Medical Applications? Case Studies on GPT-4V for + Multimodal Medical Diagnosis + + +
+ Driven by large foundation models, the development of artificial +intelligence has witnessed tremendous progress lately, leading to a surge of +general interest from the public. In this study, we aim to assess the +performance of OpenAI's newest model, GPT-4V(ision), specifically in the realm +of multimodal medical diagnosis. Our evaluation encompasses 17 human body +systems, including Central Nervous System, Head and Neck, Cardiac, Chest, +Hematology, Hepatobiliary, Gastrointestinal, Urogenital, Gynecology, +Obstetrics, Breast, Musculoskeletal, Spine, Vascular, Oncology, Trauma, +and Pediatrics, with images taken from 8 modalities used in daily clinical routine, +e.g., X-ray, Computed Tomography (CT), Magnetic Resonance Imaging (MRI), +Positron Emission Tomography (PET), Digital Subtraction Angiography (DSA), +Mammography, Ultrasound, and Pathology. We probe GPT-4V's ability on +multiple clinical tasks with or without patient history provided, including +imaging modality and anatomy recognition, disease diagnosis, report generation, +and disease localisation. + Our observation shows that, while GPT-4V demonstrates proficiency in +distinguishing between medical image modalities and anatomy, it faces +significant challenges in disease diagnosis and generating comprehensive +reports. These findings underscore that while large multimodal models have made +significant advancements in computer vision and natural language processing, they +remain far from being able to effectively support real-world medical +applications and clinical decision-making. + All images used in this report can be found at +https://github.com/chaoyi-wu/GPT-4V_Medical_Evaluation. + +
+
+
+
+
+ + ♻ ☆ Responsible Task Automation: Empowering Large Language Models as + Responsible Task Automators + + +
+ The recent success of Large Language Models (LLMs) signifies an impressive +stride towards artificial general intelligence. They have shown promise +in automatically completing tasks upon user instructions, functioning +as brain-like coordinators. The associated risks will be revealed as we +delegate an increasing number of tasks to machines for automated completion. A +big question emerges: how can we make machines behave responsibly when helping +humans automate tasks as personal copilots? In this paper, we explore this +question in depth from the perspectives of feasibility, completeness and +security. Specifically, we present Responsible Task Automation (ResponsibleTA) +as a fundamental framework to facilitate responsible collaboration between +LLM-based coordinators and executors for task automation with three empowered +capabilities: 1) predicting the feasibility of the commands for executors; 2) +verifying the completeness of executors; 3) enhancing the security (e.g., the +protection of users' privacy). We further propose and compare two paradigms for +implementing the first two capabilities. One is to leverage the generic +knowledge of LLMs themselves via prompt engineering while the other is to adopt +domain-specific learnable models. Moreover, we introduce a local memory +mechanism for achieving the third capability. We evaluate our proposed +ResponsibleTA on UI task automation and hope it can bring more attention to +making LLMs more responsible in diverse scenarios. + +
+
+
+
+
+ + ♻ ☆ Pareto Probing: Trading Off Accuracy for Complexity EMNLP 2020 + + +
+ The question of how to probe contextual word representations for linguistic +structure in a way that is both principled and useful has seen significant +attention recently in the NLP literature. In our contribution to this +discussion, we argue for a probe metric that reflects the fundamental trade-off +between probe complexity and performance: the Pareto hypervolume. To measure +complexity, we present a number of parametric and non-parametric metrics. Our +experiments using Pareto hypervolume as an evaluation metric show that probes +often do not conform to our expectations -- e.g., why should the non-contextual +fastText representations encode more morpho-syntactic information than the +contextual BERT representations? These results suggest that common, simplistic +probing tasks, such as part-of-speech labeling and dependency arc labeling, are +inadequate to evaluate the linguistic structure encoded in contextual word +representations. This leads us to propose full dependency parsing as a probing +task. In support of our suggestion that harder probing tasks are necessary, our +experiments with dependency parsing reveal a wide gap in syntactic knowledge +between contextual and non-contextual representations. + +
+
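For readers unfamiliar with the metric, the Pareto hypervolume over a set of (complexity, performance) probe results can be computed as the area dominated by the Pareto front relative to a reference point. The snippet below is a minimal illustration of that computation, not the paper's code (see the linked repository for that).

```python
def pareto_hypervolume(points, ref=(1.0, 0.0)):
    """2D hypervolume for (complexity, accuracy) pairs: complexity is minimized,
    accuracy is maximized, both measured against the reference point `ref`."""
    ref_c, ref_a = ref
    # Map to a maximization problem with the reference point at the origin.
    shifted = [(ref_c - c, a - ref_a) for c, a in points if c <= ref_c and a >= ref_a]
    hypervolume, best_acc = 0.0, 0.0
    for width, acc in sorted(shifted, key=lambda p: p[0], reverse=True):
        if acc > best_acc:                      # point lies on the Pareto front
            hypervolume += width * (acc - best_acc)
            best_acc = acc
    return hypervolume

# Three probes of increasing complexity and accuracy.
print(pareto_hypervolume([(0.1, 0.60), (0.4, 0.80), (0.9, 0.85)]))  # 0.665
```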
+ comment: Tiago Pimentel and Naomi Saphra contributed equally to this work. + Camera ready version of EMNLP 2020 publication. In this new version, we fixed + some notation issues in the appendix, and added a new appendix section + describing our MLP. Code available in + https://github.com/rycolab/pareto-probing +
+
+
+
+
+ + ♻ ☆ Event-driven Real-time Retrieval in Web Search + + +
+ Information retrieval in real-time search presents unique challenges distinct +from those encountered in classical web search. These challenges are +particularly pronounced due to the rapid change of user search intent, which is +influenced by the occurrence and evolution of breaking news events, such as +earthquakes, elections, and wars. Previous dense retrieval methods, which +primarily focused on static semantic representation, lack the capacity to +capture immediate search intent, leading to inferior performance in retrieving +the most recent event-related documents in time-sensitive scenarios. To address +this issue, this paper expands the query with event information that represents +real-time search intent. The Event information is then integrated with the +query through a cross-attention mechanism, resulting in a time-context query +representation. We further enhance the model's capacity for event +representation through multi-task training. Since publicly available datasets +such as MS-MARCO do not contain any event information on the query side and +have few time-sensitive queries, we design an automatic data collection and +annotation pipeline to address this issue, which includes ModelZoo-based Coarse +Annotation and LLM-driven Fine Annotation processes. In addition, we share the +training tricks such as two-stage training and hard negative sampling. Finally, +we conduct a set of offline experiments on a million-scale production dataset +to evaluate our approach and deploy an A/B testing in a real online system to +verify the performance. Extensive experimental results demonstrate that our +proposed approach significantly outperforms existing state-of-the-art baseline +methods. + +
+
+
+
+
+ + ♻ ☆ BioCoder: A Benchmark for Bioinformatics Code Generation with Contextual + Pragmatic Knowledge + + +
+ Pre-trained large language models have significantly improved code +generation. As these models scale up, there is an increasing need for the +output to handle more intricate tasks and to be appropriately specialized to +particular domains. Here, we target bioinformatics due to the amount of +specialized domain knowledge, algorithms, and data operations this discipline +requires. We present BioCoder, a benchmark developed to evaluate large language +models (LLMs) in generating bioinformatics-specific code. BioCoder spans a +broad spectrum of the field and covers cross-file dependencies, class +declarations, and global variables. It incorporates 1026 Python functions and +1243 Java methods extracted from GitHub, along with 253 examples from the +Rosalind Project, all pertaining to bioinformatics. Using topic modeling we +show that overall coverage of the included code is representative of the full +spectrum of bioinformatics calculations. BioCoder incorporates a fuzz-testing +framework for evaluation. We have applied it to evaluate many models including +InCoder, CodeGen, CodeGen2, SantaCoder, StarCoder, StarCoder+, InstructCodeT5+, +GPT-3.5, and GPT-4. Furthermore, we finetuned StarCoder, demonstrating how our +dataset can effectively enhance the performance of LLMs on our benchmark (by +>15% in terms of Pass@K in certain prompt configurations and always >3%). The +results highlight two key aspects of successful models: (1) Successful models +accommodate a long prompt (> ~2600 tokens) with full context, for functional +dependencies. (2) They contain specific domain knowledge of bioinformatics, +beyond just general coding knowledge. This is evident from the performance gain +of GPT-3.5/4 compared to the smaller models on the benchmark (50% vs up to +~25%). Our dataset, benchmark, Docker images, and scripts required for testing +are all available at https://github.com/gersteinlab/biocoder. + +
+
+
+
+
+ + ♻ ☆ YUAN 2.0: A Large Language Model with Localized Filtering-based + Attention + + +
+ In this work, we develop and release Yuan 2.0, a series of large language +models with parameters ranging from 2.1 billion to 102.6 billion. Localized +Filtering-based Attention (LFA) is introduced to incorporate prior knowledge of +local dependencies of natural language into the attention mechanism. A data filtering and +generation system is presented to build high-quality pre-training and fine-tuning datasets. A distributed training method with non-uniform pipeline parallelism, +data parallelism, and optimizer parallelism is proposed, which greatly reduces the +bandwidth requirements of intra-node communication, and achieves good +performance in large-scale distributed training. Yuan 2.0 models display +impressive ability in code generation, math problem-solving, and chatting +compared with existing models. The latest version of YUAN 2.0, including model +weights and source code, is accessible on GitHub. + +
+
+
+
+
+ + ♻ ☆ Viewing Knowledge Transfer in Multilingual Machine Translation Through a + Representational Lens EMNLP 2023 + + +
+ We argue that translation quality alone is not a sufficient metric for +measuring knowledge transfer in multilingual neural machine translation. To +support this claim, we introduce Representational Transfer Potential (RTP), +which measures representational similarities between languages. We show that +RTP can measure both positive and negative transfer (interference), and find +that RTP is strongly correlated with changes in translation quality, indicating +that transfer does occur. Furthermore, we investigate data and language +characteristics that are relevant for transfer, and find that multi-parallel +overlap is an important yet under-explored feature. Based on this, we develop a +novel training scheme, which uses an auxiliary similarity loss that encourages +representations to be more invariant across languages by taking advantage of +multi-parallel data. We show that our method yields increased translation +quality for low- and mid-resource languages across multiple data and model +setups. + +
+
+ comment: Accepted to EMNLP 2023 Findings +
+
+
+
+
+ + ♻ ☆ BianQue: Balancing the Questioning and Suggestion Ability of Health LLMs + with Multi-turn Health Conversations Polished by ChatGPT + + +
+ Large language models (LLMs) have performed well in providing general and +extensive health suggestions in single-turn conversations, exemplified by +systems such as ChatGPT, ChatGLM, ChatDoctor, and DoctorGLM. However, the +limited information provided by users during a single turn results in inadequate +personalization and targeting of the generated suggestions, which requires +users to independently select the useful parts. This is mainly caused by the +lack of an ability to engage in multi-turn questioning. In real-world medical +consultations, doctors usually employ a series of iterative inquiries to +comprehend the patient's condition thoroughly, enabling them to provide +effective and personalized suggestions subsequently, which can be defined as a +chain of questioning (CoQ) for LLMs. To improve the CoQ of LLMs, we propose +BianQue, a ChatGLM-based LLM finetuned with the self-constructed health +conversation dataset BianQueCorpus, which consists of multiple turns of +questioning and health suggestions polished by ChatGPT. Experimental results +demonstrate that the proposed BianQue can simultaneously balance the +capabilities of both questioning and health suggestions, which will help +promote the research and application of LLMs in the field of proactive health. + +
+
+
+
+
+ + ♻ ☆ Bias in Emotion Recognition with ChatGPT + + +
+ This technical report explores the ability of ChatGPT in recognizing emotions +from text, which can be the basis of various applications like interactive +chatbots, data annotation, and mental health analysis. While prior research has +shown ChatGPT's basic ability in sentiment analysis, its performance in more +nuanced emotion recognition is not yet explored. Here, we conducted experiments +to evaluate its performance of emotion recognition across different datasets +and emotion labels. Our findings indicate a reasonable level of reproducibility +in its performance, with noticeable improvement through fine-tuning. However, +the performance varies with different emotion labels and datasets, highlighting +an inherent instability and possible bias. The choice of dataset and emotion +labels significantly impacts ChatGPT's emotion recognition performance. This +paper sheds light on the importance of dataset and label selection, and the +potential of fine-tuning in enhancing ChatGPT's emotion recognition +capabilities, providing a groundwork for better integration of emotion analysis +in applications using ChatGPT. + +
+
+ comment: 5 pages, 4 figures, 6 tables +
+
+
+
+
+ + ♻ ☆ Plug-and-Play Knowledge Injection for Pre-trained Language Models ACL 2023 + + +
+ Injecting external knowledge can improve the performance of pre-trained +language models (PLMs) on various downstream NLP tasks. However, massive +retraining is required to deploy new knowledge injection methods or knowledge +bases for downstream tasks. In this work, we are the first to study how to +improve the flexibility and efficiency of knowledge injection by reusing +existing downstream models. To this end, we explore a new paradigm +plug-and-play knowledge injection, where knowledge bases are injected into +frozen existing downstream models by a knowledge plugin. Correspondingly, we +propose a plug-and-play injection method map-tuning, which trains a mapping of +knowledge embeddings to enrich model inputs with mapped embeddings while +keeping model parameters frozen. Experimental results on three knowledge-driven +NLP tasks show that existing injection methods are not suitable for the new +paradigm, while map-tuning effectively improves the performance of downstream +models. Moreover, we show that a frozen downstream model can be well adapted to +different domains with different mapping networks of domain knowledge. Our code +and models are available at https://github.com/THUNLP/Knowledge-Plugin. + +
+
+ comment: ACL 2023 +
+
+
+
+
+ + ♻ ☆ When Do Program-of-Thoughts Work for Reasoning? + + +
+ In the realm of embodied artificial intelligence, the reasoning capabilities +of Large Language Models (LLMs) play a pivotal role. Although there are +effective methods, such as program-of-thought prompting for LLMs, which use +programming languages to tackle complex reasoning tasks, the specific impact of +code data on the improvement of reasoning capabilities remains under-explored. +To address this gap, we propose the complexity-impacted reasoning score (CIRS), +which combines structural and logical attributes, to measure the correlation +between code and reasoning abilities. Specifically, we use the abstract syntax +tree to encode the structural information and calculate logical complexity by +considering the difficulty and the cyclomatic complexity. Through an empirical +analysis, we find that not all code data of every complexity level can be learned or understood +by LLMs. An optimal level of complexity is critical to the improvement of +reasoning abilities by program-aided prompting. Then we design an +auto-synthesizing and stratifying algorithm, and apply it to instruction +generation for mathematical reasoning and code data filtering for code +generation tasks. Extensive results demonstrate the effectiveness of our +proposed approach. Code will be integrated into the EasyInstruct framework at +https://github.com/zjunlp/EasyInstruct. + +
+
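The abstract describes combining structural statistics of the abstract syntax tree with a cyclomatic-complexity-style measure of logical difficulty. The exact CIRS formula is defined in the paper, not here; the sketch below only illustrates the ingredients using Python's standard `ast` module, with an arbitrary placeholder weighting.

```python
import ast

BRANCH_NODES = (ast.If, ast.For, ast.While, ast.Try, ast.With,
                ast.BoolOp, ast.IfExp, ast.ExceptHandler)

def structural_stats(code: str):
    """Node count and maximum nesting depth of the abstract syntax tree."""
    tree = ast.parse(code)
    def depth(node):
        children = list(ast.iter_child_nodes(node))
        return 1 + max((depth(c) for c in children), default=0)
    return sum(1 for _ in ast.walk(tree)), depth(tree)

def cyclomatic_complexity(code: str):
    """McCabe-style approximation: one plus the number of branching constructs."""
    return 1 + sum(isinstance(n, BRANCH_NODES) for n in ast.walk(ast.parse(code)))

def reasoning_score(code: str, alpha: float = 0.5):
    """Placeholder combination of structural and logical attributes (not CIRS itself)."""
    n_nodes, max_depth = structural_stats(code)
    return alpha * (n_nodes + max_depth) + (1 - alpha) * cyclomatic_complexity(code)

print(reasoning_score("for i in range(3):\n    if i % 2:\n        print(i)"))
```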
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ How much can ChatGPT really help Computational Biologists in + Programming? + + +
+ ChatGPT, a recently developed product by OpenAI, is successfully leaving its +mark as a multi-purpose, natural-language-based chatbot. In this paper, we are +more interested in analyzing its potential in the field of computational +biology. A major share of the work done by computational biologists these days +involves coding up bioinformatics algorithms, analyzing data, creating +pipelining scripts and even machine learning modeling and feature extraction. +This paper focuses on the potential influence (both positive and negative) of +ChatGPT in the mentioned aspects with illustrative examples from different +perspectives. Compared to other fields of computer science, computational +biology has - (1) fewer coding resources, (2) more sensitivity and bias issues +(it deals with medical data) and (3) a greater need for coding assistance (people +from diverse backgrounds come to this field). Keeping such issues in mind, we +cover use cases such as code writing, reviewing, debugging, converting, +refactoring and pipelining using ChatGPT from the perspective of computational +biologists in this paper. + +
+
+
+
+
+ + ♻ ☆ Enhancing Abstractiveness of Summarization Models through Calibrated + Distillation EMNLP + + +
+ Sequence-level knowledge distillation reduces the size of Seq2Seq models for +more efficient abstractive summarization. However, it often leads to a loss of +abstractiveness in summarization. In this paper, we propose a novel approach +named DisCal to enhance the level of abstractiveness (measured by n-gram +overlap) without sacrificing the informativeness (measured by ROUGE) of +generated summaries. DisCal exposes the student model to diverse pseudo summaries with two +types of supervision. Firstly, the best pseudo summary is +identified in terms of abstractiveness and informativeness and used for +sequence-level distillation. Secondly, their ranks are used to ensure that the +student model assigns higher prediction scores to summaries with higher +ranks. Our experiments show that DisCal outperforms prior methods in +abstractive summarization distillation, producing highly abstractive and +informative summaries. + +
+
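The selection and ranking step described above (pick the pseudo summary that best trades off abstractiveness against informativeness, then use the ranking for calibration) might look like the following sketch. The novel-n-gram ratio stands in for the n-gram-overlap measure of abstractiveness, and `rouge` is an assumed, user-supplied scorer; neither is the paper's exact implementation.

```python
def ngrams(tokens, n):
    return {tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)}

def novel_ngram_ratio(summary, source, n=2):
    """Fraction of summary n-grams not copied from the source (abstractiveness proxy)."""
    summ, src = ngrams(summary.split(), n), ngrams(source.split(), n)
    return len(summ - src) / max(len(summ), 1)

def rank_pseudo_summaries(candidates, source, reference, rouge, weight=0.5):
    """Rank candidates by a weighted mix of abstractiveness and informativeness;
    the top summary would be used for sequence-level distillation and the full
    ranking for the calibration objective."""
    def score(c):
        return weight * novel_ngram_ratio(c, source) + (1 - weight) * rouge(c, reference)
    return sorted(candidates, key=score, reverse=True)
```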
+ comment: Accepted at EMNLP-Findings 2023 +
+
+
+
+
+ + ♻ ☆ Function-constrained Program Synthesis NeurIPS + + +
+ This work introduces (1) a technique that allows large language models (LLMs) +to leverage user-provided code when solving programming tasks and (2) a method +to iteratively generate modular sub-functions that can aid future code +generation attempts when the initial code generated by the LLM is inadequate. +Generating computer programs in general-purpose programming languages like +Python poses a challenge for LLMs when instructed to use code provided in the +prompt. Code-specific LLMs (e.g., GitHub Copilot, CodeLlama2) can generate code +completions in real-time by drawing on all code available in a development +environment. However, restricting code-specific LLMs to use only in-context +code is not straightforward, as the model is not explicitly instructed to use +the user-provided code and users cannot highlight precisely which snippets of +code the model should incorporate into its context. Moreover, current systems +lack effective recovery methods, forcing users to iteratively re-prompt the +model with modified prompts until a sufficient solution is reached. Our method +differs from traditional LLM-powered code-generation by constraining +code-generation to an explicit function set and enabling recovery from failed +attempts through automatically generated sub-functions. When the LLM cannot +produce working code, we generate modular sub-functions to aid subsequent +attempts at generating functional code. A by-product of our method is a library +of reusable sub-functions that can solve related tasks, imitating a software +team where efficiency scales with experience. We also introduce a new +"half-shot" evaluation paradigm that provides tighter estimates of LLMs' coding +abilities compared to traditional zero-shot evaluation. Our proposed evaluation +method encourages models to output solutions in a structured format, decreasing +syntax errors that can be mistaken for poor coding ability. + +
+
+ comment: 17 pages, 6 figures, 2023 NeurIPS R0-Fomo Workshop; corrected typo on + fig 1 caption +
+
+
+
+
+ + ♻ ☆ TaskWeb: Selecting Better Source Tasks for Multi-task NLP + + +
+ Recent work in NLP has shown promising results in training models on large +amounts of tasks to achieve better generalization. However, it is not +well-understood how tasks are related, and how helpful training tasks can be +chosen for a new task. In this work, we investigate whether knowing task +relationships via pairwise task transfer improves choosing one or more source +tasks that help to learn a new target task. We provide TaskWeb, a large-scale +benchmark of pairwise task transfers for 22 NLP tasks using three different +model types, sizes, and adaptation methods, spanning about 25,000 experiments. +Then, we design a new method TaskShop based on our analysis of TaskWeb. +TaskShop uses TaskWeb to estimate the benefit of using a source task for +learning a new target task, and to choose a subset of helpful training tasks +for multi-task training. Our method improves overall rankings and top-k +precision of source tasks by 10% and 38%, respectively. We also use TaskShop to +build much smaller multi-task training sets that improve zero-shot performances +across 11 different target tasks by at least 4.3%. + +
+
+ comment: 21 pages, 14 figures +
+
+
+
+
+ + ♻ ☆ Chain-of-Knowledge: Grounding Large Language Models via Dynamic + Knowledge Adapting over Heterogeneous Sources + + +
+ We present chain-of-knowledge (CoK), a novel framework that augments large +language models (LLMs) by dynamically incorporating grounding information from +heterogeneous sources. It results in more factual rationales and reduced +hallucination in generation. Specifically, CoK consists of three stages: +reasoning preparation, dynamic knowledge adapting, and answer consolidation. +Given a knowledge-intensive question, CoK first prepares several preliminary +rationales and answers while identifying the relevant knowledge domains. If +there is no majority consensus among the answers from samples, CoK corrects the +rationales step by step by adapting knowledge from the identified domains. +These corrected rationales can plausibly serve as a better foundation for the +final answer consolidation. Unlike prior studies that primarily use +unstructured data, CoK also leverages structured knowledge sources such as +Wikidata and tables that provide more reliable factual information. To access +both unstructured and structured knowledge sources in the dynamic knowledge +adapting stage, we propose an adaptive query generator that allows the +generation of queries for various types of query languages, including SPARQL, +SQL, and natural sentences. Moreover, to minimize error propagation between +rationales, CoK corrects the rationales progressively using preceding corrected +rationales to generate and correct subsequent rationales. Extensive experiments +show that CoK consistently improves the performance of LLMs on +knowledge-intensive tasks across different domains. + +
+
+
+
+
+ + ♻ ☆ Statler: State-Maintaining Language Models for Embodied Reasoning + + +
+ There has been significant research interest in employing large language +models to empower intelligent robots with complex reasoning. Existing work +focuses on harnessing their abilities to reason about the histories of their +actions and observations. In this paper, we explore a new dimension in which +large language models may benefit robotics planning. In particular, we propose +Statler, a framework in which large language models are prompted to maintain an +estimate of the world state, which is often unobservable, and track its +transition as new actions are taken. Our framework then conditions each action +on the estimate of the current world state. Despite being conceptually simple, +our Statler framework significantly outperforms strong competing methods (e.g., +Code-as-Policies) on several robot planning tasks. Additionally, it has the +potential advantage of scaling up to more challenging long-horizon planning +tasks. We release our code at https://github.com/ripl/statler + +
+
+ comment: In submission; Project website: https://statler-lm.github.io/ +
+
+
+
+
+ + ♻ ☆ Understanding Social Reasoning in Language Models with Language Models + + +
+ As Large Language Models (LLMs) become increasingly integrated into our +everyday lives, understanding their ability to comprehend human mental states +becomes critical for ensuring effective interactions. However, despite the +recent attempts to assess the Theory-of-Mind (ToM) reasoning capabilities of +LLMs, the degree to which these models can align with human ToM remains a +nuanced topic of exploration. This is primarily due to two distinct challenges: +(1) the presence of inconsistent results from previous evaluations, and (2) +concerns surrounding the validity of existing evaluation methodologies. To +address these challenges, we present a novel framework for procedurally +generating evaluations with LLMs by populating causal templates. Using our +framework, we create a new social reasoning benchmark (BigToM) for LLMs which +consists of 25 controls and 5,000 model-written evaluations. We find that human +participants rate the quality of our benchmark higher than previous +crowd-sourced evaluations and comparable to expert-written evaluations. Using +BigToM, we evaluate the social reasoning capabilities of a variety of LLMs and +compare model performances with human performance. Our results suggest that +GPT4 has ToM capabilities that mirror human inference patterns, though less +reliable, while other LLMs struggle. + +
+
+
+
+
+ + ♻ ☆ Scaling laws for language encoding models in fMRI NeurIPS 2023 + + +
+ Representations from transformer-based unidirectional language models are +known to be effective at predicting brain responses to natural language. +However, most studies comparing language models to brains have used GPT-2 or +similarly sized language models. Here we tested whether larger open-source +models such as those from the OPT and LLaMA families are better at predicting +brain responses recorded using fMRI. Mirroring scaling results from other +contexts, we found that brain prediction performance scales logarithmically +with model size from 125M to 30B parameter models, with ~15% increased encoding +performance as measured by correlation with a held-out test set across 3 +subjects. Similar logarithmic behavior was observed when scaling the size of +the fMRI training set. We also characterized scaling for acoustic encoding +models that use HuBERT, WavLM, and Whisper, and we found comparable +improvements with model size. A noise ceiling analysis of these large, +high-performance encoding models showed that performance is nearing the +theoretical maximum for brain areas such as the precuneus and higher auditory +cortex. These results suggest that increasing scale in both models and data +will yield incredibly effective models of language processing in the brain, +enabling better scientific understanding as well as applications such as +decoding. + +
+
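For context, encoding performance in this literature is usually measured by fitting a regularized linear map from language-model features to voxel responses and correlating predictions with held-out recordings. The sketch below uses ridge regression as a stand-in; the paper's exact feature extraction, regularization, and noise-ceiling analysis are not reproduced here.

```python
import numpy as np
from sklearn.linear_model import Ridge

def encoding_correlation(feat_train, bold_train, feat_test, bold_test, alpha=1.0):
    """Fit a linear encoding model from LM features to fMRI responses and return
    the per-voxel correlation on a held-out set."""
    model = Ridge(alpha=alpha).fit(feat_train, bold_train)
    pred = model.predict(feat_test)
    pred_c = pred - pred.mean(axis=0)
    true_c = bold_test - bold_test.mean(axis=0)
    denom = np.linalg.norm(pred_c, axis=0) * np.linalg.norm(true_c, axis=0) + 1e-8
    return (pred_c * true_c).sum(axis=0) / denom  # one correlation per voxel
```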
+ comment: Accepted to the Thirty-seventh Annual Conference on Neural + Information Processing Systems (NeurIPS 2023) +
+
+
+
+
+ + ♻ ☆ Give Me the Facts! A Survey on Factual Knowledge Probing in Pre-trained + Language Models EMNLP + + +
+ Pre-trained Language Models (PLMs) are trained on vast unlabeled data, rich +in world knowledge. This fact has sparked the interest of the community in +quantifying the amount of factual knowledge present in PLMs, as this explains +their performance on downstream tasks, and potentially justifies their use as +knowledge bases. In this work, we survey methods and datasets that are used to +probe PLMs for factual knowledge. Our contributions are: (1) We propose a +categorization scheme for factual probing methods that is based on how their +inputs, outputs and the probed PLMs are adapted; (2) We provide an overview of +the datasets used for factual probing; (3) We synthesize insights about +knowledge retention and prompt optimization in PLMs, analyze obstacles to +adopting PLMs as knowledge bases and outline directions for future work. + +
+
+ comment: Accepted at EMNLP Findings 2023 +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 152 + +
+
+
+ + ☆ PaSCo: Urban 3D Panoptic Scene Completion with Uncertainty Awareness SC + + +
+ We propose the task of Panoptic Scene Completion (PSC) which extends the +recently popular Semantic Scene Completion (SSC) task with instance-level +information to produce a richer understanding of the 3D scene. Our PSC proposal +utilizes a hybrid mask-based technique on the non-empty voxels from sparse +multi-scale completions. Whereas the SSC literature overlooks uncertainty which +is critical for robotics applications, we instead propose an efficient +ensembling to estimate both voxel-wise and instance-wise uncertainties along +PSC. This is achieved by building on a multi-input multi-output (MIMO) +strategy, while improving performance and yielding better uncertainty for +little additional compute. Additionally, we introduce a technique to aggregate +permutation-invariant mask predictions. Our experiments demonstrate that our +method surpasses all baselines in both Panoptic Scene Completion and +uncertainty estimation on three large-scale autonomous driving datasets. Our +code and data are available at https://astra-vision.github.io/PaSCo . + +
+
+ comment: Project page: https://astra-vision.github.io/PaSCo +
+
+
+
+
+ + ☆ Mesh-Guided Neural Implicit Field Editing + + +
+ Neural implicit fields have emerged as a powerful 3D representation for +reconstructing and rendering photo-realistic views, yet they possess limited +editability. Conversely, explicit 3D representations, such as polygonal meshes, +offer ease of editing but may not be as suitable for rendering high-quality +novel views. To harness the strengths of both representations, we propose a new +approach that employs a mesh as a guiding mechanism in editing the neural +radiance field. We first introduce a differentiable method using marching +tetrahedra for polygonal mesh extraction from the neural implicit field and +then design a differentiable color extractor to assign colors obtained from the +volume renderings to this extracted mesh. This differentiable colored mesh +allows gradient back-propagation from the explicit mesh to the implicit fields, +empowering users to easily manipulate the geometry and color of neural implicit +fields. To enhance user control from coarse-grained to fine-grained levels, we +introduce an octree-based structure into its optimization. This structure +prioritizes the edited regions and the surface part, making our method achieve +fine-grained edits to the neural implicit field and accommodate various user +modifications, including object additions, component removals, specific area +deformations, and adjustments to local and global colors. Through extensive +experiments involving diverse scenes and editing operations, we have +demonstrated the capabilities and effectiveness of our method. Our project page +is: \url{https://cassiepython.github.io/MNeuEdit/} + +
+
+ comment: Project page: https://cassiepython.github.io/MNeuEdit/ +
+
+
+
+
+ + ☆ GPS-Gaussian: Generalizable Pixel-wise 3D Gaussian Splatting for + Real-time Human Novel View Synthesis + + +
+ We present a new approach, termed GPS-Gaussian, for synthesizing novel views +of a character in real time. The proposed method enables 2K-resolution +rendering under a sparse-view camera setting. Unlike the original Gaussian +Splatting or neural implicit rendering methods that necessitate per-subject +optimizations, we introduce Gaussian parameter maps defined on the source views +and directly regress Gaussian Splatting properties for instant novel view +synthesis without any fine-tuning or optimization. To this end, we train our +Gaussian parameter regression module on a large amount of human scan data, +jointly with a depth estimation module to lift 2D parameter maps to 3D space. +The proposed framework is fully differentiable, and experiments on several +datasets demonstrate that our method outperforms state-of-the-art methods while +achieving a far higher rendering speed. + +
+
+ comment: The link to our projectpage is https://shunyuanzheng.github.io +
+
+
+
+
+ + ☆ Latent Feature-Guided Diffusion Models for Shadow Removal + + +
+ Recovering textures under shadows has remained a challenging problem due to +the difficulty of inferring shadow-free scenes from shadow images. In this +paper, we propose the use of diffusion models as they offer a promising +approach to gradually refine the details of shadow regions during the diffusion +process. Our method improves this process by conditioning on a learned latent +feature space that inherits the characteristics of shadow-free images, thus +avoiding the limitation of conventional methods that condition on degraded +images only. Additionally, we propose to alleviate potential local optima +during training by fusing noise features with the diffusion network. We +demonstrate the effectiveness of our approach which outperforms the previous +best method by 13% in terms of RMSE on the AISTD dataset. Further, we explore +instance-level shadow removal, where our model outperforms the previous best +method by 82% in terms of RMSE on the DESOBA dataset. + +
+
+ comment: project page see https://kfmei.page/shadow-diffusion/index.html +
+
+
+
+
+ + ☆ Aligning and Prompting Everything All at Once for Universal Visual + Perception + + +
+ Vision foundation models have been explored recently to build general-purpose +vision systems. However, predominant paradigms, driven by casting +instance-level tasks as an object-word alignment, bring heavy cross-modality +interaction, which is not effective in prompting object detection and visual +grounding. Another line of work that focuses on pixel-level tasks often +encounters a large annotation gap of things and stuff, and suffers from mutual +interference between foreground-object and background-class segmentation. In +stark contrast to the prevailing methods, we present APE, a universal visual +perception model for aligning and prompting everything all at once in an image +to perform diverse tasks, i.e., detection, segmentation, and grounding, as an +instance-level sentence-object matching paradigm. Specifically, APE advances +the convergence of detection and grounding by reformulating language-guided +grounding as open-vocabulary detection, which efficiently scales up model +prompting to thousands of category vocabularies and region descriptions while +maintaining the effectiveness of cross-modality fusion. To bridge the +granularity gap of different pixel-level tasks, APE equalizes semantic and +panoptic segmentation to proxy instance learning by considering any isolated +regions as individual instances. APE aligns vision and language representation +on broad data with natural and challenging characteristics all at once without +task-specific fine-tuning. The extensive experiments on over 160 datasets +demonstrate that, with only one-suit of weights, APE outperforms (or is on par +with) the state-of-the-art models, proving that an effective yet universal +perception for anything aligning and prompting is indeed feasible. Codes and +trained models are released at https://github.com/shenyunhang/APE. + +
+
+
+
+
+ + ☆ Steerers: A framework for rotation equivariant keypoint descriptors + + +
+ Image keypoint descriptions that are discriminative and matchable over large +changes in viewpoint are vital for 3D reconstruction. However, descriptions +output by learned descriptors are typically not robust to camera rotation. +While they can be made more robust by, e.g., data augmentation, this degrades +performance on upright images. Another approach is test-time augmentation, +which incurs a significant increase in runtime. We instead learn a linear +transform in description space that encodes rotations of the input image. We +call this linear transform a steerer since it allows us to transform the +descriptions as if the image was rotated. From representation theory we know +all possible steerers for the rotation group. Steerers can be optimized (A) +given a fixed descriptor, (B) jointly with a descriptor or (C) we can optimize +a descriptor given a fixed steerer. We perform experiments in all of these +three settings and obtain state-of-the-art results on the rotation invariant +image matching benchmarks AIMS and Roto-360. We publish code and model weights +at github.com/georg-bn/rotation-steerers. + +
+
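In other words, a steerer is a matrix S acting on description space such that describing a rotated image is approximately the same as applying S to the original descriptions (so for quarter-turn rotations, applying S four times should give the identity). A toy NumPy sketch of how a steerer would be used at matching time follows; the 2D matrix and random descriptors here are placeholders, not the trained steerers from the repository.

```python
import numpy as np

def steer(S, desc, k):
    """Descriptions 'as if' the image were rotated by k quarter turns."""
    return desc @ np.linalg.matrix_power(S, k).T

def mutual_nn_matches(desc_a, desc_b):
    """Mutual nearest neighbours under cosine similarity."""
    a = desc_a / np.linalg.norm(desc_a, axis=1, keepdims=True)
    b = desc_b / np.linalg.norm(desc_b, axis=1, keepdims=True)
    sim = a @ b.T
    nn_ab, nn_ba = sim.argmax(axis=1), sim.argmax(axis=0)
    return [(i, j) for i, j in enumerate(nn_ab) if nn_ba[j] == i]

# Toy example: a 90-degree rotation matrix is a valid steerer for the
# four-fold rotation group, since applying it four times gives the identity.
S = np.array([[0.0, -1.0], [1.0, 0.0]])
desc = np.random.randn(5, 2)
desc_from_rotated_image = steer(S, desc, 1)   # stand-in for descriptors of a rotated image
print(mutual_nn_matches(steer(S, desc, 1), desc_from_rotated_image))
```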
+
+
+
+ + ☆ Guarding Barlow Twins Against Overfitting with Mixed Samples + + +
+ Self-supervised Learning (SSL) aims to learn transferable feature +representations for downstream applications without relying on labeled data. +The Barlow Twins algorithm, renowned for its widespread adoption and +straightforward implementation compared to its counterparts like contrastive +learning methods, minimizes feature redundancy while maximizing invariance to +common corruptions. Optimizing for the above objective forces the network to +learn useful representations, while avoiding noisy or constant features, +resulting in improved downstream task performance with limited adaptation. +Despite Barlow Twins' proven effectiveness in pre-training, the underlying SSL +objective can inadvertently cause feature overfitting due to the lack of strong +interaction between the samples unlike the contrastive learning approaches. +From our experiments, we observe that optimizing for the Barlow Twins objective +doesn't necessarily guarantee sustained improvements in representation quality +beyond a certain pre-training phase, and can potentially degrade downstream +performance on some datasets. To address this challenge, we introduce Mixed +Barlow Twins, which aims to improve sample interaction during Barlow Twins +training via linearly interpolated samples. This results in an additional +regularization term to the original Barlow Twins objective, assuming linear +interpolation in the input space translates to linearly interpolated features +in the feature space. Pre-training with this regularization effectively +mitigates feature overfitting and further enhances the downstream performance +on CIFAR-10, CIFAR-100, TinyImageNet, STL-10, and ImageNet datasets. The code +and checkpoints are available at: https://github.com/wgcban/mix-bt.git + +
+
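For orientation, here is a minimal PyTorch sketch of the Barlow Twins cross-correlation loss together with one plausible form of the mixed-sample regularizer described above: features of a linearly interpolated input are encouraged to produce a cross-correlation that interpolates between the two branches. The exact regularizer, weighting, and hyper-parameters used by Mixed Barlow Twins are those in the linked repository, not this sketch.

```python
import torch
import torch.nn.functional as F

def cross_corr(z1, z2):
    """Batch-normalized D x D cross-correlation matrix."""
    z1 = (z1 - z1.mean(0)) / (z1.std(0) + 1e-6)
    z2 = (z2 - z2.mean(0)) / (z2.std(0) + 1e-6)
    return (z1.T @ z2) / z1.shape[0]

def barlow_twins_loss(z_a, z_b, lambd=5e-3):
    c = cross_corr(z_a, z_b)
    on_diag = (torch.diagonal(c) - 1).pow(2).sum()
    off_diag = (c - torch.diag(torch.diagonal(c))).pow(2).sum()
    return on_diag + lambd * off_diag

def mixed_sample_regularizer(encoder, x_a, x_b, z_a, z_b, lam):
    """Assumed form of the mixed-sample term: the cross-correlation between the
    mixed-input features and one branch should follow the same linear interpolation."""
    z_m = encoder(lam * x_a + (1 - lam) * x_b)
    target = lam * cross_corr(z_a, z_a) + (1 - lam) * cross_corr(z_b, z_a)
    return F.mse_loss(cross_corr(z_m, z_a), target.detach())
```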
+ comment: Code and checkpoints are available at: + https://github.com/wgcban/mix-bt.git +
+
+
+
+
+ + ☆ Readout Guidance: Learning Control from Diffusion Features + + +
+ We present Readout Guidance, a method for controlling text-to-image diffusion +models with learned signals. Readout Guidance uses readout heads, lightweight +networks trained to extract signals from the features of a pre-trained, frozen +diffusion model at every timestep. These readouts can encode single-image +properties, such as pose, depth, and edges; or higher-order properties that +relate multiple images, such as correspondence and appearance similarity. +Furthermore, by comparing the readout estimates to a user-defined target, and +back-propagating the gradient through the readout head, these estimates can be +used to guide the sampling process. Compared to prior methods for conditional +generation, Readout Guidance requires significantly fewer added parameters and +training samples, and offers a convenient and simple recipe for reproducing +different forms of conditional control under a single framework, with a single +architecture and sampling procedure. We showcase these benefits in the +applications of drag-based manipulation, identity-consistent generation, and +spatially aligned control. Project page: https://readout-guidance.github.io. + +
+
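Mechanically, the guidance described above resembles a classifier-guidance update: the frozen diffusion model's intermediate features are passed through a readout head, the readout is compared to a user-provided target, and the gradient of that error nudges the current sample. The sketch below is only a generic illustration of that pattern; `denoiser` and `readout_head` are assumed interfaces, not the project's actual modules.

```python
import torch
import torch.nn.functional as F

def guided_denoising_step(x_t, t, denoiser, readout_head, target, scale=1.0):
    """One guided step. `denoiser(x, t)` is assumed to return the noise prediction
    and the intermediate features; `readout_head(features, t)` maps those features
    to the controlled property (pose, depth, correspondence, ...)."""
    x_t = x_t.detach().requires_grad_(True)
    eps, features = denoiser(x_t, t)
    loss = F.mse_loss(readout_head(features, t), target)
    grad = torch.autograd.grad(loss, x_t)[0]
    # Push the sample to reduce the readout error, then hand `eps` and the
    # adjusted sample to the usual sampler update.
    return eps, (x_t - scale * grad).detach()
```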
+
+
+
+ + ☆ Generative Powers of Ten + + +
+ We present a method that uses a text-to-image model to generate consistent +content across multiple image scales, enabling extreme semantic zooms into a +scene, e.g., ranging from a wide-angle landscape view of a forest to a macro +shot of an insect sitting on one of the tree branches. We achieve this through +a joint multi-scale diffusion sampling approach that encourages consistency +across different scales while preserving the integrity of each individual +sampling process. Since each generated scale is guided by a different text +prompt, our method enables deeper levels of zoom than traditional +super-resolution methods that may struggle to create new contextual structure +at vastly different scales. We compare our method qualitatively with +alternative techniques in image super-resolution and outpainting, and show that +our method is most effective at generating consistent multi-scale content. + +
+
+ comment: Project page: https://powers-of-10.github.io/ +
+
+
+
+
+ + ☆ Rejuvenating image-GPT as Strong Visual Representation Learners + + +
+ This paper enhances image-GPT (iGPT), one of the pioneering works that +introduce autoregressive pretraining to predict next pixels for visual +representation learning. Two simple yet essential changes are made. First, we +shift the prediction target from raw pixels to semantic tokens, enabling a +higher-level understanding of visual content. Second, we supplement the +autoregressive modeling by instructing the model to predict not only the next +tokens but also the visible tokens. This pipeline is particularly effective +when semantic tokens are encoded by discriminatively trained models, such as +CLIP. We introduce this novel approach as D-iGPT. Extensive experiments +showcase that D-iGPT excels as a strong learner of visual representations: A +notable achievement of D-iGPT is its compelling performance on the ImageNet-1K +dataset -- by training on publicly available datasets, D-iGPT achieves 89.5\% +top-1 accuracy with a vanilla ViT-Large model. This model also shows strong +generalization on downstream tasks and robustness on out-of-distribution +samples. Code is available at +\href{https://github.com/OliverRensu/D-iGPT}{https://github.com/OliverRensu/D-iGPT}. + +
+
+ comment: Larger models are coming +
+
+
+
+
+ + ☆ Repurposing Diffusion-Based Image Generators for Monocular Depth + Estimation + + +
+ Monocular depth estimation is a fundamental computer vision task. Recovering +3D depth from a single image is geometrically ill-posed and requires scene +understanding, so it is not surprising that the rise of deep learning has led +to a breakthrough. The impressive progress of monocular depth estimators has +mirrored the growth in model capacity, from relatively modest CNNs to large +Transformer architectures. Still, monocular depth estimators tend to struggle +when presented with images with unfamiliar content and layout, since their +knowledge of the visual world is restricted by the data seen during training, +and challenged by zero-shot generalization to new domains. This motivates us to +explore whether the extensive priors captured in recent generative diffusion +models can enable better, more generalizable depth estimation. We introduce +Marigold, a method for affine-invariant monocular depth estimation that is +derived from Stable Diffusion and retains its rich prior knowledge. The +estimator can be fine-tuned in a couple of days on a single GPU using only +synthetic training data. It delivers state-of-the-art performance across a wide +range of datasets, including over 20% performance gains in specific cases. +Project page: https://marigoldmonodepth.github.io. + +
+
+
+
+
+ + ☆ Optimizing Camera Configurations for Multi-View Pedestrian Detection + + +
+ Jointly considering multiple camera views (multi-view) is very effective for +pedestrian detection under occlusion. For such multi-view systems, it is +critical to have well-designed camera configurations, including camera +locations, directions, and fields-of-view (FoVs). Usually, these configurations +are crafted based on human experience or heuristics. In this work, we present a +novel solution that features a transformer-based camera configuration +generator. Using reinforcement learning, this generator autonomously explores +vast combinations within the action space and searches for configurations that +give the highest detection accuracy according to the training dataset. The +generator learns advanced techniques like maximizing coverage, minimizing +occlusion, and promoting collaboration. Across multiple simulation scenarios, +the configurations generated by our transformer-based model consistently +outperform random search, heuristic-based methods, and configurations designed +by human experts, shedding light on future camera layout optimization. + +
+
+
+
+
+ + ☆ Object Recognition as Next Token Prediction + + +
+ We present an approach to pose object recognition as next token prediction. +The idea is to apply a language decoder that auto-regressively predicts the +text tokens from image embeddings to form labels. To ground this prediction +process in auto-regression, we customize a non-causal attention mask for the +decoder, incorporating two key features: modeling tokens from different labels +to be independent, and treating image tokens as a prefix. This masking +mechanism inspires an efficient method - one-shot sampling - to simultaneously +sample tokens of multiple labels in parallel and rank generated labels by their +probabilities during inference. To further enhance the efficiency, we propose a +simple strategy to construct a compact decoder by simply discarding the +intermediate blocks of a pretrained language model. This approach yields a +decoder that matches the full model's performance while being notably more +efficient. The code is available at https://github.com/kaiyuyue/nxtp + +
+
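The non-causal mask described above can be pictured as follows: image tokens form a fully visible prefix, each label is decoded causally, and tokens from different labels never attend to one another, which is what makes one-shot parallel sampling of many labels possible. The snippet below is a plausible construction of such a mask in PyTorch, not the repository's implementation.

```python
import torch

def recognition_attention_mask(n_image, label_lengths):
    """Boolean mask (True = may attend) for an input laid out as
    [image tokens | label_1 tokens | label_2 tokens | ...]."""
    n = n_image + sum(label_lengths)
    mask = torch.zeros(n, n, dtype=torch.bool)
    mask[:, :n_image] = True                  # every token sees the image prefix
    start = n_image
    for length in label_lengths:
        causal = torch.ones(length, length).tril().bool()
        mask[start:start + length, start:start + length] = causal  # causal within a label
        start += length                        # and no attention across labels
    return mask

# Two image tokens followed by two candidate labels of 2 and 3 tokens.
print(recognition_attention_mask(2, [2, 3]).int())
```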
+ comment: auto-regression for recognition +
+
+
+
+
+ + ☆ iMatching: Imperative Correspondence Learning + + +
+ Learning feature correspondence is a foundational task in computer vision, +holding immense importance for downstream applications such as visual odometry +and 3D reconstruction. Despite recent progress in data-driven models, feature +correspondence learning is still limited by the lack of accurate per-pixel +correspondence labels. To overcome this difficulty, we introduce a new +self-supervised scheme, imperative learning (IL), for training feature +correspondence. It enables correspondence learning on arbitrary uninterrupted +videos without any camera pose or depth labels, heralding a new era for +self-supervised correspondence learning. Specifically, we formulated the +problem of correspondence learning as a bilevel optimization, which takes the +reprojection error from bundle adjustment as a supervisory signal for the +model. To avoid large memory and computation overhead, we leverage the +stationary point to effectively back-propagate the implicit gradients through +bundle adjustment. Through extensive experiments, we demonstrate superior +performance on tasks including feature matching and pose estimation, in which +we obtained an average of 30% accuracy gain over the state-of-the-art matching +models. + +
+
+
+
+
+ + ☆ DiffiT: Diffusion Vision Transformers for Image Generation + + +
+ Diffusion models with their powerful expressivity and high sample quality +have enabled many new applications and use-cases in various domains. For sample +generation, these models rely on a denoising neural network that generates +images by iterative denoising. Yet, the role of denoising network architecture +is not well-studied with most efforts relying on convolutional residual U-Nets. +In this paper, we study the effectiveness of vision transformers in +diffusion-based generative learning. Specifically, we propose a new model, +denoted as Diffusion Vision Transformers (DiffiT), which consists of a hybrid +hierarchical architecture with a U-shaped encoder and decoder. We introduce a +novel time-dependent self-attention module that allows attention layers to +adapt their behavior at different stages of the denoising process in an +efficient manner. We also introduce latent DiffiT which consists of transformer +model with the proposed self-attention layers, for high-resolution image +generation. Our results show that DiffiT is surprisingly effective in +generating high-fidelity images, and it achieves state-of-the-art (SOTA) +benchmarks on a variety of class-conditional and unconditional synthesis tasks. +In the latent space, DiffiT achieves a new SOTA FID score of 1.73 on +ImageNet-256 dataset. Repository: https://github.com/NVlabs/DiffiT + +
+
+ comment: Tech report +
+
+
+
+
+ + ☆ MANUS: Markerless Hand-Object Grasp Capture using Articulated 3D + Gaussians + + +
+ Understanding how we grasp objects with our hands has important applications +in areas like robotics and mixed reality. However, this challenging problem +requires accurate modeling of the contact between hands and objects. To capture +grasps, existing methods use skeletons, meshes, or parametric models that can +cause misalignments resulting in inaccurate contacts. We present MANUS, a +method for Markerless Hand-Object Grasp Capture using Articulated 3D Gaussians. +We build a novel articulated 3D Gaussians representation that extends 3D +Gaussian splatting for high-fidelity representation of articulating hands. +Since our representation uses Gaussian primitives, it enables us to efficiently +and accurately estimate contacts between the hand and the object. For the most +accurate results, our method requires tens of camera views that current +datasets do not provide. We therefore build MANUS-Grasps, a new dataset that +contains hand-object grasps viewed from 53 cameras across 30+ scenes, 3 +subjects, and comprising over 7M frames. In addition to extensive qualitative +results, we also show that our method outperforms others on a quantitative +contact evaluation method that uses paint transfer from the object to the hand. + +
+
+
+
+
+ + ☆ BerfScene: Bev-conditioned Equivariant Radiance Fields for Infinite 3D + Scene Generation + + +
+ Generating large-scale 3D scenes cannot be done by simply applying existing
+3D object synthesis techniques, since 3D scenes usually hold complex spatial
+configurations and consist of a number of objects at varying scales. We thus
+propose a practical and efficient 3D representation that incorporates an
+equivariant radiance field with the guidance of a bird's-eye view (BEV) map.
+Concretely, objects in synthesized 3D scenes can be easily manipulated by
+steering the corresponding BEV maps. Moreover, by adequately incorporating
+positional encoding and low-pass filters into the generator, the representation
+becomes equivariant to the given BEV map. Such equivariance allows us to
+produce large-scale, even infinite-scale, 3D scenes via synthesizing local
+scenes and then stitching them with smooth consistency. Extensive experiments
+on 3D scene datasets demonstrate the effectiveness of our approach. Our project
+website is at https://zqh0253.github.io/BerfScene/.
+
+
+
+
+
+ + ☆ Fast View Synthesis of Casual Videos + + +
+ Novel view synthesis from an in-the-wild video is difficult due to challenges +like scene dynamics and lack of parallax. While existing methods have shown +promising results with implicit neural radiance fields, they are slow to train +and render. This paper revisits explicit video representations to synthesize +high-quality novel views from a monocular video efficiently. We treat static +and dynamic video content separately. Specifically, we build a global static +scene model using an extended plane-based scene representation to synthesize +temporally coherent novel video. Our plane-based scene representation is +augmented with spherical harmonics and displacement maps to capture +view-dependent effects and model non-planar complex surface geometry. We opt to +represent the dynamic content as per-frame point clouds for efficiency. While +such representations are inconsistency-prone, minor temporal inconsistencies +are perceptually masked due to motion. We develop a method to quickly estimate +such a hybrid video representation and render novel views in real time. Our +experiments show that our method can render high-quality novel views from an +in-the-wild video with comparable quality to state-of-the-art methods while +being 100x faster in training and enabling real-time rendering. + +
+
+ comment: Project page: https://casual-fvs.github.io/ +
+
+
+
+
+ + ☆ GaussianAvatar: Towards Realistic Human Avatar Modeling from a Single + Video via Animatable 3D Gaussians + + +
+ We present GaussianAvatar, an efficient approach to creating realistic human +avatars with dynamic 3D appearances from a single video. We start by +introducing animatable 3D Gaussians to explicitly represent humans in various +poses and clothing styles. Such an explicit and animatable representation can +fuse 3D appearances more efficiently and consistently from 2D observations. Our +representation is further augmented with dynamic properties to support +pose-dependent appearance modeling, where a dynamic appearance network along +with an optimizable feature tensor is designed to learn the +motion-to-appearance mapping. Moreover, by leveraging the differentiable motion +condition, our method enables a joint optimization of motions and appearances +during avatar modeling, which helps to tackle the long-standing issue of +inaccurate motion estimation in monocular settings. The efficacy of +GaussianAvatar is validated on both the public dataset and our collected +dataset, demonstrating its superior performances in terms of appearance quality +and rendering efficiency. + +
+
+ comment: https://huliangxiao.github.io/GaussianAvatar +
+
+
+
+
+ + ☆ Style Aligned Image Generation via Shared Attention + + +
+ Large-scale Text-to-Image (T2I) models have rapidly gained prominence across +creative fields, generating visually compelling outputs from textual prompts. +However, controlling these models to ensure consistent style remains +challenging, with existing methods necessitating fine-tuning and manual +intervention to disentangle content and style. In this paper, we introduce +StyleAligned, a novel technique designed to establish style alignment among a +series of generated images. By employing minimal `attention sharing' during the +diffusion process, our method maintains style consistency across images within +T2I models. This approach allows for the creation of style-consistent images +using a reference style through a straightforward inversion operation. Our +method's evaluation across diverse styles and text prompts demonstrates +high-quality synthesis and fidelity, underscoring its efficacy in achieving +consistent style across various inputs. + +
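+ The "attention sharing" idea lends itself to a small, hedged sketch (tensor
+names and shapes are assumptions, not the authors' implementation): queries of
+each generated image attend to their own keys/values concatenated with those of
+a style reference image.
+
+import torch
+
+def shared_attention(q, k, v, k_ref, v_ref):
+    # q, k, v: (B, N, D) for the generated images; k_ref, v_ref: (1, N, D)
+    # from the reference image whose style should be propagated.
+    k_all = torch.cat([k, k_ref.expand_as(k)], dim=1)
+    v_all = torch.cat([v, v_ref.expand_as(v)], dim=1)
+    attn = torch.softmax(q @ k_all.transpose(1, 2) / q.shape[-1] ** 0.5, dim=-1)
+    return attn @ v_all
+
+q = k = v = torch.randn(2, 16, 64)
+k_ref, v_ref = torch.randn(1, 16, 64), torch.randn(1, 16, 64)
+print(shared_attention(q, k, v, k_ref, v_ref).shape)  # torch.Size([2, 16, 64])
+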
+
+ comment: Project page at style-aligned-gen.github.io +
+
+
+
+
+ + ☆ Can we truly transfer an actor's genuine happiness to avatars? An + investigation into virtual, real, posed and spontaneous faces + + +
+ A look is worth a thousand words is a popular phrase. And why is a simple +look enough to portray our feelings about something or someone? Behind this +question are the theoretical foundations of the field of psychology regarding +social cognition and the studies of psychologist Paul Ekman. Facial +expressions, as a form of non-verbal communication, are the primary way to +transmit emotions between human beings. The set of movements and expressions of +facial muscles that convey some emotional state of the individual to their +observers are targets of studies in many areas. Our research aims to evaluate +Ekman's action units in datasets of real human faces, posed and spontaneous, +and virtual human faces resulting from transferring real faces into Computer +Graphics faces. In addition, we also conducted a case study with specific movie +characters, such as SheHulk and Genius. We intend to find differences and +similarities in facial expressions between real and CG datasets, posed and +spontaneous faces, and also to consider the actors' genders in the videos. This +investigation can help several areas of knowledge, whether using real or +virtual human beings, in education, health, entertainment, games, security, and +even legal matters. Our results indicate that AU intensities are greater for +posed than spontaneous datasets, regardless of gender. Furthermore, there is a +smoothing of intensity up to 80 percent for AU6 and 45 percent for AU12 when a +real face is transformed into CG. + +
+
+ comment: To be published in Simpósio Brasileiro de Jogos e Entretenimento
+ Digital - SBGames 2023
+
+
+
+
+ + ☆ SplaTAM: Splat, Track & Map 3D Gaussians for Dense RGB-D SLAM + + +
+ Dense simultaneous localization and mapping (SLAM) is pivotal for embodied +scene understanding. Recent work has shown that 3D Gaussians enable +high-quality reconstruction and real-time rendering of scenes using multiple +posed cameras. In this light, we show for the first time that representing a +scene by 3D Gaussians can enable dense SLAM using a single unposed monocular +RGB-D camera. Our method, SplaTAM, addresses the limitations of prior radiance +field-based representations, including fast rendering and optimization, the +ability to determine if areas have been previously mapped, and structured map +expansion by adding more Gaussians. We employ an online tracking and mapping +pipeline while tailoring it to specifically use an underlying Gaussian +representation and silhouette-guided optimization via differentiable rendering. +Extensive experiments show that SplaTAM achieves up to 2X state-of-the-art +performance in camera pose estimation, map construction, and novel-view +synthesis, demonstrating its superiority over existing approaches, while +allowing real-time rendering of a high-resolution dense 3D map. + +
+
+
+
+
+ + ☆ VerA: Versatile Anonymization Fit for Clinical Facial Images + + +
+ The escalating legislative demand for data privacy in facial image +dissemination has underscored the significance of image anonymization. Recent +advancements in the field surpass traditional pixelation or blur methods, yet +they predominantly address regular single images. This leaves clinical image +anonymization -- a necessity for illustrating medical interventions -- largely +unaddressed. We present VerA, a versatile facial image anonymization that is +fit for clinical facial images where: (1) certain semantic areas must be +preserved to show medical intervention results, and (2) anonymizing image pairs +is crucial for showing before-and-after results. VerA outperforms or is on par +with state-of-the-art methods in de-identification and photorealism for regular +images. In addition, we validate our results on paired anonymization, and on +the anonymization of both single and paired clinical images with extensive +quantitative and qualitative evaluation. + +
+
+
+
+
+ + ☆ Mathematical Supplement for the $\texttt{gsplat}$ Library + + +
+ This report provides the mathematical details of the gsplat library, a
+modular toolbox for efficient differentiable Gaussian splatting, as proposed by
+Kerbl et al. It provides a self-contained reference for the computations
+involved in the forward and backward passes of differentiable Gaussian
+splatting. To facilitate practical usage and development, we provide a
+user-friendly Python API that exposes each component of the forward and
+backward passes in rasterization at github.com/nerfstudio-project/gsplat.
+
+
+ comment: Find the library at: https://docs.gsplat.studio/ +
+
+
+
+
+ + ☆ GIVT: Generative Infinite-Vocabulary Transformers + + +
+ We introduce generative infinite-vocabulary transformers (GIVT) which +generate vector sequences with real-valued entries, instead of discrete tokens +from a finite vocabulary. To this end, we propose two surprisingly simple +modifications to decoder-only transformers: 1) at the input, we replace the +finite-vocabulary lookup table with a linear projection of the input vectors; +and 2) at the output, we replace the logits prediction (usually mapped to a +categorical distribution) with the parameters of a multivariate Gaussian +mixture model. Inspired by the image-generation paradigm of VQ-GAN and MaskGIT, +where transformers are used to model the discrete latent sequences of a VQ-VAE, +we use GIVT to model the unquantized real-valued latent sequences of a VAE. +When applying GIVT to class-conditional image generation with iterative masked +modeling, we show competitive results with MaskGIT, while our approach +outperforms both VQ-GAN and MaskGIT when using it for causal modeling. Finally, +we obtain competitive results outside of image generation when applying our +approach to panoptic segmentation and depth estimation with a VAE-based variant +of the UViM framework. + +
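+ A minimal sketch of the two stated modifications, with illustrative names only
+(not the authors' code): the embedding lookup is replaced by a linear
+projection of real-valued input vectors, and the output head predicts the
+parameters of a Gaussian mixture instead of categorical logits.
+
+import torch
+import torch.nn as nn
+
+class GIVTStyleHead(nn.Module):
+    def __init__(self, latent_dim, model_dim, num_mixtures):
+        super().__init__()
+        self.in_proj = nn.Linear(latent_dim, model_dim)   # replaces nn.Embedding
+        self.out_proj = nn.Linear(model_dim, num_mixtures * (1 + 2 * latent_dim))
+        self.latent_dim, self.num_mixtures = latent_dim, num_mixtures
+
+    def embed(self, x):            # x: (B, T, latent_dim), real-valued tokens
+        return self.in_proj(x)
+
+    def distribution(self, h):     # h: (B, T, model_dim) transformer outputs
+        p = self.out_proj(h)
+        k, d = self.num_mixtures, self.latent_dim
+        logits, rest = p.split([k, 2 * k * d], dim=-1)
+        mu, log_sigma = rest.view(*h.shape[:-1], k, 2, d).unbind(-2)
+        return logits, mu, log_sigma.exp()   # mixture weights, means, scales
+
+head = GIVTStyleHead(latent_dim=16, model_dim=64, num_mixtures=8)
+logits, mu, sigma = head.distribution(torch.randn(2, 10, 64))
+print(logits.shape, mu.shape, sigma.shape)  # (2,10,8) (2,10,8,16) (2,10,8,16)
+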
+
+
+
+
+ + ☆ TriDeNT: Triple Deep Network Training for Privileged Knowledge + Distillation in Histopathology + + +
+ Computational pathology models rarely utilise data that will not be available +for inference. This means most models cannot learn from highly informative data +such as additional immunohistochemical (IHC) stains and spatial +transcriptomics. We present TriDeNT, a novel self-supervised method for +utilising privileged data that is not available during inference to improve +performance. We demonstrate the efficacy of this method for a range of +different paired data including immunohistochemistry, spatial transcriptomics +and expert nuclei annotations. In all settings, TriDeNT outperforms other +state-of-the-art methods in downstream tasks, with observed improvements of up +to 101%. Furthermore, we provide qualitative and quantitative measurements of +the features learned by these models and how they differ from baselines. +TriDeNT offers a novel method to distil knowledge from scarce or costly data +during training, to create significantly better models for routine inputs. + +
+
+
+
+
+ + ☆ ArtAdapter: Text-to-Image Style Transfer using Multi-Level Style Encoder + and Explicit Adaptation + + +
+ This work introduces ArtAdapter, a transformative text-to-image (T2I) style
+transfer framework that transcends traditional limitations of color,
+brushstrokes, and object shape, capturing high-level style elements such as
+composition and distinctive artistic expression. The integration of a
+multi-level style encoder with our proposed explicit adaptation mechanism
+enables ArtAdapter to achieve unprecedented fidelity in style transfer,
+ensuring close alignment with textual descriptions. Additionally, the
+incorporation of an Auxiliary Content Adapter (ACA) effectively separates
+content from style, alleviating the borrowing of content from style references.
+Moreover, our novel fast fine-tuning approach could further enhance zero-shot
+style representation while mitigating the risk of overfitting. Comprehensive
+evaluations confirm that ArtAdapter surpasses current state-of-the-art methods.
+
+
+
+
+
+ + ☆ Learning Pseudo-Labeler beyond Noun Concepts for Open-Vocabulary Object + Detection + + +
+ Open-vocabulary object detection (OVOD) has recently gained significant +attention as a crucial step toward achieving human-like visual intelligence. +Existing OVOD methods extend target vocabulary from pre-defined categories to +open-world by transferring knowledge of arbitrary concepts from vision-language +pre-training models to the detectors. While previous methods have shown +remarkable successes, they suffer from indirect supervision or limited +transferable concepts. In this paper, we propose a simple yet effective method +to directly learn region-text alignment for arbitrary concepts. Specifically, +the proposed method aims to learn arbitrary image-to-text mapping for +pseudo-labeling of arbitrary concepts, named Pseudo-Labeling for Arbitrary +Concepts (PLAC). The proposed method shows competitive performance on the +standard OVOD benchmark for noun concepts and a large improvement on referring +expression comprehension benchmark for arbitrary concepts. + +
+
+
+
+
+ + ☆ VideoSwap: Customized Video Subject Swapping with Interactive Semantic + Point Correspondence + + +
+ Current diffusion-based video editing primarily focuses on
+structure-preserved editing by utilizing various dense correspondences to
+ensure temporal consistency and motion alignment. However, these approaches are
+often ineffective when the target edit involves a shape change. To embark on
+video editing with shape change, we explore customized video subject swapping
+in this work, where we aim to replace the main subject in a source video with a
+target subject having a distinct identity and potentially different shape. In
+contrast to previous methods that rely on dense correspondences, we introduce
+the VideoSwap framework that exploits semantic point correspondences, inspired
+by our observation that only a small number of semantic points are necessary to
+align the subject's motion trajectory and modify its shape. We also introduce
+various user-point interactions (e.g., removing points and dragging points) to
+handle various semantic point correspondences. Extensive experiments
+demonstrate state-of-the-art video subject swapping results across a variety of
+real-world videos.
+
+
+ comment: Project page at https://videoswap.github.io +
+
+
+
+
+ + ☆ Integrating AI into CCTV Systems: A Comprehensive Evaluation of Smart + Video Surveillance in Community Space + + +
+ This article presents an AI-enabled Smart Video Surveillance (SVS) designed +to enhance safety in community spaces such as educational and recreational +areas, and small businesses. The proposed system innovatively integrates with +existing CCTV and wired camera networks, simplifying its adoption across +various community cases to leverage recent AI advancements. Our SVS system, +focusing on privacy, uses metadata instead of pixel data for activity +recognition, aligning with ethical standards. It features cloud-based +infrastructure and a mobile app for real-time, privacy-conscious alerts in +communities. + This article notably pioneers a comprehensive real-world evaluation of the +SVS system, covering AI-driven visual processing, statistical analysis, +database management, cloud communication, and user notifications. It's also the +first to assess an end-to-end anomaly detection system's performance, vital for +identifying potential public safety incidents. + For our evaluation, we implemented the system in a community college, serving +as an ideal model to exemplify the proposed system's capabilities. Our findings +in this setting demonstrate the system's robustness, with throughput, latency, +and scalability effectively managing 16 CCTV cameras. The system maintained a +consistent 16.5 frames per second (FPS) over a 21-hour operation. The average +end-to-end latency for detecting behavioral anomalies and alerting users was +26.76 seconds. + +
+
+
+
+
+ + ☆ GaussianAvatars: Photorealistic Head Avatars with Rigged 3D Gaussians + + +
+ We introduce GaussianAvatars, a new method to create photorealistic head +avatars that are fully controllable in terms of expression, pose, and +viewpoint. The core idea is a dynamic 3D representation based on 3D Gaussian +splats that are rigged to a parametric morphable face model. This combination +facilitates photorealistic rendering while allowing for precise animation +control via the underlying parametric model, e.g., through expression transfer +from a driving sequence or by manually changing the morphable model parameters. +We parameterize each splat by a local coordinate frame of a triangle and +optimize for explicit displacement offset to obtain a more accurate geometric +representation. During avatar reconstruction, we jointly optimize for the +morphable model parameters and Gaussian splat parameters in an end-to-end +fashion. We demonstrate the animation capabilities of our photorealistic avatar +in several challenging scenarios. For instance, we show reenactments from a +driving video, where our method outperforms existing works by a significant +margin. + +
+
+ comment: Project page: https://shenhanqian.github.io/gaussian-avatars +
+
+
+
+
+ + ☆ DUCK: Distance-based Unlearning via Centroid Kinematics + + +
+ Machine Unlearning is rising as a new field, driven by the pressing necessity
+of ensuring privacy in modern artificial intelligence models. This technique
+primarily aims to eradicate any residual influence of a specific subset of data
+from the knowledge acquired by a neural model during its training. This work
+introduces a novel unlearning algorithm, denoted as Distance-based Unlearning
+via Centroid Kinematics (DUCK), which employs metric learning to guide the
+removal of samples matching the nearest incorrect centroid in the embedding
+space. Evaluation of the algorithm's performance is conducted across various
+benchmark datasets in two distinct scenarios, class removal and homogeneous
+sampling removal, obtaining state-of-the-art performance. We introduce a novel
+metric, called Adaptive Unlearning Score (AUS), encompassing not only the
+efficacy of the unlearning process in forgetting target data but also
+quantifying the performance loss relative to the original model. Moreover, we
+propose a novel membership inference attack to assess the algorithm's capacity
+to erase previously acquired knowledge, designed to be adaptable to future
+methodologies.
+
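+ The centroid-driven removal step could look roughly like the hedged sketch
+below (names are ours, not the released DUCK code): each sample to be forgotten
+is pulled toward the nearest centroid of a class other than its own.
+
+import torch
+import torch.nn.functional as F
+
+def nearest_incorrect_centroid_loss(embeddings, labels, centroids):
+    # embeddings: (B, D) forget-set features, centroids: (C, D) class centroids
+    with torch.no_grad():
+        d = torch.cdist(embeddings, centroids)               # (B, C)
+        d.scatter_(1, labels.unsqueeze(1), float('inf'))     # mask the true class
+        target = centroids[d.argmin(dim=1)]                  # nearest wrong centroid
+    return F.mse_loss(embeddings, target)
+
+emb = torch.randn(8, 32, requires_grad=True)
+loss = nearest_incorrect_centroid_loss(emb, torch.randint(0, 10, (8,)),
+                                        torch.randn(10, 32))
+loss.backward()
+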
+
+
+
+
+ + ☆ TimeChat: A Time-sensitive Multimodal Large Language Model for Long + Video Understanding + + +
+ This work proposes TimeChat, a time-sensitive multimodal large language model +specifically designed for long video understanding. Our model incorporates two +key architectural contributions: (1) a timestamp-aware frame encoder that binds +visual content with the timestamp of each frame, and (2) a sliding video +Q-Former that produces a video token sequence of varying lengths to accommodate +videos of various durations. Additionally, we construct an instruction-tuning +dataset, encompassing 6 tasks and a total of 125K instances, to further enhance +TimeChat's instruction-following performance. Experiment results across various +video understanding tasks, such as dense captioning, temporal grounding, and +highlight detection, demonstrate TimeChat's strong zero-shot temporal +localization and reasoning capabilities. For example, it achieves +9.2 F1 score +and +2.8 CIDEr on YouCook2, +5.8 HIT@1 on QVHighlights, and +27.5 R@1 (IoU=0.5) +on Charades-STA, compared to state-of-the-art video large language models, +holding the potential to serve as a versatile video assistant for long-form +video comprehension tasks and satisfy realistic user requirements. + +
+
+ comment: 17 pages, 10 figures, code is available at + https://github.com/RenShuhuai-Andy/TimeChat +
+
+
+
+
+ + ☆ Implicit Learning of Scene Geometry from Poses for Global Localization + + +
+ Global visual localization estimates the absolute pose of a camera using a
+single image, in a previously mapped area. Obtaining the pose from a single
+image enables many robotics and augmented/virtual reality applications.
+Inspired by the latest advances in deep learning, many existing approaches
+directly learn and regress 6 DoF pose from an input image. However, these
+methods do not fully utilize the underlying scene geometry for pose regression.
+The challenge in monocular relocalization is the minimal availability of
+supervised training data, which is just the corresponding 6 DoF poses of the
+images. In this paper, we propose to utilize these minimal available labels
+(i.e., poses) to learn the underlying 3D geometry of the scene and use the
+geometry to estimate the 6 DoF camera pose. We present a learning method that
+uses these pose labels and rigid alignment to learn two 3D geometric
+representations (X, Y, Z coordinates) of the scene, one in the camera
+coordinate frame and the other in the global coordinate frame. Given a single
+image, it estimates these two 3D scene representations, which are then aligned
+to estimate a pose that matches the pose label. This formulation allows for the
+active inclusion of additional learning constraints to minimize 3D alignment
+errors between the two 3D scene representations, and 2D re-projection errors
+between the 3D global scene representation and 2D image pixels, resulting in
+improved localization accuracy. During inference, our model estimates the 3D
+scene geometry in camera and global frames and aligns them rigidly to obtain
+pose in real-time. We evaluate our work on three common visual localization
+datasets, conduct ablation studies, and show that our method exceeds
+state-of-the-art regression methods' pose accuracy on all datasets.
+
+
+ comment: IEEE ROBOTICS AND AUTOMATION LETTERS. ACCEPTED NOVEMBER, 2023 +
+
+
+
+
+ + ☆ VLTSeg: Simple Transfer of CLIP-Based Vision-Language Representations + for Domain Generalized Semantic Segmentation + + +
+ Domain generalization (DG) remains a significant challenge for perception
+based on deep neural networks (DNN), where domain shifts occur due to lighting,
+weather, or geolocation changes. In this work, we propose VLTSeg to enhance
+domain generalization in semantic segmentation, where the network is solely
+trained on the source domain and evaluated on unseen target domains. Our method
+leverages the inherent semantic robustness of vision-language models. First, by
+substituting traditional vision-only backbones with pre-trained encoders from
+CLIP and EVA-CLIP in a transfer learning setting, we find that in the field of
+DG, vision-language pre-training significantly outperforms supervised and
+self-supervised vision pre-training. We thus propose a new vision-language
+approach for domain generalized segmentation, which improves the domain
+generalization SOTA by 7.6% mIoU when training on the synthetic GTA5 dataset.
+We further show the superior generalization capabilities of vision-language
+segmentation models by reaching 76.48% mIoU on the popular Cityscapes-to-ACDC
+benchmark, outperforming the previous SOTA approach by 6.9% mIoU on the test
+set at the time of writing. Additionally, our approach shows strong in-domain
+generalization capabilities indicated by 86.1% mIoU on the Cityscapes test set,
+resulting in a shared first place with the previous SOTA on the current
+leaderboard at the time of submission.
+
+
+
+
+
+ + ☆ A multi-channel cycleGAN for CBCT to CT synthesis MICCAI 2023 + + +
+ Image synthesis is used to generate synthetic CTs (sCTs) from on-treatment +cone-beam CTs (CBCTs) with a view to improving image quality and enabling +accurate dose computation to facilitate a CBCT-based adaptive radiotherapy +workflow. As this area of research gains momentum, developments in sCT +generation methods are difficult to compare due to the lack of large public +datasets and sizeable variation in training procedures. To compare and assess +the latest advancements in sCT generation, the SynthRAD2023 challenge provides +a public dataset and evaluation framework for both MR and CBCT to sCT +synthesis. Our contribution focuses on the second task, CBCT-to-sCT synthesis. +By leveraging a multi-channel input to emphasize specific image features, our +approach effectively addresses some of the challenges inherent in CBCT imaging, +whilst restoring the contrast necessary for accurate visualisation of patients' +anatomy. Additionally, we introduce an auxiliary fusion network to further +enhance the fidelity of generated sCT images. + +
+
+ comment: RRRocket_Lollies submission for the Synthesizing computed tomography + for radiotherapy (SynthRAD2023) Challenge at MICCAI 2023 +
+
+
+
+
+ + ☆ ColonNeRF: Neural Radiance Fields for High-Fidelity Long-Sequence + Colonoscopy Reconstruction + + +
+ Colonoscopy reconstruction is pivotal for diagnosing colorectal cancer. +However, accurate long-sequence colonoscopy reconstruction faces three major +challenges: (1) dissimilarity among segments of the colon due to its meandering +and convoluted shape; (2) co-existence of simple and intricately folded +geometry structures; (3) sparse viewpoints due to constrained camera +trajectories. To tackle these challenges, we introduce a new reconstruction +framework based on neural radiance field (NeRF), named ColonNeRF, which +leverages neural rendering for novel view synthesis of long-sequence +colonoscopy. Specifically, to reconstruct the entire colon in a piecewise +manner, our ColonNeRF introduces a region division and integration module, +effectively reducing shape dissimilarity and ensuring geometric consistency in +each segment. To learn both the simple and complex geometry in a unified +framework, our ColonNeRF incorporates a multi-level fusion module that +progressively models the colon regions from easy to hard. Additionally, to +overcome the challenges from sparse views, we devise a DensiNet module for +densifying camera poses under the guidance of semantic consistency. We conduct +extensive experiments on both synthetic and real-world datasets to evaluate our +ColonNeRF. Quantitatively, our ColonNeRF outperforms existing methods on two +benchmarks over four evaluation metrics. Notably, our LPIPS-ALEX scores exhibit +a substantial increase of about 67%-85% on the SimCol-to-3D dataset. +Qualitatively, our reconstruction visualizations show much clearer textures and +more accurate geometric details. These sufficiently demonstrate our superior +performance over the state-of-the-art methods. + +
+
+ comment: for Project Page, see https://showlab.github.io/ColonNeRF/ +
+
+
+
+
+ + ☆ Towards Learning a Generalist Model for Embodied Navigation + + +
+ Building a generalist agent that can interact with the world is the
+intriguing target of AI systems, thus spurring research on embodied navigation,
+where an agent is required to navigate according to instructions or respond to
+queries. Despite the major progress attained, previous works primarily focus on
+task-specific agents and lack generalizability to unseen scenarios. Recently,
+LLMs have presented remarkable capabilities across various fields, and provided
+a promising opportunity for embodied navigation. Drawing on this, we propose
+the first generalist model for embodied navigation, NaviLLM. It adapts LLMs to
+embodied navigation by introducing schema-based instruction. The schema-based
+instruction flexibly casts various tasks into generation problems, thereby
+unifying a wide range of tasks. This approach allows us to integrate diverse
+data sources from various datasets into the training, equipping NaviLLM with a
+wide range of capabilities required by embodied navigation. We conduct
+extensive experiments to evaluate the performance and generalizability of our
+model. The experimental results demonstrate that our unified model achieves
+state-of-the-art performance on CVDN, SOON, and ScanQA. Specifically, it
+surpasses the previous state-of-the-art method by a significant margin of 29%
+in goal progress on CVDN. Moreover, our model also demonstrates strong
+generalizability and presents impressive results on unseen tasks, e.g.,
+embodied question answering and 3D captioning.
+
+
+ comment: 13 pages, 3 figures +
+
+
+
+
+ + ☆ SRTransGAN: Image Super-Resolution using Transformer based Generative + Adversarial Network + + +
+ Image super-resolution aims to synthesize a high-resolution image from a
+low-resolution image. It is an active research area for overcoming the
+resolution limitations of several applications, such as low-resolution object
+recognition, medical image enhancement, etc. Generative adversarial network
+(GAN) based methods have been the state-of-the-art for image super-resolution
+by utilizing convolutional neural network (CNN) based generator and
+discriminator networks. However, CNNs are not able to exploit global
+information as effectively as transformers, which are the recent breakthrough
+in deep learning owing to their self-attention mechanism. Motivated by the
+success of transformers in language and vision applications, we propose
+SRTransGAN for image super-resolution using a transformer-based GAN.
+Specifically, we propose a novel transformer-based encoder-decoder network as a
+generator to generate 2x and 4x images. We design the discriminator network
+using a vision transformer, which processes the image as a sequence of patches
+and is hence well suited for binary classification between synthesized and real
+high-resolution images. The proposed SRTransGAN outperforms existing methods by
+4.38% on average in terms of PSNR and SSIM scores. We also analyze the saliency
+map to understand the learning ability of the proposed method.
+
+
+
+
+
+ + ☆ Language-only Efficient Training of Zero-shot Composed Image Retrieval + + +
+ The composed image retrieval (CIR) task takes a composed query of image and
+text, aiming to retrieve images relevant to both conditions. Conventional CIR
+approaches need a training dataset composed of triplets of query image, query
+text, and target image, which is very expensive to collect. Several recent
+works have worked on the zero-shot (ZS) CIR paradigm to tackle the issue
+without using pre-collected triplets. However, the existing ZS-CIR methods show
+limited backbone scalability and generalizability due to the lack of diversity
+of the input texts during training. We propose a novel CIR framework, only
+using language for its training. Our LinCIR (Language-only training for CIR)
+can be trained only with text datasets by a novel self-supervision named
+self-masking projection (SMP). We project the text latent embedding to the
+token embedding space and construct a new text by replacing the keyword tokens
+of the original text. Then, we let the new and original texts have the same
+latent embedding vector. With this simple strategy, LinCIR is surprisingly
+efficient and highly effective; LinCIR with a CLIP ViT-G backbone is trained in
+48 minutes and shows the best ZS-CIR performances on four different CIR
+benchmarks, CIRCO, GeneCIS, FashionIQ, and CIRR, even outperforming a
+supervised method on FashionIQ. Code is available at
+https://github.com/navervision/lincir
+
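+ A very rough sketch of the self-masking projection (SMP) objective described
+above; the projection head phi, the encoder interface, and the keyword mask are
+all assumptions rather than details taken from the paper.
+
+import torch
+import torch.nn.functional as F
+
+def smp_loss(text_encoder, token_embeds, keyword_mask, phi):
+    # token_embeds: (B, T, E) token embeddings of the original text
+    # keyword_mask: (B, T) bool, True where a keyword token should be replaced
+    z = text_encoder(token_embeds)                    # (B, D) original latent
+    proj = phi(z).unsqueeze(1)                        # (B, 1, E) projected latent
+    masked = torch.where(keyword_mask.unsqueeze(-1), proj, token_embeds)
+    z_masked = text_encoder(masked)                   # latent of the altered text
+    return F.mse_loss(z_masked, z.detach())           # keep the two latents equal
+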
+
+ comment: First two authors contributed equally; 16 pages, 2.9MB +
+
+
+
+
+ + ☆ A Generative Self-Supervised Framework using Functional Connectivity in + fMRI Data NeurIPS 2023 + + +
+ Deep neural networks trained on Functional Connectivity (FC) networks
+extracted from functional Magnetic Resonance Imaging (fMRI) data have gained
+popularity due to the increasing availability of data and advances in model
+architectures, including Graph Neural Network (GNN). Recent research on the
+application of GNN to FC suggests that exploiting the time-varying properties
+of the FC could significantly improve the accuracy and interpretability of the
+model prediction. However, the high cost of acquiring high-quality fMRI data
+and corresponding phenotypic labels poses a hurdle to their application in
+real-world settings, such that a model naïvely trained in a supervised
+fashion can suffer from insufficient performance or a lack of generalization on
+a small number of data. In addition, most Self-Supervised Learning (SSL)
+approaches for GNNs to date adopt a contrastive strategy, which tends to lose
+appropriate semantic information when the graph structure is perturbed or does
+not leverage both spatial and temporal information simultaneously. In light of
+these challenges, we propose a generative SSL approach that is tailored to
+effectively harness spatio-temporal information within dynamic FC. Our
+empirical results, experimented with large-scale (>50,000) fMRI datasets,
+demonstrate that our approach learns valuable representations and enables the
+construction of accurate and robust models when fine-tuned for downstream
+tasks.
+
+
+ comment: NeurIPS 2023 Temporal Graph Learning Workshop +
+
+
+
+
+ + ☆ Bootstrapping SparseFormers from Vision Foundation Models + + +
+ The recently proposed SparseFormer architecture provides an alternative +approach to visual understanding by utilizing a significantly lower number of +visual tokens via adjusting RoIs, greatly reducing computational costs while +still achieving promising performance. However, training SparseFormers from +scratch is still expensive, and scaling up the number of parameters can be +challenging. In this paper, we propose to bootstrap SparseFormers from +ViT-based vision foundation models in a simple and efficient way. Since the +majority of SparseFormer blocks are the standard transformer ones, we can +inherit weights from large-scale pre-trained vision transformers and freeze +them as much as possible. Therefore, we only need to train the +SparseFormer-specific lightweight focusing transformer to adjust token RoIs and +fine-tune a few early pre-trained blocks to align the final token +representation. In such a way, we can bootstrap SparseFormer architectures from +various large-scale pre-trained models (e.g., IN-21K pre-trained AugRegs or +CLIPs) using a rather smaller amount of training samples (e.g., IN-1K) and +without labels or captions within just a few hours. As a result, the +bootstrapped unimodal SparseFormer (from AugReg-ViT-L/16-384) can reach 84.9% +accuracy on IN-1K with only 49 tokens, and the multimodal SparseFormer from +CLIPs also demonstrates notable zero-shot performance with highly reduced +computational cost without seeing any caption during the bootstrapping +procedure. In addition, CLIP-bootstrapped SparseFormers, which align the output +space with language without seeing a word, can serve as efficient vision +encoders in multimodal large language models. Code will be publicly available +at https://github.com/showlab/sparseformer + +
+
+ comment: Technical report +
+
+
+
+
+ + ☆ UniGS: Unified Representation for Image Generation and Segmentation + + +
+ This paper introduces a novel unified representation of diffusion models for
+image generation and segmentation. Specifically, we use a colormap to represent
+entity-level masks, addressing the challenge of varying entity numbers while
+aligning the representation closely with the image RGB domain. Two novel
+modules, including the location-aware color palette and progressive dichotomy
+module, are proposed to support our mask representation. On the one hand, a
+location-aware palette guarantees the colors' consistency to entities'
+locations. On the other hand, the progressive dichotomy module can efficiently
+decode the synthesized colormap to high-quality entity-level masks in a
+depth-first binary search without knowing the cluster numbers. To tackle the
+issue of lacking large-scale segmentation training data, we employ an
+inpainting pipeline and then improve the flexibility of diffusion models across
+various tasks, including inpainting, image synthesis, referring segmentation,
+and entity segmentation. Comprehensive experiments validate the efficiency of
+our approach, demonstrating comparable segmentation mask quality to
+state-of-the-art and adaptability to multiple tasks. The code will be released
+at https://github.com/qqlu/Entity.
+
+
+
+
+
+ + ☆ Semantics-aware Motion Retargeting with Vision-Language Models + + +
+ Capturing and preserving motion semantics is essential to motion retargeting +between animation characters. However, most of the previous works neglect the +semantic information or rely on human-designed joint-level representations. +Here, we present a novel Semantics-aware Motion reTargeting (SMT) method with +the advantage of vision-language models to extract and maintain meaningful +motion semantics. We utilize a differentiable module to render 3D motions. Then +the high-level motion semantics are incorporated into the motion retargeting +process by feeding the vision-language model with the rendered images and +aligning the extracted semantic embeddings. To ensure the preservation of +fine-grained motion details and high-level semantics, we adopt a two-stage +pipeline consisting of skeleton-aware pre-training and fine-tuning with +semantics and geometry constraints. Experimental results show the effectiveness +of the proposed method in producing high-quality motion retargeting results +while accurately preserving motion semantics. Project page can be found at +https://sites.google.com/view/smtnet. + +
+
+
+
+
+ + ☆ Instance-guided Cartoon Editing with a Large-scale Dataset + + +
+ Cartoon editing, appreciated by both professional illustrators and hobbyists,
+allows extensive creative freedom and the development of original narratives
+within the cartoon domain. However, the existing literature on cartoon editing
+is complex and leans heavily on manual operations, owing to the challenge of
+automatic identification of individual character instances. Therefore, an
+automated segmentation of these elements becomes imperative to facilitate a
+variety of cartoon editing applications such as visual style editing, motion
+decomposition and transfer, and the computation of stereoscopic depths for an
+enriched visual experience. Unfortunately, most current segmentation methods
+are designed for natural photographs, failing to cope with the intricate
+aesthetics of cartoon subjects, thus lowering segmentation quality. The major
+challenge stems from two key shortcomings: the rarity of high-quality
+cartoon-dedicated datasets and the absence of competent models for
+high-resolution instance extraction on cartoons. To address this, we introduce
+a high-quality dataset of over 100k paired high-resolution cartoon images and
+their instance labeling masks. We also present an instance-aware image
+segmentation model that can generate accurate, high-resolution segmentation
+masks for characters in cartoon images. We show that the proposed approach
+enables a range of segmentation-dependent cartoon editing applications like 3D
+Ken Burns parallax effects, text-guided cartoon style editing, and puppet
+animation from illustrations and manga.
+
+
+ comment: Project page: https://cartoonsegmentation.github.io/ 10 pages, 10 + figures +
+
+
+
+
+ + ☆ COTR: Compact Occupancy TRansformer for Vision-based 3D Occupancy + Prediction + + +
+ The autonomous driving community has shown significant interest in 3D
+occupancy prediction, driven by its exceptional geometric perception and
+general object recognition capabilities. To achieve this, current works try to
+construct a Tri-Perspective View (TPV) or Occupancy (OCC) representation
+extending from the Bird's-Eye-View perception. However, compressed views like
+the TPV representation lose 3D geometry information, while the raw and sparse
+OCC representation requires heavy but redundant computational costs. To address
+the above limitations, we propose Compact Occupancy TRansformer (COTR), with a
+geometry-aware occupancy encoder and a semantic-aware group decoder to
+reconstruct a compact 3D OCC representation. The occupancy encoder first
+generates a compact geometrical OCC feature through efficient explicit-implicit
+view transformation. Then, the occupancy decoder further enhances the semantic
+discriminability of the compact OCC representation by a coarse-to-fine semantic
+grouping strategy. Empirical experiments show that there are evident
+performance gains across multiple baselines, e.g., COTR outperforms baselines
+with a relative improvement of 8%-15%, demonstrating the superiority of our
+method.
+
+
+
+
+
+ + ☆ A Reliable Representation with Bidirectional Transition Model for Visual + Reinforcement Learning Generalization + + +
+ Visual reinforcement learning has proven effective in solving control tasks +with high-dimensional observations. However, extracting reliable and +generalizable representations from vision-based observations remains a central +challenge. Inspired by the human thought process, when the representation +extracted from the observation can predict the future and trace history, the +representation is reliable and accurate in comprehending the environment. Based +on this concept, we introduce a Bidirectional Transition (BiT) model, which +leverages the ability to bidirectionally predict environmental transitions both +forward and backward to extract reliable representations. Our model +demonstrates competitive generalization performance and sample efficiency on +two settings of the DeepMind Control suite. Additionally, we utilize robotic +manipulation and CARLA simulators to demonstrate the wide applicability of our +method. + +
+
+
+
+
+ + ☆ Unsupervised Anomaly Detection using Aggregated Normative Diffusion + + +
+ Early detection of anomalies in medical images such as brain MRI is highly +relevant for diagnosis and treatment of many conditions. Supervised machine +learning methods are limited to a small number of pathologies where there is +good availability of labeled data. In contrast, unsupervised anomaly detection +(UAD) has the potential to identify a broader spectrum of anomalies by spotting +deviations from normal patterns. Our research demonstrates that existing +state-of-the-art UAD approaches do not generalise well to diverse types of +anomalies in realistic multi-modal MR data. To overcome this, we introduce a +new UAD method named Aggregated Normative Diffusion (ANDi). ANDi operates by +aggregating differences between predicted denoising steps and ground truth +backwards transitions in Denoising Diffusion Probabilistic Models (DDPMs) that +have been trained on pyramidal Gaussian noise. We validate ANDi against three +recent UAD baselines, and across three diverse brain MRI datasets. We show that +ANDi, in some cases, substantially surpasses these baselines and shows +increased robustness to varying types of anomalies. Particularly in detecting +multiple sclerosis (MS) lesions, ANDi achieves improvements of up to 178% in +terms of AUPRC. + +
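+ A simplified, hypothetical sketch of the aggregation idea: it collapses the
+comparison between predicted denoising steps and ground-truth backward
+transitions into a noise-prediction discrepancy, so it approximates the
+described method rather than reproducing the authors' implementation.
+
+import torch
+
+def aggregated_anomaly_map(model, x0, alphas_cumprod, timesteps):
+    anomaly = torch.zeros_like(x0)
+    for t in timesteps:
+        a = alphas_cumprod[t]
+        eps = torch.randn_like(x0)
+        x_t = a.sqrt() * x0 + (1 - a).sqrt() * eps    # forward diffusion step
+        eps_hat = model(x_t, t)                       # model's predicted noise
+        anomaly += (eps_hat - eps).abs()              # per-pixel discrepancy
+    return anomaly / len(timesteps)                   # large values flag anomalies
+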
+
+
+
+
+ + ☆ Adapting Short-Term Transformers for Action Detection in Untrimmed + Videos + + +
+ Vision transformer (ViT) has shown high potential in video recognition, owing
+to its flexible design, adaptable self-attention mechanisms, and the efficacy
+of masked pre-training. Yet, it still remains unclear how to adapt these
+pre-trained short-term ViTs for temporal action detection (TAD) in untrimmed
+videos. The existing works treat them as off-the-shelf feature extractors for
+each short trimmed snippet without capturing the fine-grained relation among
+different snippets in a broader temporal context. To mitigate this issue, this
+paper focuses on designing a new mechanism for adapting these pre-trained ViT
+models as a unified long-form video transformer to fully unleash its modeling
+power in capturing inter-snippet relation, while still keeping low computation
+overhead and memory consumption for efficient TAD. To this end, we design
+effective cross-snippet propagation modules to gradually exchange short-term
+video information among different snippets from two levels. For inner-backbone
+information propagation, we introduce a cross-snippet propagation strategy to
+enable multi-snippet temporal feature interaction inside the backbone. For
+post-backbone information propagation, we propose temporal transformer layers
+for further clip-level modeling. With the plain ViT-B pre-trained with
+VideoMAE, our end-to-end temporal action detector (ViT-TAD) yields very
+competitive performance compared to previous temporal action detectors,
+reaching up to 69.0 average mAP on THUMOS14, 37.12 average mAP on
+ActivityNet-1.3 and 17.20 average mAP on FineAction.
+
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ☆ InstructTA: Instruction-Tuned Targeted Attack for Large Vision-Language + Models + + +
+ Large vision-language models (LVLMs) have demonstrated their incredible
+capability in image understanding and response generation. However, this rich
+visual interaction also makes LVLMs vulnerable to adversarial examples. In this
+paper, we formulate a novel and practical gray-box attack scenario in which the
+adversary can only access the visual encoder of the victim LVLM, without the
+knowledge of its prompts (which are often proprietary for service providers and
+not publicly available) and its underlying large language model (LLM). This
+practical setting poses challenges to the cross-prompt and cross-model
+transferability of targeted adversarial attacks, which aim to confuse the LVLM
+into outputting a response that is semantically similar to the attacker's
+chosen target text. To this end, we propose an instruction-tuned targeted
+attack (dubbed InstructTA) to deliver the targeted adversarial attack on LVLMs
+with high transferability. Initially, we utilize a public text-to-image
+generative model to "reverse" the target response into a target image, and
+employ GPT-4 to infer a reasonable instruction $\boldsymbol{p}^\prime$ from the
+target response. We then form a local surrogate model (sharing the same visual
+encoder with the victim LVLM) to extract instruction-aware features of an
+adversarial image example and the target image, and minimize the distance
+between these two features to optimize the adversarial example. To further
+improve the transferability, we augment the instruction
+$\boldsymbol{p}^\prime$ with instructions paraphrased from an LLM. Extensive
+experiments demonstrate the superiority of our proposed method in targeted
+attack performance and transferability.
+
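+ The feature-matching core of such an attack can be sketched generically (plain
+PGD against a surrogate encoder; this omits the instruction-tuning and GPT-4
+components and all names are made up for illustration):
+
+import torch
+
+def feature_matching_attack(encoder, x, x_target, eps=8/255, alpha=1/255, steps=100):
+    with torch.no_grad():
+        f_target = encoder(x_target)                  # features of the target image
+    delta = torch.zeros_like(x, requires_grad=True)
+    for _ in range(steps):
+        loss = torch.norm(encoder(x + delta) - f_target, dim=-1).mean()
+        loss.backward()
+        with torch.no_grad():
+            delta -= alpha * delta.grad.sign()        # move features toward target
+            delta.clamp_(-eps, eps)                   # stay in the L-infinity budget
+            delta.grad.zero_()
+    return (x + delta).clamp(0, 1).detach()
+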
+
+
+
+
+ + ☆ Unleashing the Potential of Large Language Model: Zero-shot VQA for + Flood Disaster Scenario + + +
+ Visual question answering (VQA) is a fundamental and essential AI task, and +VQA-based disaster scenario understanding is a hot research topic. For +instance, we can ask questions about a disaster image by the VQA model and the +answer can help identify whether anyone or anything is affected by the +disaster. However, previous VQA models for disaster damage assessment have some +shortcomings, such as limited candidate answer space, monotonous question +types, and limited answering capability of existing models. In this paper, we +propose a zero-shot VQA model named Zero-shot VQA for Flood Disaster Damage +Assessment (ZFDDA). It is a VQA model for damage assessment without +pre-training. Also, with flood disaster as the main research object, we build a +Freestyle Flood Disaster Image Question Answering dataset (FFD-IQA) to evaluate +our VQA model. This new dataset expands the question types to include +free-form, multiple-choice, and yes-no questions. At the same time, we expand +the size of the previous dataset to contain a total of 2,058 images and 22,422 +question-meta ground truth pairs. Most importantly, our model uses +well-designed chain of thought (CoT) demonstrations to unlock the potential of +the large language model, allowing zero-shot VQA to show better performance in +disaster scenarios. The experimental results show that the accuracy in +answering complex questions is greatly improved with CoT prompts. Our study +provides a research basis for subsequent research of VQA for other disaster +scenarios. + +
+
+ comment: accepted by The 4th International Conference on Artificial + Intelligence and Computer Engineering +
+
+
+
+
+ + ☆ FeaInfNet: Diagnosis in Medical Image with Feature-Driven Inference and + Visual Explanations + + +
+ Interpretable deep learning models have received widespread attention in the +field of image recognition. Due to the unique multi-instance learning of +medical images and the difficulty in identifying decision-making regions, many +interpretability models that have been proposed still have problems of +insufficient accuracy and interpretability in medical image disease diagnosis. +To solve these problems, we propose feature-driven inference network +(FeaInfNet). Our first key innovation involves proposing a feature-based +network reasoning structure, which is applied to FeaInfNet. The network of this +structure compares the similarity of each sub-region image patch with the +disease templates and normal templates that may appear in the region, and +finally combines the comparison of each sub-region to make the final diagnosis. +It simulates the diagnosis process of doctors to make the model interpretable +in the reasoning process, while avoiding the misleading caused by the +participation of normal areas in reasoning. Secondly, we propose local feature +masks (LFM) to extract feature vectors in order to provide global information +for these vectors, thus enhancing the expressive ability of the FeaInfNet. +Finally, we propose adaptive dynamic masks (Adaptive-DM) to interpret feature +vectors and prototypes into human-understandable image patches to provide +accurate visual interpretation. We conducted qualitative and quantitative +experiments on multiple publicly available medical datasets, including RSNA, +iChallenge-PM, Covid-19, ChinaCXRSet, and MontgomerySet. The results of our +experiments validate that our method achieves state-of-the-art performance in +terms of classification accuracy and interpretability compared to baseline +methods in medical image diagnosis. Additional ablation studies verify the +effectiveness of each of our proposed components. + +
+
+
+
+
+ + ☆ Unveiling Objects with SOLA: An Annotation-Free Image Search on the + Object Level for Automotive Data Sets + + +
+ Huge image data sets are the fundament for the development of the perception +of automated driving systems. A large number of images is necessary to train +robust neural networks that can cope with diverse situations. A sufficiently +large data set contains challenging situations and objects. For testing the +resulting functions, it is necessary that these situations and objects can be +found and extracted from the data set. While it is relatively easy to record a +large amount of unlabeled data, it is far more difficult to find demanding +situations and objects. However, during the development of perception systems, +it must be possible to access challenging data without having to perform +lengthy and time-consuming annotations. A developer must therefore be able to +search dynamically for specific situations and objects in a data set. Thus, we +designed a method which is based on state-of-the-art neural networks to search +for objects with certain properties within an image. For the ease of use, the +query of this search is described using natural language. To determine the time +savings and performance gains, we evaluated our method qualitatively and +quantitatively on automotive data sets. + +
+
+
+
+
+ + ☆ Robot Synesthesia: In-Hand Manipulation with Visuotactile Sensing + + +
+ Executing contact-rich manipulation tasks necessitates the fusion of tactile +and visual feedback. However, the distinct nature of these modalities poses +significant challenges. In this paper, we introduce a system that leverages +visual and tactile sensory inputs to enable dexterous in-hand manipulation. +Specifically, we propose Robot Synesthesia, a novel point cloud-based tactile +representation inspired by human tactile-visual synesthesia. This approach +allows for the simultaneous and seamless integration of both sensory inputs, +offering richer spatial information and facilitating better reasoning about +robot actions. The method, trained in a simulated environment and then deployed +to a real robot, is applicable to various in-hand object rotation tasks. +Comprehensive ablations are performed on how the integration of vision and +touch can improve reinforcement learning and Sim2Real performance. Our project +page is available at https://yingyuan0414.github.io/visuotactile/ . + +
+
+ comment: Project page: https://yingyuan0414.github.io/visuotactile/ +
+
+
+
+
+ + ☆ Generalization by Adaptation: Diffusion-Based Domain Extension for + Domain-Generalized Semantic Segmentation WACV 2024 + + +
+ When models, e.g., for semantic segmentation, are applied to images that are +vastly different from training data, the performance will drop significantly. +Domain adaptation methods try to overcome this issue, but need samples from the +target domain. However, this might not always be feasible for various reasons +and therefore domain generalization methods are useful as they do not require +any target data. We present a new diffusion-based domain extension (DIDEX) +method and employ a diffusion model to generate a pseudo-target domain with +diverse text prompts. In contrast to existing methods, this allows to control +the style and content of the generated images and to introduce a high +diversity. In a second step, we train a generalizing model by adapting towards +this pseudo-target domain. We outperform previous approaches by a large margin +across various datasets and architectures without using any real data. For the +generalization from GTA5, we improve state-of-the-art mIoU performance by 3.8% +absolute on average and for SYNTHIA by 11.8% absolute, marking a big step for +the generalization performance on these benchmarks. Code is available at +https://github.com/JNiemeijer/DIDEX + +
+
+ comment: Accepted to WACV 2024 +
+
+
+
+
+ + ☆ VividTalk: One-Shot Audio-Driven Talking Head Generation Based on 3D + Hybrid Prior + + +
+ Audio-driven talking head generation has drawn much attention in recent
+years, and many efforts have been made in lip-sync, expressive facial
+expressions, natural head pose generation, and high video quality. However, no
+model has yet led or tied on all these metrics due to the one-to-many mapping
+between audio and motion. In this paper, we propose VividTalk, a two-stage
+generic framework that supports generating high-visual-quality talking head
+videos with all the above properties. Specifically, in the first stage, we map
+the audio to mesh by learning two motions, including non-rigid expression
+motion and rigid head motion. For expression motion, both blendshapes and
+vertices are adopted as intermediate representations to maximize the
+representation ability of the model. For natural head motion, a novel learnable
+head pose codebook with a two-phase training mechanism is proposed. In the
+second stage, we propose a dual-branch motion-VAE and a generator to transform
+the meshes into dense motion and synthesize high-quality video frame-by-frame.
+Extensive experiments show that the proposed VividTalk can generate
+high-visual-quality talking head videos with lip-sync and realism enhanced by a
+large margin, and outperforms previous state-of-the-art works in objective and
+subjective comparisons.
+
+
+ comment: 10 pages, 8 figures +
+
+
+
+
+ + ☆ Few Clicks Suffice: Active Test-Time Adaptation for Semantic + Segmentation + + +
+ Test-time adaptation (TTA) adapts the pre-trained models during inference +using unlabeled test data and has received a lot of research attention due to +its potential practical value. Unfortunately, without any label supervision, +existing TTA methods rely heavily on heuristic or empirical studies. Deciding where to +update the model often yields suboptimal results or incurs additional computational +cost. Meanwhile, there is still a significant performance gap +between the TTA approaches and their supervised counterparts. Motivated by +active learning, in this work, we propose an active test-time adaptation setup for +semantic segmentation. Specifically, we introduce the human-in-the-loop +pattern during the testing phase, which queries very few labels to facilitate +predictions and model updates in an online manner. To do so, we propose a +simple but effective ATASeg framework, which consists of two parts, i.e., model +adapter and label annotator. Extensive experiments demonstrate that ATASeg +bridges the performance gap between TTA methods and their supervised +counterparts with only extremely few annotations; even a single labeling click +surpasses known SOTA TTA methods by 2.6% average mIoU on the ACDC benchmark. +Empirical results imply that progress in either the model adapter or the label +annotator will bring improvements to the ATASeg framework, giving it substantial +potential for both research and real-world use. + 
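+ The label annotator is the easiest part to prototype: the abstract does not spell out how the few clicks are chosen, but a natural placeholder is entropy-based uncertainty sampling over the test image. A minimal PyTorch sketch, assuming per-pixel logits and treating one click as one queried pixel label (function and variable names are illustrative, not the authors' implementation):
+```python
+import torch
+
+def select_pixels_to_annotate(logits: torch.Tensor, clicks_per_image: int = 1):
+    """Pick the most uncertain pixels of a segmentation prediction as
+    annotation queries (one "click" = one queried pixel label).
+
+    logits: (C, H, W) raw class scores for a single test image.
+    Returns a list of (row, col) coordinates to send to the human annotator.
+    """
+    probs = logits.softmax(dim=0)                                 # (C, H, W)
+    entropy = -(probs * probs.clamp_min(1e-8).log()).sum(dim=0)   # (H, W)
+    top = entropy.flatten().topk(clicks_per_image).indices
+    h, w = entropy.shape
+    return [(int(i) // w, int(i) % w) for i in top]
+
+# Toy usage: a 19-class prediction on a 64x64 crop, querying a single click.
+logits = torch.randn(19, 64, 64)
+print(select_pixels_to_annotate(logits, clicks_per_image=1))
+```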
+
+ comment: 15 pages, 10 figures +
+
+
+
+
+ + ☆ Equivariant plug-and-play image reconstruction + + +
+ Plug-and-play algorithms constitute a popular framework for solving inverse +imaging problems that rely on the implicit definition of an image prior via a +denoiser. These algorithms can leverage powerful pre-trained denoisers to solve +a wide range of imaging tasks, circumventing the necessity to train models on a +per-task basis. Unfortunately, plug-and-play methods often show unstable +behaviors, hampering their promise of versatility and leading to suboptimal +quality of reconstructed images. In this work, we show that enforcing +equivariance to certain groups of transformations (rotations, reflections, +and/or translations) on the denoiser strongly improves the stability of the +algorithm as well as its reconstruction quality. We provide a theoretical +analysis that illustrates the role of equivariance on better performance and +stability. We present a simple algorithm that enforces equivariance on any +existing denoiser by simply applying a random transformation to the input of +the denoiser and the inverse transformation to the output at each iteration of +the algorithm. Experiments on multiple imaging modalities and denoising +networks show that the equivariant plug-and-play algorithm improves both the +reconstruction performance and the stability compared to their non-equivariant +counterparts. + +
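+ The randomized transform/inverse-transform wrapper described above takes only a few lines. A minimal numpy sketch, assuming the transformation group is 90-degree rotations plus horizontal flips and that `denoiser` is any plug-and-play denoiser acting on 2D arrays (names are illustrative):
+```python
+import numpy as np
+
+def equivariant_denoise(denoiser, image, rng=np.random.default_rng()):
+    """One equivariant denoiser call: draw a random group element, apply it
+    to the input, denoise, then apply the inverse transform to the output."""
+    k = int(rng.integers(0, 4))          # number of 90-degree rotations
+    flip = bool(rng.integers(0, 2))      # horizontal flip or not
+
+    x = np.rot90(image, k)
+    if flip:
+        x = np.fliplr(x)
+
+    y = denoiser(x)
+
+    if flip:                             # invert the transforms in reverse order
+        y = np.fliplr(y)
+    return np.rot90(y, -k)
+
+# Inside a plug-and-play iteration the prior step x = denoiser(z) is simply
+# replaced by x = equivariant_denoise(denoiser, z).
+```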
+
+
+
+
+ + ☆ Collaborative Neural Painting + + +
+ The process of painting fosters creativity and rational planning. However, +existing generative AI mostly focuses on producing visually pleasant artworks, +without emphasizing the painting process. We introduce a novel task, +Collaborative Neural Painting (CNP), to facilitate collaborative art painting +generation between humans and machines. Given any number of user-input +brushstrokes as the context or just the desired object class, CNP should +produce a sequence of strokes supporting the completion of a coherent painting. +Importantly, the process can be gradual and iterative, thus allowing user +modifications at any phase until completion. Moreover, we propose to solve +this task using a painting representation based on a sequence of parametrized +strokes, which facilitates both editing and composition operations. These +parametrized strokes are processed by a Transformer-based architecture with a +novel attention mechanism to model the relationship between the input strokes +and the strokes to complete. We also propose a new masking scheme to reflect +the interactive nature of CNP and adopt diffusion models as the basic learning +process for its effectiveness and diversity in the generative field. Finally, +to develop and validate methods on the novel task, we introduce a new dataset +of painted objects and an evaluation protocol to benchmark CNP both +quantitatively and qualitatively. We demonstrate the effectiveness of our +approach and the potential of the CNP task as a promising avenue for future +research. + 
+
+ comment: Submitted to Computer Vision and Image Understanding, project website + at https://fodark.github.io/collaborative-neural-painting/ +
+
+
+
+
+ + ☆ Exploring Multi-Modal Fusion for Image Manipulation Detection and + Localization + + +
+ Recent image manipulation localization and detection techniques usually +leverage forensic artifacts and traces that are produced by a noise-sensitive +filter, such as SRM and Bayar convolution. In this paper, we showcase that +different filters commonly used in such approaches excel at unveiling different +types of manipulations and provide complementary forensic traces. Thus, we +explore ways of merging the outputs of such filters and aim to leverage the +complementary nature of the artifacts produced to perform image manipulation +localization and detection (IMLD). We propose two distinct methods: one that +produces independent features from each forensic filter and then fuses them +(this is referred to as late fusion) and one that performs early mixing of +different modal outputs and produces early combined features (this is referred +to as early fusion). We demonstrate that both approaches achieve competitive +performance for both image manipulation localization and detection, +outperforming state-of-the-art models across several datasets. + +
+
+ comment: Accepted for publication, 30th Int. Conf. on MultiMedia Modeling (MMM + 2024), Amsterdam, NL, Jan.-Feb. 2024. This is the "submitted manuscript" + version +
+
+
+
+
+ + ☆ Two-stage optimized unified adversarial patch for attacking + visible-infrared cross-modal detectors in the physical world + + +
+ Currently, many studies have addressed security concerns related to visible +and infrared detectors independently. In practical scenarios, utilizing +cross-modal detectors for tasks proves more reliable than relying on +single-modal detectors. Despite this, there is a lack of comprehensive security +evaluations for cross-modal detectors. While existing research has explored the +feasibility of attacks against cross-modal detectors, the implementation of a +robust attack remains unaddressed. This work introduces the Two-stage Optimized +Unified Adversarial Patch (TOUAP) designed for performing attacks against +visible-infrared cross-modal detectors in real-world, black-box settings. The +TOUAP employs a two-stage optimization process: first, particle swarm optimization (PSO) optimizes an +irregular polygonal infrared patch to attack the infrared detector; second, +a color QR code is optimized, and the shape information of the infrared patch +from the first stage is used as a mask. The resulting irregular polygon visible +modal patch executes an attack on the visible detector. Through extensive +experiments conducted in both digital and physical environments, we validate +the effectiveness and robustness of the proposed method. As TOUAP surpasses +baseline performance, we argue that this attack vector deserves wider attention. + 
+
+
+
+
+ + ☆ IMProv: Inpainting-based Multimodal Prompting for Computer Vision Tasks + + +
+ In-context learning allows adapting a model to new tasks given a task +description at test time. In this paper, we present IMProv - a generative model +that is able to in-context learn visual tasks from multimodal prompts. Given a +textual description of a visual task (e.g. "Left: input image, Right: +foreground segmentation"), a few input-output visual examples, or both, the +model in-context learns to solve it for a new test input. We train a masked +generative transformer on a new dataset of figures from computer vision papers +and their associated captions, together with a captioned large-scale image-text +dataset. During inference time, we prompt the model with text and/or image task +example(s) and have the model inpaint the corresponding output. We show that +training our model with text conditioning and scaling the dataset size improves +in-context learning for computer vision tasks by over +10\% AP for Foreground +Segmentation, over +5\% gains in AP for Single Object Detection, and almost +20\% lower LPIPS in Colorization. Our empirical results suggest that vision and +language prompts are complementary and it is advantageous to use both to +achieve better in-context learning performance. Project page is available at +https://jerryxu.net/IMProv . + +
+
+ comment: Project page: https://jerryxu.net/IMProv +
+
+
+
+
+ + ☆ Localizing and Assessing Node Significance in Default Mode Network using + Sub-Community Detection in Mild Cognitive Impairment + + +
+ Our study aims to utilize fMRI to identify the affected brain regions within +the Default Mode Network (DMN) in subjects with Mild Cognitive Impairment +(MCI), using a novel Node Significance Score (NSS). We construct +subject-specific DMN graphs by employing partial correlation of Regions of +Interest (ROIs) that make-up the DMN. For the DMN graph, ROIs are the nodes and +edges are determined based on partial correlation. Four popular community +detection algorithms (Clique Percolation Method (CPM), Louvain algorithm, +Greedy Modularity and Leading Eigenvectors) are applied to determine the +largest sub-community. NSS ratings are derived for each node, considering (I) +frequency in the largest sub-community within a class across all subjects and +(II) occurrence in the largest sub-community according to all four methods. +After computing the NSS of each ROI in both healthy and MCI subjects, we +quantify the score disparity to identify nodes most impacted by MCI. The +results reveal a disparity exceeding 20% for 10 DMN nodes, maximally for PCC +and Fusiform, showing 45.69% and 43.08% disparity. This aligns with existing +medical literature, additionally providing a quantitative measure that enables +the ordering of the affected ROIs. These findings offer valuable insights and +could lead to treatment strategies aggressively targeting the affected nodes. + +
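+ The abstract does not give the exact weighting of criteria (I) and (II), but the scoring itself is straightforward once the largest sub-community memberships are available. A simplified numpy sketch, assuming a boolean membership array over subjects, the four community detection methods, and the DMN ROIs (the array layout and the equal weighting are assumptions):
+```python
+import numpy as np
+
+def node_significance_scores(membership):
+    """membership[s, m, n] is True when ROI n falls in the largest
+    sub-community found by method m for subject s; the NSS of a node is its
+    average membership over all subjects and all four methods."""
+    return membership.mean(axis=(0, 1))            # one score per ROI
+
+# Toy example: 30 subjects per class, 4 methods, 20 DMN ROIs.
+rng = np.random.default_rng(0)
+healthy = rng.random((30, 4, 20)) > 0.4
+mci = rng.random((30, 4, 20)) > 0.6
+disparity = np.abs(node_significance_scores(healthy)
+                   - node_significance_scores(mci)) * 100   # in percent
+print("ROIs with >20% disparity:", np.where(disparity > 20)[0])
+```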
+
+ comment: 4 pages, 2 figures +
+
+
+
+
+ + ☆ Dynamic Erasing Network Based on Multi-Scale Temporal Features for + Weakly Supervised Video Anomaly Detection + + +
+ The goal of weakly supervised video anomaly detection is to learn a detection +model using only video-level labeled data. However, prior studies typically +divide videos into fixed-length segments without considering the complexity or +duration of anomalies. Moreover, these studies usually just detect the most +abnormal segments, potentially overlooking the completeness of anomalies. To +address these limitations, we propose a Dynamic Erasing Network (DE-Net) for +weakly supervised video anomaly detection, which learns multi-scale temporal +features. Specifically, to handle duration variations of abnormal events, we +first propose a multi-scale temporal modeling module, capable of extracting +features from segments of varying lengths and capturing both local and global +visual information across different temporal scales. Then, we design a dynamic +erasing strategy, which dynamically assesses the completeness of the detected +anomalies and erases prominent abnormal segments in order to encourage the +model to discover gentle abnormal segments in a video. The proposed method +obtains favorable performance compared to several state-of-the-art approaches +on three datasets: XD-Violence, TAD, and UCF-Crime. Code will be made available +at https://github.com/ArielZc/DE-Net. + +
+
+
+
+
+ + ☆ Light Field Imaging in the Restrictive Object Space based on Flexible + Angular Plane + + +
+ In some applications, the object space of a light field imaging system is +restrictive, such as industrial and medical endoscopes. If the traditional +light field imaging system is used in the restrictive object space (ROS) +directly but without any specific considerations, the ROS will lead to severe +microlens image distortions and then affect light field decoding, calibration +and 3D reconstruction. Light field imaging in restrictive object space +(ROS-LF) is complicated but significant. In this paper, we first deduce that +the reason for the microlens image deviation is the position variation of the +angular plane, then we propose the flexible angular plane for ROS-LF, whereas in +the traditional light field the angular plane always coincides with the main +lens plane. Subsequently, we propose the microlens image non-distortion +principle for ROS-LF and introduce the ROS-LF imaging principle. We demonstrate +that the ROS-LF and traditional light field imaging models differ only by an +aperture constant term. Finally, we design a simulated ROS-LF +system and calibrate it to verify the principles proposed in this paper. + 
+
+
+
+
+ + ☆ CZL-CIAE: CLIP-driven Zero-shot Learning for Correcting Inverse Age + Estimation + + +
+ Zero-shot age estimation aims to learn feature information about age from +input images and make inferences about a given person's image or video frame +without specific sample data. The development of zero-shot age estimation can +improve the efficiency and accuracy of various applications (e.g., age +verification and secure access control, etc.), while also promoting research on +multi-modal and zero-shot learning in the social media field. For example, +zero-shot age estimation can be used to create social networks focused on +specific age groups. However, existing methods mainly focus on supervised, +labeled age estimation learning, and the prediction performance of zero-shot +learning remains poor. To tackle the above issues, we propose a novel +CLIP-driven Zero-shot Learning for Correcting Inverse Age Estimation +(CZL-CIAE). Specifically, we first introduce the CLIP model to extract image +features and text semantic information respectively, and map them into a highly +semantically aligned high-dimensional feature space. Next, we design a new +Transformer architecture (i.e., FourierFormer) to achieve channel evolution and +spatial interaction of images, and to fuse image and text semantic information. +Finally, we introduce reversible age estimation, which uses end-to-end error +feedback to reduce the error rate of age predictions. Extensive +experiments on multiple datasets show that CZL-CIAE achieves better age prediction +results. + 
+
+ comment: 14 pages, 14 figures, 3 tables +
+
+
+
+
+ + ☆ A Comprehensive Literature Review on Sweet Orange Leaf Diseases + + +
+ Sweet orange leaf diseases pose a significant threat to agricultural productivity. Leaf +diseases impact fruit quality in the citrus industry. The advent of machine +learning has made it possible to develop automated disease detectors. Early detection and diagnosis +are necessary for leaf management. Sweet orange leaf disease-predicting +automated systems have already been developed using different image-processing +techniques. This comprehensive literature review systematically covers +leaf disease and machine learning methodologies applied to the detection of +damaged leaves via image classification. The benefits and limitations of +different machine learning models are examined, including Vision Transformer (ViT), +Convolutional Neural Network (CNN), CNN with SoftMax and RBF SVM, Hybrid CNN-SVM, HLB-ConvMLP, +EfficientNet-b0, YOLOv5, YOLOv7, and Deep CNN. These machine +learning models were tested on various datasets for disease detection. This +comprehensive review compares the performance of +the models in terms of the accuracy, precision, recall, etc. reported in the +existing studies. + 
+
+ comment: 16 pages +
+
+
+
+
+ + ☆ Long-Tail Learning with Rebalanced Contrastive Loss + + +
+ Integrating a supervised contrastive loss into cross entropy-based classification +has recently been proposed as a solution to address the long-tail learning +problem. However, when the class imbalance ratio is high, it requires adjusting +the supervised contrastive loss to support the tail classes, as +conventional contrastive learning is biased towards head classes by default. To +this end, we present Rebalanced Contrastive Learning (RCL), an efficient means +to increase long-tail classification accuracy by addressing three main +aspects: 1. Feature space balancedness - Equal division of the feature space +among all the classes, 2. Intra-Class compactness - Reducing the distance +between same-class embeddings, 3. Regularization - Enforcing larger margins for +tail classes to reduce overfitting. RCL applies class frequency-based SoftMax +loss balancing to the supervised contrastive learning loss and exploits scalar +multiplied features fed to the contrastive learning loss to enforce +compactness. We implement RCL on the Balanced Contrastive Learning (BCL) +Framework, which has the SOTA performance. Our experiments on three benchmark +datasets demonstrate the richness of the learnt embeddings and the increased top-1 +balanced accuracy RCL provides to the BCL framework. We further demonstrate +that the performance of RCL as a standalone loss also achieves state-of-the-art +level accuracy. + 
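+ The class frequency-based SoftMax balancing mentioned above can be illustrated with the standard logit-adjustment trick, which shifts the logits by the log class prior so that tail classes are not drowned out; the full RCL additionally rescales the embeddings fed to the contrastive loss, which is not shown here. A rough PyTorch sketch (names are illustrative):
+```python
+import torch
+import torch.nn.functional as F
+
+def frequency_balanced_ce(logits, targets, class_counts):
+    """Cross entropy with class-frequency-based logit adjustment: adding the
+    log prior to the logits shifts the decision boundary toward tail classes."""
+    prior = class_counts.float() / class_counts.sum()
+    adjusted = logits + prior.log().unsqueeze(0)     # broadcast over the batch
+    return F.cross_entropy(adjusted, targets)
+
+# Toy long-tailed setup: 3 classes with 1000 / 100 / 10 training samples.
+counts = torch.tensor([1000, 100, 10])
+logits = torch.randn(8, 3)
+targets = torch.randint(0, 3, (8,))
+print(frequency_balanced_ce(logits, targets, counts))
+```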
+
+
+
+
+ + ☆ Open-DDVM: A Reproduction and Extension of Diffusion Model for Optical + Flow Estimation + + +
+ Recently, Google proposed DDVM, which for the first time demonstrated that a +general diffusion model for image-to-image translation works impressively +well on the optical flow estimation task without any task-specific designs like RAFT. +However, DDVM is still a closed-source model with the expensive and private +Palette-style pretraining. In this technical report, we present the first +open-source DDVM by reproducing it. We study several design choices and identify +the important ones. By training on 40k public samples with 4 GPUs, our +reproduction achieves comparable performance to the closed-source DDVM. The +code and model have been released at +https://github.com/DQiaole/FlowDiffusion_pytorch. + 
+
+ comment: Technical Report +
+
+
+
+
+ + ☆ Cross-Modal Adaptive Dual Association for Text-to-Image Person Retrieval + + +
+ Text-to-image person re-identification (ReID) aims to retrieve images of a +person based on a given textual description. The key challenge is to learn the +relations between detailed information from visual and textual modalities. +Existing works focus on learning a latent space to narrow the modality gap and +further build local correspondences between two modalities. However, these +methods assume that image-to-text and text-to-image associations are +modality-agnostic, resulting in suboptimal associations. In this work, we show +the discrepancy between image-to-text association and text-to-image association +and propose CADA: Cross-Modal Adaptive Dual Association that finely builds +bidirectional image-text detailed associations. Our approach features a +decoder-based adaptive dual association module that enables full interaction +between visual and textual modalities, allowing for bidirectional and adaptive +cross-modal correspondence associations. Specifically, the paper proposes a +bidirectional association mechanism: Association of text Tokens to image +Patches (ATP) and Association of image Regions to text Attributes (ARA). We +adaptively model the ATP based on the fact that aggregating cross-modal +features based on mistaken associations will lead to feature distortion. For +modeling the ARA, since the attributes are typically the first distinguishing +cues of a person, we propose to explore the attribute-level association by +predicting the masked text phrase using the related image region. Finally, we +learn the dual associations between texts and images, and the experimental +results demonstrate the superiority of our dual formulation. Codes will be made +publicly available. + +
+
+
+
+
+ + ☆ Fully Spiking Denoising Diffusion Implicit Models + + +
+ Spiking neural networks (SNNs) have garnered considerable attention owing to +their ability to run on neuromorphic devices with super-high speeds and +remarkable energy efficiencies. SNNs can be used in conventional neural +network-based time- and energy-consuming applications. However, research on +generative models within SNNs remains limited, despite their advantages. In +particular, diffusion models are a powerful class of generative models, whose +image generation quality surpasses that of other generative models, such as +GANs. However, diffusion models are characterized by high computational costs +and long inference times owing to their iterative denoising feature. Therefore, +we propose a novel approach, the fully spiking denoising diffusion implicit model +(FSDDIM), to construct a diffusion model within SNNs and leverage the high speed +and low energy consumption features of SNNs via synaptic current learning +(SCL). SCL bridges the gap between diffusion models, which use a neural network to +estimate real-valued parameters of a predefined probability distribution, +and SNNs, which output binary spike trains. The SCL enables us to complete the +entire generative process of diffusion models exclusively using SNNs. We +demonstrate that the proposed method outperforms the state-of-the-art fully +spiking generative model. + 
+
+
+
+
+ + ☆ SRSNetwork: Siamese Reconstruction-Segmentation Networks based on + Dynamic-Parameter Convolution + + +
+ In this paper, we present a high-performance deep neural network for weak +target image segmentation, including medical image segmentation and infrared +image segmentation. To this end, this work analyzes the existing dynamic +convolutions and proposes dynamic parameter convolution (DPConv). Furthermore, +it reevaluates the relationship between reconstruction tasks and segmentation +tasks from the perspective of DPConv, leading to the proposal of a dual-network +model called the Siamese Reconstruction-Segmentation Network (SRSNet). The +proposed model is not only a universal network but also enhances the +segmentation performance without altering its structure, leveraging the +reconstruction task. Additionally, as the amount of training data for the +reconstruction network increases, the performance of the segmentation network +also improves synchronously. On seven datasets including five medical datasets +and two infrared image datasets, our SRSNet consistently achieves the best +segmentation results. The code is released at https://github.com/fidshu/SRSNet. + +
+
+ comment: 14 pages +
+
+
+
+
+ + ☆ MobileUtr: Revisiting the relationship between light-weight CNN and + Transformer for efficient medical image segmentation + + +
+ Due to the scarcity and specific imaging characteristics in medical images, +light-weighting Vision Transformers (ViTs) for efficient medical image +segmentation is a significant challenge, and current studies have not yet paid +attention to this issue. This work revisits the relationship between CNNs and +Transformers in lightweight universal networks for medical image segmentation, +aiming to integrate the advantages of both worlds at the infrastructure design +level. In order to leverage the inductive bias inherent in CNNs, we abstract a +Transformer-like lightweight CNNs block (ConvUtr) as the patch embeddings of +ViTs, feeding Transformer with denoised, non-redundant and highly condensed +semantic information. Moreover, an adaptive Local-Global-Local (LGL) block is +introduced to facilitate efficient local-to-global information flow exchange, +maximizing Transformer's global context information extraction capabilities. +Finally, we build an efficient medical image segmentation model (MobileUtr) +based on CNN and Transformer. Extensive experiments on five public medical +image datasets with three different modalities demonstrate the superiority of +MobileUtr over the state-of-the-art methods, while boasting lighter weights and +lower computational cost. Code is available at +https://github.com/FengheTan9/MobileUtr. + +
+
+ comment: 13 pages +
+
+
+
+
+ + ☆ Effective Adapter for Face Recognition in the Wild + + +
+ In this paper, we tackle the challenge of face recognition in the wild, where +images often suffer from low quality and real-world distortions. Traditional +heuristic approaches-either training models directly on these degraded images +or on their enhanced counterparts using face restoration techniques-have proven +ineffective, primarily due to the degradation of facial features and the +discrepancy in image domains. To overcome these issues, we propose an effective +adapter for augmenting existing face recognition models trained on high-quality +facial datasets. The key to our adapter is to process both the unrefined and +the enhanced images with two similar structures, where one is fixed and the other +trainable. This design confers two benefits. First, the dual-input system +minimizes the domain gap while providing varied perspectives for the face +recognition model, where the enhanced image can be regarded as a complex +non-linear transformation of the original one by the restoration model. Second, +both structures can be initialized from the pre-trained models +without discarding past knowledge. Extensive experiments in zero-shot +settings show the effectiveness of our method, which surpasses baselines by about +3%, 4%, and 7% on three datasets. Our code will be publicly available at +https://github.com/liuyunhaozz/FaceAdapter/. + 
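+ The dual-branch idea is simple to prototype: one copy of a pre-trained face encoder stays frozen, a second copy is fine-tuned, and their embeddings are fused. Which branch sees the unrefined image and how the two embeddings are combined are not specified above, so both are assumptions in the PyTorch sketch below:
+```python
+import copy
+import torch
+import torch.nn as nn
+
+class DualBranchAdapter(nn.Module):
+    """Two similar structures: a frozen copy of the pre-trained encoder for
+    the restoration-enhanced image and a trainable copy for the raw image,
+    with a linear layer fusing the two embeddings (fusion choice assumed)."""
+
+    def __init__(self, pretrained_encoder: nn.Module, dim: int = 512):
+        super().__init__()
+        self.trainable = copy.deepcopy(pretrained_encoder)   # tunable branch
+        self.frozen = pretrained_encoder                      # fixed branch
+        for p in self.frozen.parameters():
+            p.requires_grad = False
+        self.fuse = nn.Linear(2 * dim, dim)
+
+    def forward(self, raw_img, enhanced_img):
+        with torch.no_grad():
+            f_enh = self.frozen(enhanced_img)
+        f_raw = self.trainable(raw_img)
+        return self.fuse(torch.cat([f_raw, f_enh], dim=-1))
+
+# Toy usage with a stand-in encoder (a real face recognition model would go here).
+encoder = nn.Sequential(nn.Flatten(), nn.Linear(3 * 112 * 112, 512))
+adapter = DualBranchAdapter(encoder)
+emb = adapter(torch.randn(2, 3, 112, 112), torch.randn(2, 3, 112, 112))
+print(emb.shape)   # torch.Size([2, 512])
+```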
+
+
+
+
+ + ☆ Likelihood-Aware Semantic Alignment for Full-Spectrum + Out-of-Distribution Detection + + +
+ Full-spectrum out-of-distribution (F-OOD) detection aims to accurately +recognize in-distribution (ID) samples while encountering semantic and +covariate shifts simultaneously. However, existing out-of-distribution (OOD) +detectors tend to overfit the covariance information and ignore intrinsic +semantic correlation, making them inadequate for adapting to complex domain +transformations. To address this issue, we propose a Likelihood-Aware Semantic +Alignment (LSA) framework to promote the image-text correspondence into +semantically high-likelihood regions. LSA consists of an offline Gaussian +sampling strategy which efficiently samples semantic-relevant visual embeddings +from the class-conditional Gaussian distribution, and a bidirectional prompt +customization mechanism that adjusts both ID-related and negative context for +a discriminative ID/OOD boundary. Extensive experiments demonstrate the +remarkable OOD detection performance of our proposed LSA especially on the +intractable Near-OOD setting, surpassing existing methods by a margin of +$15.26\%$ and $18.88\%$ on two F-OOD benchmarks, respectively. + 
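+ The offline Gaussian sampling step can be pictured as fitting a class-conditional Gaussian to the ID visual embeddings and drawing synthetic, semantically relevant samples from it. A simplified numpy sketch using a shared diagonal covariance (the diagonal simplification and all names are assumptions):
+```python
+import numpy as np
+
+def sample_class_conditional(features, labels, n_per_class=8,
+                             rng=np.random.default_rng(0)):
+    """Fit one Gaussian per class on ID embeddings (shared diagonal
+    covariance for simplicity) and sample synthetic embeddings per class."""
+    var = features.var(axis=0) + 1e-6                 # shared diagonal covariance
+    samples, sample_labels = [], []
+    for c in np.unique(labels):
+        mu = features[labels == c].mean(axis=0)
+        samples.append(rng.normal(mu, np.sqrt(var),
+                                  size=(n_per_class, features.shape[1])))
+        sample_labels.append(np.full(n_per_class, c))
+    return np.concatenate(samples), np.concatenate(sample_labels)
+
+# Toy usage: 512-d CLIP-like embeddings for 3 ID classes.
+feats = np.random.randn(300, 512)
+labels = np.repeat(np.arange(3), 100)
+x, y = sample_class_conditional(feats, labels)
+print(x.shape, y.shape)   # (24, 512) (24,)
+```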
+
+ comment: 16 pages, 7 figures +
+
+
+
+
+ + ☆ Simultaneous Alignment and Surface Regression Using Hybrid 2D-3D + Networks for 3D Coherent Layer Segmentation of Retinal OCT Images with Full + and Sparse Annotations + + +
+ Layer segmentation is important to quantitative analysis of retinal optical +coherence tomography (OCT). Recently, deep learning based methods have been +developed to automate this task and yield remarkable performance. However, due +to the large spatial gap and potential mismatch between the B-scans of an OCT +volume, all of them were based on 2D segmentation of individual B-scans, which +may lose the continuity and diagnostic information of the retinal layers in 3D +space. Besides, most of these methods required dense annotation of the OCT +volumes, which is labor-intensive and expertise-demanding. This work presents a +novel framework based on hybrid 2D-3D convolutional neural networks (CNNs) to +obtain continuous 3D retinal layer surfaces from OCT volumes, which works well +with both full and sparse annotations. The 2D features of individual B-scans +are extracted by an encoder consisting of 2D convolutions. These 2D features +are then used to produce the alignment displacement vectors and layer +segmentation by two 3D decoders coupled via a spatial transformer module. Two +losses are proposed to utilize the retinal layers' natural property of being +smooth for B-scan alignment and layer segmentation, respectively, and are the +key to the semi-supervised learning with sparse annotation. The entire +framework is trained end-to-end. To the best of our knowledge, this is the +first work that attempts 3D retinal layer segmentation in volumetric OCT images +based on CNNs. Experiments on a synthetic dataset and three public clinical +datasets show that our framework can effectively align the B-scans for +potential motion correction, and achieves superior performance to +state-of-the-art 2D deep learning methods in terms of both layer segmentation +accuracy and cross-B-scan 3D continuity in both fully and semi-supervised +settings, thus offering more clinical values than previous works. + +
+
+ comment: Accepted by MIA +
+
+
+
+
+ + ☆ StableVITON: Learning Semantic Correspondence with Latent Diffusion + Model for Virtual Try-On + + +
+ Given a clothing image and a person image, an image-based virtual try-on aims +to generate a customized image that appears natural and accurately reflects the +characteristics of the clothing image. In this work, we aim to expand the +applicability of the pre-trained diffusion model so that it can be utilized +independently for the virtual try-on task. The main challenge is to preserve the +clothing details while effectively utilizing the robust generative capability +of the pre-trained model. In order to tackle these issues, we propose +StableVITON, learning the semantic correspondence between the clothing and the +human body within the latent space of the pre-trained diffusion model in an +end-to-end manner. Our proposed zero cross-attention blocks not only preserve +the clothing details by learning the semantic correspondence but also generate +high-fidelity images by utilizing the inherent knowledge of the pre-trained +model in the warping process. Through our proposed attention total +variation loss and data augmentation, we achieve sharp attention maps, +resulting in a more precise representation of clothing details. StableVITON +outperforms the baselines in qualitative and quantitative evaluation, showing +promising quality on arbitrary person images. Our code is available at +https://github.com/rlawjdghek/StableVITON. + 
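+ The "zero cross-attention block" is not described in detail above, but a common way to attach new conditioning to a pre-trained diffusion model without disturbing it at the start of fine-tuning is to zero-initialize the output projection of the added attention layer, so the block behaves as an identity at step zero. A hedged PyTorch sketch of that pattern (the actual StableVITON block may differ):
+```python
+import torch
+import torch.nn as nn
+
+class ZeroInitCrossAttention(nn.Module):
+    """Cross-attention from denoiser features (query) to clothing features
+    (key/value); the output projection starts at zero so the pre-trained
+    backbone is left untouched at the beginning of training."""
+
+    def __init__(self, dim: int, n_heads: int = 8):
+        super().__init__()
+        self.attn = nn.MultiheadAttention(dim, n_heads, batch_first=True)
+        self.out = nn.Linear(dim, dim)
+        nn.init.zeros_(self.out.weight)    # the "zero" part
+        nn.init.zeros_(self.out.bias)
+
+    def forward(self, human_tokens, cloth_tokens):
+        attended, _ = self.attn(human_tokens, cloth_tokens, cloth_tokens)
+        return human_tokens + self.out(attended)   # residual connection
+
+block = ZeroInitCrossAttention(dim=320)
+h = torch.randn(2, 64, 320)    # latent human-feature tokens
+c = torch.randn(2, 77, 320)    # clothing-feature tokens
+assert torch.allclose(block(h, c), h)   # identity at initialization
+```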
+
+ comment: 17 pages +
+
+
+
+
+ + ☆ Disentangled Interaction Representation for One-Stage Human-Object + Interaction Detection + + +
+ Human-Object Interaction (HOI) detection is a core task for human-centric +image understanding. Recent one-stage methods adopt a transformer decoder to +collect image-wide cues that are useful for interaction prediction; however, +the interaction representations obtained using this method are entangled and +lack interpretability. In contrast, traditional two-stage methods benefit +significantly from their ability to compose interaction features in a +disentangled and explainable manner. In this paper, we improve the performance +of one-stage methods by enabling them to extract disentangled interaction +representations. First, we propose Shunted Cross-Attention (SCA) to extract +human appearance, object appearance, and global context features using +different cross-attention heads. This is achieved by imposing different masks +on the cross-attention maps produced by the different heads. Second, we +introduce the Interaction-aware Pose Estimation (IPE) task to learn +interaction-relevant human pose features using a disentangled decoder. This is +achieved with a novel attention module that accurately captures the human +keypoints relevant to the current interaction category. Finally, our approach +fuses the appearance feature and pose feature via element-wise addition to form +the interaction representation. Experimental results show that our approach can +be readily applied to existing one-stage HOI detectors. Moreover, we achieve +state-of-the-art performance on two benchmarks: HICO-DET and V-COCO. + +
+
+
+
+
+ + ☆ Regressor-Segmenter Mutual Prompt Learning for Crowd Counting + + +
+ Crowd counting has achieved significant progress by training regressors to +predict instance positions. In heavily crowded scenarios, however, regressors +are challenged by uncontrollable annotation variance, which causes density map +bias and context information inaccuracy. In this study, we propose mutual +prompt learning (mPrompt), which leverages a regressor and a segmenter as +guidance for each other, solving bias and inaccuracy caused by annotation +variance while distinguishing foreground from background. Specifically, mPrompt +leverages point annotations to tune the segmenter and predict pseudo head masks +in a point prompt learning manner. It then uses the predicted segmentation +masks, which serve as a spatial constraint, to rectify biased point annotations +as context prompt learning. mPrompt defines a way of mutual information +maximization from prompt learning, mitigating the impact of annotation variance +while improving model accuracy. Experiments show that mPrompt significantly +reduces the Mean Absolute Error (MAE), demonstrating its potential to be a general +framework for downstream vision tasks. + 
+
+ comment: 17 pages, 15 figures +
+
+
+
+
+ + ☆ Mitigating Fine-Grained Hallucination by Fine-Tuning Large + Vision-Language Models with Caption Rewrites + + +
+ Large language models (LLMs) have shown remarkable performance in natural +language processing (NLP) tasks. To comprehend and execute diverse human +instructions over image data, instruction-tuned large vision-language models +(LVLMs) have been introduced. However, LVLMs may suffer from different types of +object hallucinations. Nevertheless, LVLMs are evaluated for coarse-grained +object hallucinations only (i.e., generated objects non-existent in the input +image). The fine-grained object attributes and behaviors non-existent in the +image may still be generated but not measured by the current evaluation +methods. In this paper, we thus focus on reducing fine-grained hallucinations +of LVLMs. We propose \textit{ReCaption}, a framework that consists of two +components: rewriting captions using ChatGPT and fine-tuning the +instruction-tuned LVLMs on the rewritten captions. We also propose a +fine-grained probing-based evaluation method named \textit{Fine-Grained Object +Hallucination Evaluation} (\textit{FGHE}). Our experiment results demonstrate +that ReCaption effectively reduces fine-grained object hallucination for +different LVLM options and improves their text generation quality. The code can +be found at https://github.com/Anonymousanoy/FOHE. + +
+
+ comment: MMM 2024 +
+
+
+
+
+ + ☆ Hulk: A Universal Knowledge Translator for Human-Centric Tasks + + +
+ Human-centric perception tasks, e.g., human mesh recovery, pedestrian +detection, skeleton-based action recognition, and pose estimation, have wide +industrial applications, such as metaverse and sports analysis. There is a +recent surge to develop human-centric foundation models that can benefit a +broad range of human-centric perception tasks. While many human-centric +foundation models have achieved success, most of them only excel in 2D vision +tasks or require extensive fine-tuning for practical deployment in real-world +scenarios. These limitations severely restrict their usability across various +downstream tasks and situations. To tackle these problems, we present Hulk, the +first multimodal human-centric generalist model, capable of addressing most of +the mainstream tasks simultaneously without task-specific finetuning, covering +2D vision, 3D vision, skeleton-based, and vision-language tasks. The key to +achieving this is condensing various task-specific heads into two general +heads, one for discrete representations, e.g., languages, and the other for +continuous representations, e.g., location coordinates. The outputs of two +heads can be further stacked into four distinct input and output modalities. +This uniform representation enables Hulk to treat human-centric tasks as +modality translation, integrating knowledge across a wide range of tasks. To +validate the effectiveness of our proposed method, we conduct comprehensive +experiments on 11 benchmarks across 8 human-centric tasks. Experimental results +surpass previous methods substantially, demonstrating the superiority of our +proposed method. The code will be available on +https://github.com/OpenGVLab/HumanBench. + +
+
+ comment: 24 pages, 5 figures +
+
+
+
+
+ + ☆ BEVNeXt: Reviving Dense BEV Frameworks for 3D Object Detection + + +
+ Recently, the rise of query-based Transformer decoders is reshaping +camera-based 3D object detection. These query-based decoders are surpassing the +traditional dense BEV (Bird's Eye View)-based methods. However, we argue that +dense BEV frameworks remain important due to their outstanding abilities in +depth estimation and object localization, depicting 3D scenes accurately and +comprehensively. This paper aims to address the drawbacks of the existing dense +BEV-based 3D object detectors by introducing our proposed enhanced components, +including a CRF-modulated depth estimation module enforcing object-level +consistencies, a long-term temporal aggregation module with extended receptive +fields, and a two-stage object decoder combining perspective techniques with +CRF-modulated depth embedding. These enhancements lead to a "modernized" dense +BEV framework dubbed BEVNeXt. On the nuScenes benchmark, BEVNeXt outperforms +both BEV-based and query-based frameworks under various settings, achieving a +state-of-the-art result of 64.2 NDS on the nuScenes test set. + +
+
+
+
+
+ + ☆ Fast and accurate sparse-view CBCT reconstruction using meta-learned + neural attenuation field and hash-encoding regularization + + +
+ Cone beam computed tomography (CBCT) is an emerging medical imaging technique +to visualize the internal anatomical structures of patients. During a CBCT +scan, several projection images of different angles or views are collectively +utilized to reconstruct a tomographic image. However, reducing the number of +projections in a CBCT scan while preserving the quality of a reconstructed +image is challenging due to the nature of an ill-posed inverse problem. +Recently, a neural attenuation field (NAF) method was proposed by adopting a +neural radiance field algorithm as a new way for CBCT reconstruction, +demonstrating fast and promising results using only 50 views. However, +decreasing the number of projections is still preferable to reduce potential +radiation exposure, and a faster reconstruction time is required considering a +typical scan time. In this work, we propose a fast and accurate sparse-view +CBCT reconstruction (FACT) method to provide better reconstruction quality and +faster optimization speed with a minimal number of view acquisitions ($<$ 50 +views). In the FACT method, we meta-trained a neural network and a hash-encoder +using a few scans (= 15), and a new regularization technique is utilized to +reconstruct the details of an anatomical structure. In conclusion, we show +that the FACT method produces better and faster reconstruction results +than other conventional algorithms on CBCT scans of different body +parts (chest, head, and abdomen) and CT vendors (Siemens, Philips, and GE). + 
+
+
+
+
+ + ☆ ResEnsemble-DDPM: Residual Denoising Diffusion Probabilistic Models for + Ensemble Learning + + +
+ Nowadays, denoising diffusion probabilistic models have been adapted for many +image segmentation tasks. However, existing end-to-end models have already +demonstrated remarkable capabilities. Rather than using denoising diffusion +probabilistic models alone, integrating the abilities of both denoising +diffusion probabilistic models and existing end-to-end models can better +improve the performance of image segmentation. Based on this, we implicitly +introduce a residual term into the diffusion process and propose +ResEnsemble-DDPM, which seamlessly integrates the diffusion model and the +end-to-end model through ensemble learning. The output distributions of these +two models are strictly symmetric with respect to the ground truth +distribution, allowing us to integrate the two models by reducing the residual +term. Experimental results demonstrate that our ResEnsemble-DDPM can further +improve the capabilities of existing models. Furthermore, its ensemble learning +strategy can be generalized to other downstream tasks in image generation, +where it remains strongly competitive. + 
+
+
+
+
+ + ☆ Adversarial Medical Image with Hierarchical Feature Hiding + + +
+ Deep learning based methods for medical images can be easily compromised by +adversarial examples (AEs), posing a serious security risk for clinical +decision-making. It has been discovered that conventional adversarial attacks +like PGD, which optimize the classification logits, are easy to distinguish in +the feature space, resulting in accurate reactive defenses. To better +understand this phenomenon and reassess the reliability of the reactive +defenses for medical AEs, we thoroughly investigate the characteristics of +conventional medical AEs. Specifically, we first theoretically prove that +conventional adversarial attacks change the outputs by continuously optimizing +vulnerable features in a fixed direction, thereby leading to outlier +representations in the feature space. Then, a stress test is conducted to +reveal the vulnerability of medical images, by comparing with natural images. +Interestingly, this vulnerability is a double-edged sword, which can be +exploited to hide AEs. We then propose a simple-yet-effective hierarchical +feature constraint (HFC), a novel add-on to conventional white-box attacks, +which helps hide the adversarial feature within the target feature +distribution. The proposed method is evaluated on three medical datasets, both +2D and 3D, with different modalities. The experimental results demonstrate the +superiority of HFC, \emph{i.e.,} it bypasses an array of state-of-the-art +adversarial medical AE detectors more efficiently than competing adaptive +attacks, which reveals the deficiencies of medical reactive defenses and enables +the development of more robust defenses in the future. + 
+
+ comment: Our code is available at + \url{https://github.com/qsyao/Hierarchical_Feature_Constraint} +
+
+
+
+
+ + ☆ Multi-task Image Restoration Guided By Robust DINO Features + + +
+ Multi-task image restoration has gained significant interest due to its +inherent versatility and efficiency compared to its single-task counterpart. +Despite its potential, performance degradation is observed with an increase in +the number of tasks, primarily attributed to the distinct nature of each +restoration task. Addressing this challenge, we introduce +\mbox{\textbf{DINO-IR}}, a novel multi-task image restoration approach +leveraging robust features extracted from DINOv2. Our empirical analysis shows +that while shallow features of DINOv2 capture rich low-level image +characteristics, the deep features ensure a robust semantic representation +insensitive to degradations while preserving high-frequency contour details. +Building on these features, we devise specialized components, including +multi-layer semantic fusion module, DINO-Restore adaption and fusion module, +and DINO perception contrastive loss, to integrate DINOv2 features into the +restoration paradigm. Equipped with the aforementioned components, our DINO-IR +performs favorably against existing multi-task image restoration approaches in +various tasks by a large margin, indicating the superiority and necessity of +reinforcing the robust features for multi-task image restoration. + +
+
+
+
+
+ + ☆ Multimodality-guided Image Style Transfer using Cross-modal GAN + Inversion WACV 2024 + + +
+ Image Style Transfer (IST) is an interdisciplinary topic of computer vision +and art that continuously attracts researchers' interests. Different from +traditional Image-guided Image Style Transfer (IIST) methods that require a +style reference image as input to define the desired style, recent works start +to tackle the problem in a text-guided manner, i.e., Text-guided Image Style +Transfer (TIST). Compared to IIST, such approaches provide more flexibility +with text-specified styles, which are useful in scenarios where the style is +hard to define with reference images. Unfortunately, many TIST approaches +produce undesirable artifacts in the transferred images. To address this issue, +we present a novel method to achieve much improved style transfer based on text +guidance. Meanwhile, to offer more flexibility than IIST and TIST, our method +allows style inputs from multiple sources and modalities, enabling +MultiModality-guided Image Style Transfer (MMIST). Specifically, we realize +MMIST with a novel cross-modal GAN inversion method, which generates style +representations consistent with specified styles. Such style representations +facilitate style transfer and in principle generalize any IIST methods to +MMIST. Large-scale experiments and user studies demonstrate that our method +achieves state-of-the-art performance on TIST task. Furthermore, comprehensive +qualitative results confirm the effectiveness of our method on MMIST task and +cross-modal style interpolation. + +
+
+ comment: WACV 2024. Project website: https://hywang66.github.io/mmist/ +
+
+
+
+
+ + ☆ Customize your NeRF: Adaptive Source Driven 3D Scene Editing via + Local-Global Iterative Training + + +
+ In this paper, we target the adaptive source driven 3D scene editing task by +proposing a CustomNeRF model that unifies a text description or a reference +image as the editing prompt. However, obtaining desired editing results +conformed with the editing prompt is nontrivial since there exist two +significant challenges, including accurate editing of only foreground regions +and multi-view consistency given a single-view reference image. To tackle the +first challenge, we propose a Local-Global Iterative Editing (LGIE) training +scheme that alternates between foreground region editing and full-image +editing, aimed at foreground-only manipulation while preserving the background. +For the second challenge, we also design a class-guided regularization that +exploits class priors within the generation model to alleviate the +inconsistency problem among different views in image-driven editing. Extensive +experiments show that our CustomNeRF produces precise editing results under +various real scenes for both text- and image-driven settings. + +
+
+ comment: 14 pages, 13 figures, project website: https://customnerf.github.io/ +
+
+
+
+
+ + ☆ RiskBench: A Scenario-based Benchmark for Risk Identification + + +
+ Intelligent driving systems aim to achieve a zero-collision mobility +experience, requiring interdisciplinary efforts to enhance safety performance. +This work focuses on risk identification, the process of identifying and +analyzing risks stemming from dynamic traffic participants and unexpected +events. While significant advances have been made in the community, the current +evaluation of different risk identification algorithms uses independent +datasets, leading to difficulty in direct comparison and hindering collective +progress toward safety performance enhancement. To address this limitation, we +introduce \textbf{RiskBench}, a large-scale scenario-based benchmark for risk +identification. We design a scenario taxonomy and augmentation pipeline to +enable a systematic collection of ground truth risks under diverse scenarios. +We assess the ability of ten algorithms to (1) detect and locate risks, (2) +anticipate risks, and (3) facilitate decision-making. We conduct extensive +experiments and summarize future research directions on risk identification. Our aim is to +encourage collaborative endeavors in achieving a society with zero collisions. +We have made our dataset and benchmark toolkit publicly available on the project page: +https://hcis-lab.github.io/RiskBench/ + 
+
+
+
+
+ + ☆ The Contemporary Art of Image Search: Iterative User Intent Expansion + via Vision-Language Model SC + + +
+ Image search is an essential and user-friendly method to explore vast +galleries of digital images. However, existing image search methods heavily +rely on proximity measurements like tag matching or image similarity, requiring +precise user inputs for satisfactory results. To meet the growing demand for a +contemporary image search engine that enables accurate comprehension of users' +search intentions, we introduce an innovative user intent expansion framework. +Our framework leverages visual-language models to parse and compose multi-modal +user inputs to provide more accurate and satisfying results. It comprises a +two-stage process: 1) a parsing stage that incorporates a language parsing +module with large language models to enhance the comprehension of textual +inputs, along with a visual parsing module that integrates an interactive +segmentation module to swiftly identify detailed visual elements within images; +and 2) a logic composition stage that combines multiple user search intents +into a unified logic expression for more sophisticated operations in complex +searching scenarios. Moreover, the intent expansion framework enables users to +perform flexible contextualized interactions with the search results to further +specify or adjust their detailed search intents iteratively. We implemented the +framework into an image search system for NFT (non-fungible token) search and +conducted a user study to evaluate its usability and novel properties. The +results indicate that the proposed framework significantly improves users' +image search experience. In particular, the parsing and contextualized +interactions prove useful in allowing users to express their search intents +more accurately and engage in a more enjoyable iterative search experience. + 
+
+ comment: Accepted by The 2024 ACM SIGCHI Conference on Computer-Supported + Cooperative Work & Social Computing (CSCW) (Proc. CSCW 2024) +
+
+
+
+
+ + ☆ Adaptive Confidence Threshold for ByteTrack in Multi-Object Tracking + + +
+ We investigate the application of ByteTrack in the realm of multiple object +tracking. ByteTrack, a simple tracking algorithm, enables the simultaneous +tracking of multiple objects by strategically incorporating detections with a +low confidence threshold. Conventionally, objects are initially associated with +high-confidence detections. When the association between objects and +detections becomes ambiguous, ByteTrack extends the association to lower +confidence detections. One notable drawback of the existing ByteTrack +approach is its reliance on a fixed threshold to differentiate between high and +low-confidence detections. In response to this limitation, we introduce a novel +and adaptive approach. Our proposed method entails a dynamic adjustment of the +confidence threshold, leveraging insights derived from overall detections. +Through experimentation, we demonstrate the effectiveness of our adaptive +confidence threshold technique while maintaining a running time comparable to +ByteTrack. + 
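+ The exact adaptation rule is not given above, but the idea is easy to prototype: derive the high-confidence cut-off per frame from the score distribution of all detections instead of fixing it globally. A hedged numpy sketch using the median as a stand-in rule (the thresholds and the rule itself are assumptions, not the paper's method):
+```python
+import numpy as np
+
+def split_detections(scores, base_high=0.6, base_low=0.1):
+    """Two-stage ByteTrack-style split with an adaptive high threshold:
+    the cut-off is the median score of detections above the low threshold,
+    falling back to a fixed value when the frame has no valid detections."""
+    valid = scores[scores > base_low]
+    high_thr = float(np.median(valid)) if valid.size else base_high
+    high = scores >= high_thr
+    low = (scores >= base_low) & ~high
+    return high, low, high_thr
+
+# Toy frame: a crowded scene where most detections have mediocre confidence.
+scores = np.array([0.92, 0.88, 0.55, 0.48, 0.43, 0.35, 0.22, 0.08])
+high, low, thr = split_detections(scores)
+print(thr, scores[high], scores[low])
+```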
+
+
+
+
+ + ☆ TMSR: Tiny Multi-path CNNs for Super Resolution + + +
+ In this paper, we propose a tiny multi-path CNN-based Super-Resolution (SR) +method, called TMSR. We mainly compare against tiny CNN-based SR methods with under +5k parameters. The main contributions of the proposed method are the improved +multi-path learning and a self-defined activation function. The experimental +results show that TMSR obtains competitive image quality (i.e., PSNR and SSIM) +compared to related works with under 5k parameters. + 
+
+ comment: 5 pages, 7 figures, published in the IEEE Eurasia Conference on IoT, + Communication and Engineering proceedings 2023 +
+
+
+
+
+ + ☆ SequencePAR: Understanding Pedestrian Attributes via A Sequence + Generation Paradigm + + +
+ Current pedestrian attribute recognition (PAR) algorithms are developed based +on multi-label or multi-task learning frameworks, which aim to discriminate the +attributes using specific classification heads. However, these discriminative +models are easily influenced by imbalanced data or noisy samples. Inspired by +the success of generative models, we rethink the pedestrian attribute +recognition scheme and believe the generative models may perform better on +modeling dependencies and complexity between human attributes. In this paper, +we propose a novel sequence generation paradigm for pedestrian attribute +recognition, termed SequencePAR. It extracts the pedestrian features using a +pre-trained CLIP model and embeds the attribute set into query tokens under the +guidance of text prompts. Then, a Transformer decoder is proposed to generate +the human attributes by incorporating the visual features and attribute query +tokens. The masked multi-head attention layer is introduced into the decoder +module to prevent the model from remembering the next attribute while making +attribute predictions during training. Extensive experiments on multiple widely +used pedestrian attribute recognition datasets fully validated the +effectiveness of our proposed SequencePAR. The source code and pre-trained +models will be released at https://github.com/Event-AHU/OpenPAR. + +
+
+ comment: In Peer Review +
+
+
+
+
+ + ☆ J-Net: Improved U-Net for Terahertz Image Super-Resolution + + +
+ Terahertz (THz) waves are electromagnetic waves in the 0.1 to 10 THz +frequency range, and THz imaging is utilized in a range of applications, +including security inspections, biomedical fields, and the non-destructive +examination of materials. However, THz images have low resolution due to the +long wavelength of THz waves. Therefore, improving the resolution of THz images +is one of the current hot research topics. We propose a novel network +architecture called J-Net, an improved version of U-Net, to solve THz +image super-resolution. It employs simple baseline blocks which can extract +low-resolution (LR) image features and efficiently learn the mapping of LR images to +high-resolution (HR) images. All training was conducted using the +DIV2K+Flickr2K dataset, and we employed the peak signal-to-noise ratio (PSNR) +for quantitative comparison. In our comparisons with other THz image +super-resolution methods, J-Net achieved a PSNR of 32.52 dB, surpassing other +techniques by more than 1 dB. J-Net also demonstrates superior performance on +real THz images compared to other methods. Experiments show that the proposed +J-Net achieves better PSNR and visual improvement compared with other THz image +super-resolution methods. + 
+
+
+
+
+ + ☆ GaussianHead: Impressive 3D Gaussian-based Head Avatars with Dynamic + Hybrid Neural Field + + +
+ Previous head avatar methods have mostly relied on fixed explicit primitives +(mesh, point) or implicit surfaces (Signed Distance Function) and volumetric +neural radiance fields, making it challenging to strike a balance among high fidelity, +training speed, and resource consumption. The recent popularity of hybrid fields +has brought novel representations, but these are limited by relying on parameterization +factors obtained through fixed mappings. We propose GaussianHead: a head +avatar algorithm based on anisotropic 3D gaussian primitives. We leverage +canonical gaussians to represent dynamic scenes. Using an explicit "dynamic" +tri-plane as an efficient container for parameterized head geometry, well aligned +with factors in the underlying geometry and tri-plane, we obtain aligned +canonical factors for the canonical gaussians. With a tiny MLP, factors are +decoded into opacity and spherical harmonic coefficients of the 3D gaussian +primitives. Finally, we use an efficient differentiable gaussian rasterizer for +rendering. Our approach benefits significantly from our novel representation +based on 3D gaussians, and the proper alignment transformation of underlying +geometry structures and factors in the tri-plane eliminates biases introduced by +fixed mappings. Compared to state-of-the-art techniques, we achieve optimal +visual results in tasks such as self-reconstruction, novel view synthesis, and +cross-identity reenactment while maintaining high rendering efficiency (0.12s +per frame). Even the pores around the nose are clearly visible in some cases. +Code and additional video can be found on the project homepage. + 
+
+
+
+
+ + ☆ CLAMP: Contrastive LAnguage Model Prompt-tuning + + +
+ Large language models (LLMs) have emerged as powerful general-purpose +interfaces for many machine learning problems. Recent work has adapted LLMs to +generative visual tasks like image captioning, visual question answering, and +visual chat, using a relatively small amount of instruction-tuning data. In +this paper, we explore whether modern LLMs can also be adapted to classifying +an image into a set of categories. First, we evaluate multimodal LLMs that are +tuned for generative tasks on zero-shot image classification and find that +their performance is far below that of specialized models like CLIP. We then +propose an approach for light fine-tuning of LLMs using the same contrastive +image-caption matching objective as CLIP. Our results show that LLMs can, +indeed, achieve good image classification performance when adapted this way. +Our approach beats state-of-the-art mLLMs by 13% and slightly outperforms +contrastive learning with a custom text model, while also retaining the LLM's +generative abilities. LLM initialization appears to particularly help +classification in domains under-represented in the visual pre-training data. + +
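The "contrastive image-caption matching objective as CLIP" refers to the symmetric InfoNCE loss over paired image and text embeddings. A minimal PyTorch sketch of that objective follows; batch size, embedding dimension, and temperature are illustrative, not the paper's settings.

```python
import torch
import torch.nn.functional as F

def clip_style_contrastive_loss(image_emb, text_emb, temperature=0.07):
    """Symmetric InfoNCE over a batch of paired image/text embeddings."""
    image_emb = F.normalize(image_emb, dim=-1)
    text_emb = F.normalize(text_emb, dim=-1)
    logits = image_emb @ text_emb.t() / temperature  # (B, B) similarity matrix
    targets = torch.arange(image_emb.size(0))        # matching pairs lie on the diagonal
    loss_i2t = F.cross_entropy(logits, targets)      # image -> caption direction
    loss_t2i = F.cross_entropy(logits.t(), targets)  # caption -> image direction
    return 0.5 * (loss_i2t + loss_t2i)

# Toy batch of 8 paired embeddings.
img = torch.randn(8, 512)
txt = torch.randn(8, 512)
print(clip_style_contrastive_loss(img, txt))
```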
+
+
+
+
+ + ☆ Universal Segmentation at Arbitrary Granularity with Language + Instruction + + +
+ This paper aims to achieve universal segmentation at arbitrary semantic +levels. Despite significant progress in recent years, specialist segmentation +approaches are limited to specific tasks and data distributions. Retraining a +new model to adapt to new scenarios or settings incurs expensive +computation and time costs, which raises the demand for a versatile and universal +segmentation model that can cater to various granularities. Although some +attempts have been made to unify different segmentation tasks or +generalize to various scenarios, limitations in the definition of paradigms +and input-output spaces make it difficult for them to achieve an accurate +understanding of content at arbitrary granularity. To this end, we present +UniLSeg, a universal segmentation model that can perform segmentation at any +semantic level with the guidance of language instructions. For training +UniLSeg, we reorganize a group of tasks from their original diverse distributions +into a unified data format, where images paired with texts describing the segmentation +targets serve as input and the corresponding masks are the output. Combined with an automatic +annotation engine for exploiting numerous unlabeled data, UniLSeg achieves +excellent performance on various tasks and settings, surpassing both specialist +and unified segmentation models. +
+
+
+
+
+ + ☆ SchurVINS: Schur Complement-Based Lightweight Visual Inertial Navigation + System + + +
+ Accuracy and computational efficiency are the most important metrics for a +Visual Inertial Navigation System (VINS). Existing VINS algorithms offer +either high accuracy or low computational complexity, but struggle to provide +high-precision localization on resource-constrained devices. To this end, +we propose a novel filter-based VINS framework named SchurVINS, which +guarantees both high accuracy, by building a complete residual model, and low +computational complexity, via the Schur complement. Technically, we first formulate +the full residual model, in which the Gradient, Hessian, and observation covariance are +explicitly modeled. The Schur complement is then employed to decompose the full +model into an ego-motion residual model and a landmark residual model. Finally, +the Extended Kalman Filter (EKF) update is implemented on these two models with +high efficiency. Experiments on the EuRoC and TUM-VI datasets show that our method +notably outperforms state-of-the-art (SOTA) methods in both accuracy and +computational complexity. We will open-source our experimental code to benefit +the community. +
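The Schur complement step described here is standard block-matrix algebra: the landmark block is marginalized to obtain a reduced ego-motion system. A generic NumPy sketch with toy dimensions, not the SchurVINS code:

```python
import numpy as np

rng = np.random.default_rng(0)

# Block system  [H_cc  H_cl] [dx_c]   [b_c]
#               [H_lc  H_ll] [dx_l] = [b_l]
n_cam, n_lmk = 6, 9
J = rng.standard_normal((30, n_cam + n_lmk))
H = J.T @ J + 1e-3 * np.eye(n_cam + n_lmk)   # well-conditioned toy Hessian
b = rng.standard_normal(n_cam + n_lmk)

H_cc, H_cl = H[:n_cam, :n_cam], H[:n_cam, n_cam:]
H_lc, H_ll = H[n_cam:, :n_cam], H[n_cam:, n_cam:]
b_c, b_l = b[:n_cam], b[n_cam:]

# Schur complement: reduced ego-motion system after marginalizing the landmarks.
H_ll_inv = np.linalg.inv(H_ll)
S = H_cc - H_cl @ H_ll_inv @ H_lc
g = b_c - H_cl @ H_ll_inv @ b_l
dx_c = np.linalg.solve(S, g)
dx_l = H_ll_inv @ (b_l - H_lc @ dx_c)        # back-substitute for the landmark update

# Sanity check against solving the full system directly.
dx_full = np.linalg.solve(H, b)
print(np.allclose(np.concatenate([dx_c, dx_l]), dx_full))  # True
```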
+
+
+
+
+ + ☆ TextAug: Test time Text Augmentation for Multimodal Person + Re-identification + + +
+ Multimodal Person Reidentification is gaining popularity in the research +community due to its effectiveness compared to counter-part unimodal +frameworks. However, the bottleneck for multimodal deep learning is the need +for a large volume of multimodal training examples. Data augmentation +techniques such as cropping, flipping, rotation, etc. are often employed in the +image domain to improve the generalization of deep learning models. Augmenting +in other modalities than images, such as text, is challenging and requires +significant computational resources and external data sources. In this study, +we investigate the effectiveness of two computer vision data augmentation +techniques: cutout and cutmix, for text augmentation in multi-modal person +re-identification. Our approach merges these two augmentation strategies into +one strategy called CutMixOut which involves randomly removing words or +sub-phrases from a sentence (Cutout) and blending parts of two or more +sentences to create diverse examples (CutMix) with a certain probability +assigned to each operation. This augmentation was implemented at inference time +without any prior training. Our results demonstrate that the proposed technique +is simple and effective in improving the performance on multiple multimodal +person re-identification benchmarks. + +
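A minimal sketch of the CutMixOut idea as described above: randomly drop words (Cutout) or splice two sentences together (CutMix) with some probability. Function names and probabilities are illustrative, not the authors' implementation.

```python
import random

def cutout_text(sentence: str, drop_prob: float = 0.15) -> str:
    """Randomly drop words from a sentence (text analogue of Cutout)."""
    words = sentence.split()
    kept = [w for w in words if random.random() > drop_prob]
    return " ".join(kept) if kept else sentence

def cutmix_text(sentence_a: str, sentence_b: str) -> str:
    """Splice the prefix of one sentence onto the suffix of another (text CutMix)."""
    a, b = sentence_a.split(), sentence_b.split()
    cut_a = random.randint(1, max(1, len(a) - 1))
    cut_b = random.randint(0, max(0, len(b) - 1))
    return " ".join(a[:cut_a] + b[cut_b:])

def cutmixout(sentence_a: str, sentence_b: str, p_cutout: float = 0.5) -> str:
    """Apply Cutout or CutMix at inference time with a given probability."""
    if random.random() < p_cutout:
        return cutout_text(sentence_a)
    return cutmix_text(sentence_a, sentence_b)

random.seed(0)
print(cutmixout("a man wearing a red jacket and black trousers",
                "a woman carrying a blue backpack"))
```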
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ Good Questions Help Zero-Shot Image Reasoning + + +
+ Aligning the recent large language models (LLMs) with computer vision models +leads to large vision-language models (LVLMs), which have paved the way for +zero-shot image reasoning tasks. However, LVLMs are usually trained on short +high-level captions only referring to sparse focus regions in images. Such a +``tunnel vision'' limits LVLMs to exploring other relevant contexts in complex +scenes. To address this challenge, we introduce Question-Driven Visual +Exploration (QVix), a novel prompting strategy that enhances the exploratory +capabilities of LVLMs in zero-shot reasoning tasks. QVix leverages LLMs' strong +language prior to generate input-exploratory questions with more details than +the original query, guiding LVLMs to explore visual content more +comprehensively and uncover subtle or peripheral details. QVix enables a wider +exploration of visual scenes, improving the LVLMs' reasoning accuracy and depth +in tasks such as visual question answering and visual entailment. Our +evaluations on various challenging zero-shot vision-language benchmarks, +including ScienceQA and fine-grained visual classification, demonstrate that +QVix significantly outperforms existing methods, highlighting its effectiveness +in bridging the gap between complex visual data and LVLMs' exploratory +abilities. + +
+
+
+
+
+ + ☆ SCLIP: Rethinking Self-Attention for Dense Vision-Language Inference + + +
+ Recent advances in contrastive language-image pretraining (CLIP) have +demonstrated strong capabilities in zero-shot classification by aligning visual +representations with target text embeddings at the image level. However, in +dense prediction tasks, CLIP often struggles to localize visual features within +an image and fails to give accurate pixel-level predictions, which prevents it +from functioning as a generalized visual foundation model. In this work, we aim +to enhance CLIP's potential for semantic segmentation with minimal +modifications to its pretrained models. By rethinking self-attention, we +surprisingly find that CLIP can adapt to dense prediction tasks by simply +introducing a novel Correlative Self-Attention (CSA) mechanism. Specifically, +we replace the traditional self-attention block of the CLIP vision encoder's last +layer with our CSA module and reuse its pretrained projection matrices of query, +key, and value, leading to a training-free adaptation approach for CLIP's +zero-shot semantic segmentation. Extensive experiments show the advantage of +CSA: we obtain a 38.2% average zero-shot mIoU across eight semantic +segmentation benchmarks highlighted in this paper, significantly outperforming +the existing SoTA's 33.9% and the vanilla CLIP's 14.1%. +
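One common reading of "Correlative Self-Attention" is that attention weights come from query-query and key-key correlations (reusing CLIP's pretrained projections) rather than the usual query-key product. The sketch below follows that reading and should not be taken as the authors' exact formulation.

```python
import torch
import torch.nn.functional as F

def correlative_self_attention(x, w_q, w_k, w_v, scale):
    """Sketch of a CSA-style block: attention from q-q and k-k correlations.

    x: (N, D) patch tokens; w_q / w_k / w_v: (D, D) pretrained projections.
    """
    q, k, v = x @ w_q, x @ w_k, x @ w_v
    attn = F.softmax(q @ q.t() * scale, dim=-1) + F.softmax(k @ k.t() * scale, dim=-1)
    return attn @ v

tokens = torch.randn(196, 768)                       # e.g. ViT-B/16 patch tokens
w_q, w_k, w_v = (torch.randn(768, 768) for _ in range(3))
out = correlative_self_attention(tokens, w_q, w_k, w_v, scale=768 ** -0.5)
print(out.shape)  # torch.Size([196, 768])
```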
+
+
+
+
+ + ☆ Signed Binarization: Unlocking Efficiency Through Repetition-Sparsity + Trade-Off + + +
+ Efficient inference of Deep Neural Networks (DNNs) on resource-constrained +edge devices is essential. Quantization and sparsity are key algorithmic +techniques that translate to repetition and sparsity within tensors at the +hardware-software interface. This paper introduces the concept of +repetition-sparsity trade-off that helps explain computational efficiency +during inference. We propose Signed Binarization, a unified co-design framework +that synergistically integrates hardware-software systems, quantization +functions, and representation learning techniques to address this trade-off. +Our results demonstrate that Signed Binarization is more accurate than +binarization with the same number of non-zero weights. Detailed analysis +indicates that signed binarization generates a smaller distribution of +effectual (non-zero) parameters nested within a larger distribution of total +parameters, both of the same type, for a DNN block. Finally, our approach +achieves a 26% speedup on real hardware, doubles energy efficiency, and reduces +density by 2.8x compared to binary methods for ResNet 18, presenting an +alternative solution for deploying efficient models in resource-limited +environments. + +
+
+
+
+
+ + ☆ Learning Efficient Unsupervised Satellite Image-based Building Damage + Detection ICDM 2023 + + +
+ Existing Building Damage Detection (BDD) methods always require +labour-intensive pixel-level annotations of buildings and their conditions, +hence largely limiting their applications. In this paper, we investigate a +challenging yet practical scenario of BDD, Unsupervised Building Damage +Detection (U-BDD), where only unlabelled pre- and post-disaster satellite image +pairs are provided. As a pilot study, we have first proposed an advanced U-BDD +baseline that leverages pre-trained vision-language foundation models (i.e., +Grounding DINO, SAM and CLIP) to address the U-BDD task. However, the apparent +domain gap between satellite and generic images causes low confidence in the +foundation models used to identify buildings and their damages. In response, we +further present a novel self-supervised framework, U-BDD++, which improves upon +the U-BDD baseline by addressing domain-specific issues associated with +satellite imagery. Furthermore, the new Building Proposal Generation (BPG) +module and the CLIP-enabled noisy Building Proposal Selection (CLIP-BPS) module +in U-BDD++ ensure high-quality self-training. Extensive experiments on the +widely used building damage assessment benchmark demonstrate the effectiveness +of the proposed method for unsupervised building damage detection. The +presented annotation-free and foundation model-based paradigm ensures an +efficient learning phase. This study opens a new direction for real-world BDD +and sets a strong baseline for future research. + +
+
+ comment: ICDM 2023 +
+
+
+
+
+ + ☆ A Challenging Multimodal Video Summary: Simultaneously Extracting and + Generating Keyframe-Caption Pairs from Video + + +
+ This paper proposes a practical multimodal video summarization task setting +and a dataset to train and evaluate the task. The target task involves +summarizing a given video into a predefined number of keyframe-caption pairs +and displaying them in a listable format to grasp the video content quickly. +This task aims to extract crucial scenes from the video in the form of images +(keyframes) and generate corresponding captions explaining each keyframe's +situation. This task is useful as a practical application and presents a highly +challenging problem worthy of study. Specifically, achieving simultaneous +optimization of the keyframe selection performance and caption quality +necessitates careful consideration of the mutual dependence on both preceding +and subsequent keyframes and captions. To facilitate subsequent research in +this field, we also construct a dataset by expanding upon existing datasets and +propose an evaluation framework. Furthermore, we develop two baseline systems +and report their respective performance. + +
+
+
+
+
+ + ☆ Survey on deep learning in multimodal medical imaging for cancer + detection + + +
+ The task of multimodal cancer detection is to determine the locations and +categories of lesions by using different imaging techniques, which is one of +the key research methods for cancer diagnosis. Recently, deep learning-based +object detection has made significant developments due to its strength in +semantic feature extraction and nonlinear function fitting. However, multimodal +cancer detection remains challenging due to morphological differences in +lesions, interpatient variability, difficulty in annotation, and imaging +artifacts. In this survey, we mainly investigate over 150 papers in recent +years with respect to multimodal cancer detection using deep learning, with a +focus on datasets and solutions to various challenges such as data annotation, +variance between classes, small-scale lesions, and occlusion. We also provide +an overview of the advantages and drawbacks of each approach. Finally, we +discuss the current scope of work and provide directions for the future +development of multimodal cancer detection. + +
+
+
+
+
+ + ☆ How to Configure Good In-Context Sequence for Visual Question Answering + + +
+ Inspired by the success of Large Language Models in dealing with new tasks +via In-Context Learning (ICL) in NLP, researchers have also developed Large +Vision-Language Models (LVLMs) with ICL capabilities. However, when +implementing ICL using these LVLMs, researchers usually resort to the simplest +way like random sampling to configure the in-context sequence, thus leading to +sub-optimal results. To enhance the ICL performance, in this study, we use +Visual Question Answering (VQA) as case study to explore diverse in-context +configurations to find the powerful ones. Additionally, through observing the +changes of the LVLM outputs by altering the in-context sequence, we gain +insights into the inner properties of LVLMs, improving our understanding of +them. Specifically, to explore in-context configurations, we design diverse +retrieval methods and employ different strategies to manipulate the retrieved +demonstrations. Through exhaustive experiments on three VQA datasets: VQAv2, +VizWiz, and OK-VQA, we uncover three important inner properties of the applied +LVLM and demonstrate which strategies can consistently improve the ICL VQA +performance. Our code is provided in: +https://github.com/GaryJiajia/OFv2_ICL_VQA. + +
+
+ comment: 8 pages, 6 figures +
+
+
+
+
+ + ☆ APoLLo: Unified Adapter and Prompt Learning for Vision Language Models EMNLP 2023 + + +
+ The choice of input text prompt plays a critical role in the performance of +Vision-Language Pretrained (VLP) models such as CLIP. We present APoLLo, a +unified multi-modal approach that combines Adapter and Prompt learning for +Vision-Language models. Our method is designed to substantially improve the +generalization capabilities of VLP models when they are fine-tuned in a +few-shot setting. We introduce trainable cross-attention-based adapter layers +in conjunction with vision and language encoders to strengthen the alignment +between the two modalities. We enforce consistency between the respective +encoder branches (receiving augmented inputs) to prevent overfitting in +downstream tasks. Our method is evaluated on three representative tasks: +generalization to novel classes, cross-dataset evaluation, and unseen domain +shifts. In practice, APoLLo achieves a relative gain up to 6.03% over MaPLe +(SOTA) on novel classes for 10 diverse image recognition datasets. + +
+
+ comment: Accepted at EMNLP 2023 (Main track) +
+
+
+
+
+ + ☆ Multi-View Person Matching and 3D Pose Estimation with Arbitrary + Uncalibrated Camera Networks + + +
+ Cross-view person matching and 3D human pose estimation in multi-camera +networks are particularly difficult when the cameras are extrinsically +uncalibrated. Existing efforts generally require large amounts of 3D data for +training neural networks or known camera poses for geometric constraints to +solve the problem. However, camera poses and 3D data annotation are usually +expensive and not always available. We present a method, PME, that solves the +two tasks without requiring either information. Our idea is to address +cross-view person matching as a clustering problem using each person as a +cluster center, then obtain correspondences from person matches, and estimate +3D human poses through multi-view triangulation and bundle adjustment. We solve +the clustering problem by introducing a "size constraint" using the number of +cameras and a "source constraint" using the fact that two people from the same +camera view should not match, to narrow the solution space to a small feasible +region. The 2D human poses used in clustering are obtained through a +pre-trained 2D pose detector, so our method does not require expensive 3D +training data for each new scene. We extensively evaluate our method on three +open datasets and two indoor and outdoor datasets collected using arbitrarily +set cameras. Our method outperforms other methods by a large margin on +cross-view person matching, reaches SOTA performance on 3D human pose +estimation without using either camera poses or 3D training data, and shows +good generalization ability across five datasets of various environment +settings. + +
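The multi-view triangulation step this pipeline relies on is the classical direct linear transform (DLT). A generic NumPy sketch, not the PME code:

```python
import numpy as np

def triangulate_dlt(projections, points_2d):
    """Linear (DLT) triangulation of one 3D point from >= 2 views.

    projections: list of 3x4 camera matrices P_i
    points_2d:   list of matching (x, y) observations, one per view
    """
    rows = []
    for P, (x, y) in zip(projections, points_2d):
        rows.append(x * P[2] - P[0])
        rows.append(y * P[2] - P[1])
    A = np.stack(rows)
    _, _, vt = np.linalg.svd(A)
    X = vt[-1]                       # null-space direction of A
    return X[:3] / X[3]              # back to inhomogeneous coordinates

# Toy example: two cameras observing the point (1, 2, 5).
P1 = np.hstack([np.eye(3), np.zeros((3, 1))])
P2 = np.hstack([np.eye(3), np.array([[-1.0], [0.0], [0.0]])])
X_true = np.array([1.0, 2.0, 5.0, 1.0])
obs = []
for P in (P1, P2):
    x = P @ X_true
    obs.append((x[0] / x[2], x[1] / x[2]))
print(triangulate_dlt([P1, P2], obs))  # ~ [1. 2. 5.]
```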
+
+
+
+
+ + ☆ Hyperspectral Image Compression Using Sampling and Implicit Neural + Representations + + +
+ Hyperspectral images, which record the electromagnetic spectrum for a pixel +in the image of a scene, often store hundreds of channels per pixel and contain +an order of magnitude more information than a similarly-sized RGB color image. +Consequently, concomitant with the decreasing cost of capturing these images, +there is a need to develop efficient techniques for storing, transmitting, and +analyzing hyperspectral images. This paper develops a method for hyperspectral +image compression using implicit neural representations where a multilayer +perceptron network F with sinusoidal activation functions "learns" to map pixel +locations to pixel intensities for a given hyperspectral image I. F thus acts +as a compressed encoding of this image, and the original image is reconstructed +by evaluating F at each pixel location. We use a sampling method with two +factors, window size and sampling rate, to reduce the compression time. We have +evaluated our method on four benchmarks -- Indian Pines, Jasper Ridge, Pavia +University, and Cuprite -- using PSNR and SSIM, and we show that the proposed +method achieves better compression than JPEG, JPEG2000, and PCA-DCT at low +bitrates. In addition, we compare our results with learning-based methods like +PCA+JPEG2000, FPCA+JPEG2000, 3D DCT, 3D DWT+SVR, and WSRC and show the +corresponding results in the "Compression Results" section. We also show that +our method with sampling achieves better speed and performance than our method +without sampling. +
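A minimal sketch of the implicit representation described here: an MLP F with sinusoidal activations mapping a pixel location to its spectrum. Layer sizes and the number of bands are assumptions.

```python
import torch
import torch.nn as nn

class SineLayer(nn.Module):
    """Linear layer followed by a sin activation, as in SIREN-style networks."""
    def __init__(self, in_dim, out_dim, omega=30.0):
        super().__init__()
        self.linear = nn.Linear(in_dim, out_dim)
        self.omega = omega

    def forward(self, x):
        return torch.sin(self.omega * self.linear(x))

# F: (x, y) pixel location -> vector of spectral band intensities.
num_bands = 200                         # e.g. hundreds of channels per pixel
F_net = nn.Sequential(
    SineLayer(2, 256), SineLayer(256, 256), nn.Linear(256, num_bands))

coords = torch.rand(1024, 2) * 2 - 1    # sampled pixel locations in [-1, 1]^2
pred_spectra = F_net(coords)
print(pred_spectra.shape)               # torch.Size([1024, 200])
```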
+
+
+
+
+ + ☆ Unsupervised Change Detection for Space Habitats Using 3D Point Clouds + + +
+ This work presents an algorithm for scene change detection from point clouds +to enable autonomous robotic caretaking in future space habitats. Autonomous +robotic systems will help maintain future deep-space habitats, such as the +Gateway space station, which will be uncrewed for extended periods. Existing +scene analysis software used on the International Space Station (ISS) relies on +manually-labeled images for detecting changes. In contrast, the algorithm +presented in this work uses raw, unlabeled point clouds as inputs. The +algorithm first applies modified Expectation-Maximization Gaussian Mixture +Model (GMM) clustering to two input point clouds. It then performs change +detection by comparing the GMMs using the Earth Mover's Distance. The algorithm +is validated quantitatively and qualitatively using a test dataset collected by +an Astrobee robot in the NASA Ames Granite Lab comprising single frame depth +images taken directly by Astrobee and full-scene reconstructed maps built with +RGB-D and pose data from Astrobee. The runtimes of the approach are also +analyzed in depth. The source code is publicly released to promote further +development. + +
+
+ comment: 15 pages, 7 figures, Manuscript will be presented at the AIAA SciTech + Forum in Orlando, FL, USA, 8 - 12 January 2024 +
+
+
+
+
+ + ♻ ☆ ContriMix: Unsupervised disentanglement of content and attribute for + domain generalization in microscopy image analysis + + +
+ Domain generalization is critical for real-world applications of machine +learning to microscopy images, including histopathology and fluorescence +imaging. Artifacts in these modalities arise through a complex combination of +factors relating to tissue collection and laboratory processing, as well as +factors intrinsic to patient samples. In fluorescence imaging, these artifacts +stem from variations across experimental batches. The complexity and subtlety +of these artifacts make the enumeration of data domains intractable. Therefore, +augmentation-based methods of domain generalization that require domain +identifiers and manual fine-tuning are inadequate in this setting. To overcome +this challenge, we introduce ContriMix, a domain generalization technique that +learns to generate synthetic images by disentangling and permuting the +biological content ("content") and technical variations ("attributes") in +microscopy images. ContriMix does not rely on domain identifiers or handcrafted +augmentations and makes no assumptions about the input characteristics of +images. We assess the performance of ContriMix on two pathology datasets +dealing with patch classification and Whole Slide Image label prediction tasks +respectively (Camelyon17-WILDS and RCC subtyping), and one fluorescence +microscopy dataset (RxRx1-WILDS). Without any access to domain identifiers at +train or test time, ContriMix performs similar or better than current +state-of-the-art methods in all these datasets, motivating its usage for +microscopy image analysis in real-world settings where domain information is +hard to come by. The code for ContriMix can be found at +https://gitlab.com/huutan86/contrimix + +
+
+
+
+
+ + ♻ ☆ A new sampling methodology for defining heterogeneous subsets of samples + for training image segmentation algorithms + + +
+ Creating a dataset for training supervised machine learning algorithms can be +a demanding task. This is especially true for medical image segmentation since +one or more specialists are usually required for image annotation, and creating +ground truth labels for just a single image can take up to several hours. In +addition, it is paramount that the annotated samples represent well the +different conditions that might affect the imaged tissues as well as possible +changes in the image acquisition process. This can only be achieved by +considering samples that are typical in the dataset as well as atypical, or +even outlier, samples. We introduce a new sampling methodology for selecting +relevant images from a large dataset in a way that evenly considers both +prototypical as well as atypical samples. The methodology involves the +generation of a uniform grid from a feature space representing the samples, +which is then used for randomly drawing relevant images. The selected images +provide a uniform covering of the original dataset, and thus define a +heterogeneous set of images that can be annotated and used for training +supervised segmentation algorithms. We provide a case example by creating a +dataset containing a representative set of blood vessel microscopy images +selected from a larger dataset containing thousands of images. The dataset, +which we call VessMAP, is being made available online to aid the development of +new blood vessel segmentation algorithms. + +
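The grid-based sampling idea can be sketched in a few lines: lay a uniform grid over a feature space and draw at most one sample per occupied cell, so dense (prototypical) and sparse (atypical) regions are covered evenly. A toy NumPy sketch with an assumed 2D feature space, not the paper's code:

```python
import numpy as np

def uniform_grid_sample(features, grid_size=8, seed=0):
    """Draw at most one sample index per occupied cell of a uniform grid
    laid over a 2D feature space."""
    rng = np.random.default_rng(seed)
    mins, maxs = features.min(axis=0), features.max(axis=0)
    cells = np.floor((features - mins) / (maxs - mins + 1e-12) * grid_size)
    cells = np.clip(cells, 0, grid_size - 1).astype(int)
    selected = []
    for cell in np.unique(cells, axis=0):
        members = np.where((cells == cell).all(axis=1))[0]
        selected.append(rng.choice(members))     # one random image per cell
    return np.array(selected)

rng = np.random.default_rng(1)
feats = rng.standard_normal((5000, 2))            # e.g. a 2D projection of image features
idx = uniform_grid_sample(feats, grid_size=8)
print(len(idx), "images selected out of", len(feats))
```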
+
+ comment: 10 pages, 9 figures. Under review +
+
+
+
+
+ + ♻ ☆ I-AI: A Controllable & Interpretable AI System for Decoding + Radiologists' Intense Focus for Accurate CXR Diagnoses + + +
+ In the field of chest X-ray (CXR) diagnosis, existing works often focus +solely on determining where a radiologist looks, typically through tasks such +as detection, segmentation, or classification. However, these approaches are +often designed as black-box models, lacking interpretability. In this paper, we +introduce Interpretable Artificial Intelligence (I-AI) a novel and unified +controllable interpretable pipeline for decoding the intense focus of +radiologists in CXR diagnosis. Our I-AI addresses three key questions: where a +radiologist looks, how long they focus on specific areas, and what findings +they diagnose. By capturing the intensity of the radiologist's gaze, we provide +a unified solution that offers insights into the cognitive process underlying +radiological interpretation. Unlike current methods that rely on black-box +machine learning models, which can be prone to extracting erroneous information +from the entire input image during the diagnosis process, we tackle this issue +by effectively masking out irrelevant information. Our proposed I-AI leverages +a vision-language model, allowing for precise control over the interpretation +process while ensuring the exclusion of irrelevant features. To train our I-AI +model, we utilize an eye gaze dataset to extract anatomical gaze information +and generate ground truth heatmaps. Through extensive experimentation, we +demonstrate the efficacy of our method. We showcase that the attention +heatmaps, designed to mimic radiologists' focus, encode sufficient and relevant +information, enabling accurate classification tasks using only a portion of +CXR. + +
+
+
+
+
+ + ♻ ☆ Direct Unsupervised Denoising + + +
+ Traditional supervised denoisers are trained using pairs of noisy input and +clean target images. They learn to predict a central tendency of the posterior +distribution over possible clean images. When, e.g., trained with the popular +quadratic loss function, the network's output will correspond to the minimum +mean square error (MMSE) estimate. Unsupervised denoisers based on Variational +AutoEncoders (VAEs) have succeeded in achieving state-of-the-art results while +requiring only unpaired noisy data as training input. In contrast to the +traditional supervised approach, unsupervised denoisers do not directly produce +a single prediction, such as the MMSE estimate, but allow us to draw samples +from the posterior distribution of clean solutions corresponding to the noisy +input. To approximate the MMSE estimate during inference, unsupervised methods +have to create and draw a large number of samples - a computationally expensive +process - rendering the approach inapplicable in many situations. Here, we +present an alternative approach that trains a deterministic network alongside +the VAE to directly predict a central tendency. Our method achieves results +that surpass the results achieved by the unsupervised method at a fraction of +the computational cost. + +
+
+
+
+
+ + ♻ ☆ How Deep Neural Networks Learn Compositional Data: The Random Hierarchy + Model + + +
+ Deep learning algorithms demonstrate a surprising ability to learn +high-dimensional tasks from limited examples. This is commonly attributed to +the depth of neural networks, enabling them to build a hierarchy of abstract, +low-dimensional data representations. However, how many training examples are +required to learn such representations remains unknown. To quantitatively study +this question, we introduce the Random Hierarchy Model: a family of synthetic +tasks inspired by the hierarchical structure of language and images. The model +is a classification task where each class corresponds to a group of high-level +features, chosen among several equivalent groups associated with the same +class. In turn, each feature corresponds to a group of sub-features chosen +among several equivalent ones and so on, following a hierarchy of composition +rules. We find that deep networks learn the task by developing internal +representations invariant to exchanging equivalent groups. Moreover, the number +of data required corresponds to the point where correlations between low-level +features and classes become detectable. Overall, our results indicate how deep +networks overcome the curse of dimensionality by building invariant +representations, and provide an estimate of the number of data required to +learn a hierarchical task. + +
+
+ comment: 9 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Generating Realistic Counterfactuals for Retinal Fundus and OCT Images + using Diffusion Models + + +
+ Counterfactual reasoning is often used in clinical settings to explain +decisions or weigh alternatives. Therefore, for imaging based specialties such +as ophthalmology, it would be beneficial to be able to create counterfactual +images, illustrating answers to questions like "If the subject had had diabetic +retinopathy, how would the fundus image have looked?". Here, we demonstrate +that using a diffusion model in combination with an adversarially robust +classifier trained on retinal disease classification tasks enables the +generation of highly realistic counterfactuals of retinal fundus images and +optical coherence tomography (OCT) B-scans. The key to the realism of +counterfactuals is that these classifiers encode salient features indicative +for each disease class and can steer the diffusion model to depict disease +signs or remove disease-related lesions in a realistic way. In a user study, +domain experts also found the counterfactuals generated using our method +significantly more realistic than counterfactuals generated from a previous +method, and even indistinguishable from real images. + +
+
+
+
+
+ + ♻ ☆ Interpreting and Improving Diffusion Models Using the Euclidean Distance + Function + + +
+ Denoising is intuitively related to projection. Indeed, under the manifold +hypothesis, adding random noise is approximately equivalent to orthogonal +perturbation. Hence, learning to denoise is approximately learning to project. +In this paper, we use this observation to reinterpret denoising diffusion +models as approximate gradient descent applied to the Euclidean distance +function. We then provide straight-forward convergence analysis of the DDIM +sampler under simple assumptions on the projection-error of the denoiser. +Finally, we propose a new sampler based on two simple modifications to DDIM +using insights from our theoretical results. In as few as 5-10 function +evaluations, our sampler achieves state-of-the-art FID scores on pretrained +CIFAR-10 and CelebA models and can generate high quality samples on latent +diffusion models. + +
+
+ comment: 19 pages, 8 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ AgentAvatar: Disentangling Planning, Driving and Rendering for + Photorealistic Avatar Agents + + +
+ In this study, our goal is to create interactive avatar agents that can +autonomously plan and animate nuanced facial movements realistically, from both +visual and behavioral perspectives. Given high-level inputs about the +environment and agent profile, our framework harnesses LLMs to produce a series +of detailed text descriptions of the avatar agents' facial motions. These +descriptions are then processed by our task-agnostic driving engine into motion +token sequences, which are subsequently converted into continuous motion +embeddings that are further consumed by our standalone neural-based renderer to +generate the final photorealistic avatar animations. These streamlined +processes allow our framework to adapt to a variety of non-verbal avatar +interactions, both monadic and dyadic. Our extensive study, which includes +experiments on both newly compiled and existing datasets featuring two types of +agents -- one capable of monadic interaction with the environment, and the +other designed for dyadic conversation -- validates the effectiveness and +versatility of our approach. To our knowledge, we advanced a leap step by +combining LLMs and neural rendering for generalized non-verbal prediction and +photo-realistic rendering of avatar agents. + +
+
+ comment: Project page: https://dorniwang.github.io/AgentAvatar_project/ +
+
+
+
+
+ + ♻ ☆ Space-Time Attention with Shifted Non-Local Search + + +
+ Efficiently computing attention maps for videos is challenging due to the +motion of objects between frames. While a standard non-local search is +high-quality for a window surrounding each query point, the window's small size +cannot accommodate motion. Methods for long-range motion use an auxiliary +network to predict the most similar key coordinates as offsets from each query +location. However, accurately predicting this flow field of offsets remains +challenging, even for large-scale networks. Small spatial inaccuracies +significantly impact the attention module's quality. This paper proposes a +search strategy that combines the quality of a non-local search with the range +of predicted offsets. The method, named Shifted Non-Local Search, executes a +small grid search surrounding the predicted offsets to correct small spatial +errors. Our method's in-place computation consumes 10 times less memory and is +over 3 times faster than previous work. Experimentally, correcting the small +spatial errors improves the video frame alignment quality by over 3 dB PSNR. +Our search upgrades existing space-time attention modules, which improves video +denoising results by 0.30 dB PSNR for a 7.5% increase in overall runtime. We +integrate our space-time attention module into a UNet-like architecture to +achieve state-of-the-art results on video denoising. + +
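The core operation, a small grid search around a predicted offset to correct spatial errors, can be sketched for a single query patch as follows (plain NumPy, not the paper's CUDA implementation):

```python
import numpy as np

def shifted_local_search(frame_t, frame_tp1, y, x, flow, patch=7, radius=3):
    """Refine a predicted offset by searching a small grid around it
    for the best-matching patch in the next frame."""
    half = patch // 2
    query = frame_t[y - half:y + half + 1, x - half:x + half + 1]
    base_y, base_x = y + int(round(flow[0])), x + int(round(flow[1]))
    best, best_err = (base_y, base_x), np.inf
    for dy in range(-radius, radius + 1):
        for dx in range(-radius, radius + 1):
            cy, cx = base_y + dy, base_x + dx
            cand = frame_tp1[cy - half:cy + half + 1, cx - half:cx + half + 1]
            if cand.shape != query.shape:
                continue                       # skip out-of-bounds candidates
            err = np.sum((query - cand) ** 2)
            if err < best_err:
                best, best_err = (cy, cx), err
    return best, best_err

rng = np.random.default_rng(0)
f0 = rng.random((64, 64))
f1 = np.roll(f0, shift=(2, -1), axis=(0, 1))           # true motion: (+2, -1)
print(shifted_local_search(f0, f1, 30, 30, flow=(1.0, 0.0)))  # corrects to (32, 29)
```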
+
+ comment: 15 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ Depth-Regularized Optimization for 3D Gaussian Splatting in Few-Shot + Images + + +
+ In this paper, we present a method to optimize Gaussian splatting with a +limited number of images while avoiding overfitting. Representing a 3D scene by +combining numerous Gaussian splats has yielded outstanding visual quality. +However, it tends to overfit the training views when only a small number of +images are available. To address this issue, we introduce a dense depth map as +a geometry guide to mitigate overfitting. We obtain the depth map from a +pre-trained monocular depth estimation model and align its scale and offset +using sparse COLMAP feature points. The adjusted depth aids in the color-based +optimization of 3D Gaussian splatting, mitigating floating artifacts, and +ensuring adherence to geometric constraints. We verify the proposed method on +the NeRF-LLFF dataset with varying numbers of few-shot training images. Our approach +demonstrates robust geometry compared to the original method that relies solely +on images. Project page: robot0321.github.io/DepthRegGS +
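Aligning the scale and offset of a monocular depth map to sparse COLMAP depths is a linear least-squares fit. A minimal NumPy sketch with synthetic values; variable names are assumptions:

```python
import numpy as np

def align_depth(mono_depth_at_points, sparse_depths):
    """Solve for scale s and offset t minimizing || s * d_mono + t - d_sparse ||^2."""
    A = np.stack([mono_depth_at_points, np.ones_like(mono_depth_at_points)], axis=1)
    (s, t), *_ = np.linalg.lstsq(A, sparse_depths, rcond=None)
    return s, t

rng = np.random.default_rng(0)
d_mono = rng.uniform(0.1, 1.0, size=200)                          # relative depths from the model
d_sparse = 3.2 * d_mono + 0.7 + 0.01 * rng.standard_normal(200)   # COLMAP point depths
s, t = align_depth(d_mono, d_sparse)
print(round(float(s), 2), round(float(t), 2))                     # ~3.2, ~0.7
```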
+
+ comment: 10 pages, 5 figures; Project page: robot0321.github.io/DepthRegGS +
+
+
+
+
+ + ♻ ☆ FaceDNeRF: Semantics-Driven Face Reconstruction, Prompt Editing and + Relighting with Diffusion Models + + +
+ The ability to create high-quality 3D faces from a single image has become +increasingly important with wide applications in video conferencing, AR/VR, and +advanced video editing in movie industries. In this paper, we propose Face +Diffusion NeRF (FaceDNeRF), a new generative method to reconstruct high-quality +Face NeRFs from single images, complete with semantic editing and relighting +capabilities. FaceDNeRF utilizes high-resolution 3D GAN inversion and expertly +trained 2D latent-diffusion model, allowing users to manipulate and construct +Face NeRFs in zero-shot learning without the need for explicit 3D data. With +carefully designed illumination and identity preserving loss, as well as +multi-modal pre-training, FaceDNeRF offers users unparalleled control over the +editing process enabling them to create and edit face NeRFs using just +single-view images, text prompts, and explicit target lighting. The advanced +features of FaceDNeRF have been designed to produce more impressive results +than existing 2D editing approaches that rely on 2D segmentation maps for +editable attributes. Experiments show that our FaceDNeRF achieves exceptionally +realistic results and unprecedented flexibility in editing compared with +state-of-the-art 3D face reconstruction and editing methods. Our code will be +available at https://github.com/BillyXYB/FaceDNeRF. + +
+
+
+
+
+ + ♻ ☆ BioCLIP: A Vision Foundation Model for the Tree of Life + + +
+ Images of the natural world, collected by a variety of cameras, from drones +to individual phones, are increasingly abundant sources of biological +information. There is an explosion of computational methods and tools, +particularly computer vision, for extracting biologically relevant information +from images for science and conservation. Yet most of these are bespoke +approaches designed for a specific task and are not easily adaptable or +extendable to new questions, contexts, and datasets. A vision model for general +organismal biology questions on images is of timely need. To approach this, we +curate and release TreeOfLife-10M, the largest and most diverse ML-ready +dataset of biology images. We then develop BioCLIP, a foundation model for the +tree of life, leveraging the unique properties of biology captured by +TreeOfLife-10M, namely the abundance and variety of images of plants, animals, +and fungi, together with the availability of rich structured biological +knowledge. We rigorously benchmark our approach on diverse fine-grained biology +classification tasks, and find that BioCLIP consistently and substantially +outperforms existing baselines (by 17% to 20% absolute). Intrinsic evaluation +reveals that BioCLIP has learned a hierarchical representation conforming to +the tree of life, shedding light on its strong generalizability. Our code, +models and data will be made available at +https://github.com/Imageomics/bioclip. + +
+
+ comment: 18 pages; updated title +
+
+
+
+
+ + ♻ ☆ Student Classroom Behavior Detection based on Spatio-Temporal Network + and Multi-Model Fusion + + +
+ Using deep learning methods to detect students' classroom behavior +automatically is a promising approach for analyzing their class performance and +improving teaching effectiveness. However, the lack of publicly available +spatio-temporal datasets on student behavior, as well as the high cost of +manually labeling such datasets, poses significant challenges for researchers in +this field. To address this issue, we propose a method for extending the +spatio-temporal behavior dataset in Student Classroom Scenarios +(SCB-ST-Dataset4) using an image dataset. Our SCB-ST-Dataset4 comprises 757,265 +images with 25,810 labels, focusing on three behaviors: hand-raising, reading, and +writing. Our proposed method can rapidly generate spatio-temporal behavior +datasets without requiring extra manual labeling. Furthermore, we propose a +Behavior Similarity Index (BSI) to explore the similarity of behaviors. We +evaluated the dataset using the YOLOv5, YOLOv7, YOLOv8, and SlowFast +algorithms, achieving a mean average precision (mAP) of up to 82.3%. Finally, we +fuse multiple models to generate student behavior-related data from various +perspectives. The experiment further demonstrates the effectiveness of our +method. SCB-ST-Dataset4 provides a robust foundation for future research in +student behavior detection, potentially contributing to advancements in this +field. The SCB-ST-Dataset4 is available for download at: +https://github.com/Whiffe/SCB-dataset. +
+
+ comment: arXiv admin note: substantial text overlap with arXiv:2310.02522; + text overlap with arXiv:2306.03318 +
+
+
+
+
+ + ♻ ☆ High-Fidelity Zero-Shot Texture Anomaly Localization Using Feature + Correspondence Analysis + + +
+ We propose a novel method for Zero-Shot Anomaly Localization on textures. The +task refers to identifying abnormal regions in an otherwise homogeneous image. +To obtain a high-fidelity localization, we leverage a bijective mapping derived +from the 1-dimensional Wasserstein Distance. As opposed to using holistic +distances between distributions, the proposed approach allows pinpointing the +non-conformity of a pixel in a local context with increased precision. By +aggregating the contribution of the pixel to the errors of all nearby patches +we obtain a reliable anomaly score estimate. We validate our solution on +several datasets and obtain more than a 40% reduction in error over the +previous state of the art on the MVTec AD dataset in a zero-shot setting. Also +see https://reality.tf.fau.de/pub/ardelean2024highfidelity.html. + +
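In one dimension, the Wasserstein distance between two equal-sized empirical distributions reduces to comparing sorted samples, which is also the bijective mapping referred to above. A toy NumPy sketch of scoring a patch against a reference, not the paper's full aggregation scheme:

```python
import numpy as np

def wasserstein_1d(a: np.ndarray, b: np.ndarray) -> float:
    """W1 distance between two equal-sized empirical 1D distributions:
    the mean absolute difference between their sorted samples."""
    return float(np.mean(np.abs(np.sort(a.ravel()) - np.sort(b.ravel()))))

rng = np.random.default_rng(0)
reference = rng.normal(0.5, 0.05, size=(16, 16))   # a normal texture patch
normal = rng.normal(0.5, 0.05, size=(16, 16))
anomalous = normal.copy()
anomalous[4:8, 4:8] += 0.4                         # inject a local defect

print(wasserstein_1d(reference, normal))           # small
print(wasserstein_1d(reference, anomalous))        # noticeably larger
```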
+
+
+
+
+ + ♻ ☆ Continual Learning with Dynamic Sparse Training: Exploring Algorithms + for Effective Model Updates + + +
+ Continual learning (CL) refers to the ability of an intelligent system to +sequentially acquire and retain knowledge from a stream of data with as little +computational overhead as possible. To this end, regularization, replay, +architecture, and parameter isolation approaches have been introduced in the +literature. Parameter isolation uses a sparse network, which enables allocating +distinct parts of the neural network to different tasks and also +allows parameters to be shared between tasks if they are similar. Dynamic Sparse +Training (DST) is a prominent way to find these sparse networks and isolate +them for each task. This paper is the first empirical study investigating the +effect of different DST components under the CL paradigm to fill a critical +research gap and shed light on the optimal configuration of DST for CL, if it +exists. Therefore, we perform a comprehensive study in which we investigate +various DST components to find the best topology per task on well-known +CIFAR100 and miniImageNet benchmarks in a task-incremental CL setup, since our +primary focus is to evaluate the performance of various DST criteria, rather +than the process of mask selection. We found that, at a low sparsity level, +Erdos-Rényi Kernel (ERK) initialization utilizes the backbone more +efficiently and allows task increments to be learned effectively. At a high +sparsity level, unless it is extreme, uniform initialization demonstrates a +more reliable and robust performance. In terms of growth strategy, performance +is dependent on the defined initialization strategy and the extent of sparsity. +Finally, adaptivity within DST components is a promising way for better +continual learners. +
+
+
+
+
+ + ♻ ☆ Can GPT-4V(ision) Serve Medical Applications? Case Studies on GPT-4V for + Multimodal Medical Diagnosis + + +
+ Driven by large foundation models, the development of artificial +intelligence has witnessed tremendous progress lately, leading to a surge of +general interest from the public. In this study, we aim to assess the +performance of OpenAI's newest model, GPT-4V(ision), specifically in the realm +of multimodal medical diagnosis. Our evaluation encompasses 17 human body +systems, including Central Nervous System, Head and Neck, Cardiac, Chest, +Hematology, Hepatobiliary, Gastrointestinal, Urogenital, Gynecology, +Obstetrics, Breast, Musculoskeletal, Spine, Vascular, Oncology, Trauma, +Pediatrics, with images taken from 8 modalities used in daily clinical routine, +e.g., X-ray, Computed Tomography (CT), Magnetic Resonance Imaging (MRI), +Positron Emission Tomography (PET), Digital Subtraction Angiography (DSA), +Mammography, Ultrasound, and Pathology. We probe GPT-4V's ability on +multiple clinical tasks with or without patient history provided, including +imaging modality and anatomy recognition, disease diagnosis, report generation, +and disease localisation. + Our observation shows that, while GPT-4V demonstrates proficiency in +distinguishing between medical image modalities and anatomy, it faces +significant challenges in disease diagnosis and generating comprehensive +reports. These findings underscore that while large multimodal models have made +significant advancements in computer vision and natural language processing, they +remain far from being able to effectively support real-world medical +applications and clinical decision-making. + All images used in this report can be found at +https://github.com/chaoyi-wu/GPT-4V_Medical_Evaluation. +
+
+
+
+
+ + ♻ ☆ Towards Surveillance Video-and-Language Understanding: New Dataset, + Baselines, and Challenges + + +
+ Surveillance videos are an essential component of daily life with various +critical applications, particularly in public security. However, current +surveillance video tasks mainly focus on classifying and localizing anomalous +events. Existing methods are limited to detecting and classifying the +predefined events with unsatisfactory semantic understanding, although they +have obtained considerable performance. To address this issue, we propose a new +research direction of surveillance video-and-language understanding, and +construct the first multimodal surveillance video dataset. We manually annotate +the real-world surveillance dataset UCF-Crime with fine-grained event content +and timing. Our newly annotated dataset, UCA (UCF-Crime Annotation), contains +23,542 sentences, with an average length of 20 words, and its annotated videos +are as long as 110.7 hours. Furthermore, we benchmark SOTA models for four +multimodal tasks on this newly created dataset, which serve as new baselines +for surveillance video-and-language understanding. Through our experiments, we +find that mainstream models used in previously publicly available datasets +perform poorly on surveillance video, which demonstrates the new challenges in +surveillance video-and-language understanding. To validate the effectiveness of +our UCA, we conducted experiments on multimodal anomaly detection. The results +demonstrate that our multimodal surveillance learning can improve the +performance of conventional anomaly detection tasks. All the experiments +highlight the necessity of constructing this dataset to advance surveillance +AI. The link to our dataset is provided at: +https://xuange923.github.io/Surveillance-Video-Understanding. + +
+
+
+
+
+ + ♻ ☆ HoVer-UNet: Accelerating HoVerNet with UNet-based multi-class nuclei + segmentation via knowledge distillation + + +
+ We present HoVer-UNet, an approach to distill the knowledge of the +multi-branch HoVerNet framework for nuclei instance segmentation and +classification in histopathology. We propose a compact, streamlined single UNet +network with a Mix Vision Transformer backbone, and equip it with a custom loss +function to optimally encode the distilled knowledge of HoVerNet, reducing +computational requirements without compromising performances. We show that our +model achieved results comparable to HoVerNet on the public PanNuke and Consep +datasets with a three-fold reduction in inference time. We make the code of our +model publicly available at https://github.com/DIAGNijmegen/HoVer-UNet. + +
+
+ comment: 4 pages, 2 figures, submitted to ISBI 2024 +
+
+
+
+
+ + ♻ ☆ Interpretable 2D Vision Models for 3D Medical Images + + +
+ Training Artificial Intelligence (AI) models on 3D images presents unique +challenges compared to the 2D case: firstly, the demand for computational +resources is significantly higher, and secondly, the availability of large +datasets for pre-training is often limited, impeding training success. This +study proposes a simple approach for adapting 2D networks with an intermediate +feature representation for processing 3D images. Our method employs attention +pooling to learn to assign each slice an importance weight and, by that, obtain +a weighted average of all 2D slices. These weights directly quantify the +contribution of each slice to the final prediction and thus make the model +prediction inspectable. We show on all 3D MedMNIST datasets, used as benchmarks, and on +two real-world datasets consisting of several hundred high-resolution CT or MRI +scans that our approach performs on par with existing methods. Furthermore, we +compare the in-built interpretability of our approach to HiResCam, a +state-of-the-art retrospective interpretability approach. +
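Attention pooling over per-slice 2D features can be sketched as below; the feature dimension and number of slices are placeholders, and the 2D backbone is omitted.

```python
import torch
import torch.nn as nn

class SliceAttentionPool(nn.Module):
    """Learn one importance weight per 2D slice and return the weighted
    average of slice features; the weights expose each slice's contribution."""
    def __init__(self, feat_dim):
        super().__init__()
        self.score = nn.Linear(feat_dim, 1)

    def forward(self, slice_feats):                               # (num_slices, feat_dim)
        weights = torch.softmax(self.score(slice_feats), dim=0)   # (num_slices, 1)
        pooled = (weights * slice_feats).sum(dim=0)               # (feat_dim,)
        return pooled, weights.squeeze(-1)

# e.g. 64 axial slices, each encoded by a 2D backbone into a 512-d vector.
slice_features = torch.randn(64, 512)
pool = SliceAttentionPool(512)
volume_feature, slice_weights = pool(slice_features)
print(volume_feature.shape, slice_weights.shape)  # torch.Size([512]) torch.Size([64])
```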
+
+
+
+
+ + ♻ ☆ Visualizing key features in X-ray images of epoxy resins for improved + material classification using singular value decomposition of deep learning + features + + +
+ Although the process variables of epoxy resins alter their mechanical +properties, the visual identification of the characteristic features of X-ray +images of samples of these materials is challenging. To facilitate the +identification, we approximate the magnitude of the gradient of the intensity +field of the X-ray images of different kinds of epoxy resins and then we use +deep learning to discover the most representative features of the transformed +images. In this solution of the inverse problem to finding characteristic +features to discriminate samples of heterogeneous materials, we use the +eigenvectors obtained from the singular value decomposition of all the channels +of the feature maps of the early layers in a convolutional neural network. +While the strongest activated channel gives a visual representation of the +characteristic features, often these are not robust enough in some practical +settings. On the other hand, the left singular vectors of the matrix +decomposition of the feature maps, barely change when variables such as the +capacity of the network or network architecture change. High classification +accuracy and robustness of characteristic features are presented in this work. + +
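Extracting the left singular vectors of the matrix formed by the flattened feature-map channels takes a few lines of NumPy; the feature-map shape below is assumed, and random values stand in for real activations.

```python
import numpy as np

rng = np.random.default_rng(0)

# Hypothetical feature maps from an early conv layer: C channels of H x W responses.
C, H, W = 64, 56, 56
feature_maps = rng.standard_normal((C, H, W))

# Stack every channel as a column, then decompose.
M = feature_maps.reshape(C, -1).T                 # (H*W, C): one column per channel
U, S, Vt = np.linalg.svd(M, full_matrices=False)

# The leading left singular vectors, reshaped back to the spatial grid, give
# channel-robust spatial patterns that can be visualized as images.
leading_patterns = U[:, :3].T.reshape(3, H, W)
print(leading_patterns.shape, S[:3])
```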
+
+ comment: 43 pages, 16 figures +
+
+
+
+
+ + ♻ ☆ Label-Efficient Deep Learning in Medical Image Analysis: Challenges and + Future Directions + + +
+ Deep learning has seen rapid growth in recent years and achieved +state-of-the-art performance in a wide range of applications. However, training +models typically requires expensive and time-consuming collection of large +quantities of labeled data. This is particularly true within the scope of +medical imaging analysis (MIA), where data are limited and labels are expensive +to be acquired. Thus, label-efficient deep learning methods are developed to +make comprehensive use of the labeled data as well as the abundance of +unlabeled and weak-labeled data. In this survey, we extensively investigated +over 300 recent papers to provide a comprehensive overview of recent progress +on label-efficient learning strategies in MIA. We first present the background +of label-efficient learning and categorize the approaches into different +schemes. Next, we examine the current state-of-the-art methods in detail +through each scheme. Specifically, we provide an in-depth investigation, +covering not only canonical semi-supervised, self-supervised, and +multi-instance learning schemes, but also recently emerged active and +annotation-efficient learning strategies. Moreover, as a comprehensive +contribution to the field, this survey not only elucidates the commonalities +and unique features of the surveyed methods but also presents a detailed +analysis of the current challenges in the field and suggests potential avenues +for future research. + +
+
+ comment: Update 2023 papers +
+
+
+
+
+ + ♻ ☆ CNN Feature Map Augmentation for Single-Source Domain Generalization + + +
+ In search of robust and generalizable machine learning models, Domain +Generalization (DG) has gained significant traction during the past few years. +The goal in DG is to produce models which continue to perform well when +presented with data distributions different from the ones available during +training. While deep convolutional neural networks (CNNs) have been able to +achieve outstanding performance on downstream computer vision tasks, they still +often fail to generalize to previously unseen data domains. Therefore, in this +work we focus on producing a model which is able to remain robust under data +distribution shift and propose an alternative regularization technique for +convolutional neural network architectures in the single-source DG image +classification setting. To mitigate the problem caused by domain shift between +source and target data, we propose augmenting intermediate feature maps of +CNNs. Specifically, we pass them through a novel Augmentation Layer to prevent +models from overfitting on the training set and improve their cross-domain +generalization. To the best of our knowledge, this is the first paper proposing +such a setup for the DG image classification setting. Experiments on the DG +benchmark datasets of PACS, VLCS, Office-Home and TerraIncognita validate the +effectiveness of our method, where our model surpasses state-of-the-art +algorithms in most cases. +
+
+ comment: In proceedings of IEEE BigDataService2023 + (https://ieeebigdataservice.com/) +
+
+
+
+
+ + ♻ ☆ 3D Point Cloud Registration with Learning-based Matching Algorithm + + +
+ We present a novel differential matching algorithm for 3D point cloud +registration. Instead of only optimizing the feature extractor for a matching +algorithm, we propose a learning-based matching module optimized to the +jointly-trained feature extractor. We focused on edge-wise feature-forwarding +architectures, which are memory-consuming but can avoid the over-smoothing +effect that GNNs suffer. We improve its memory efficiency to scale it for point +cloud registration while investigating the best way of connecting it to the +feature extractor. Experimental results show our matching module's significant +impact on performance improvement in rigid/non-rigid and whole/partial point +cloud registration datasets with multiple contemporary feature extractors. For +example, our module boosted the current SOTA method, RoITr, by +5.4%, and +7.2% +in the NFMR metric and +6.1% and +8.5% in the IR metric on the 4DMatch and +4DLoMatch datasets, respectively. + +
+
+
+
+
+ + ♻ ☆ Omnipotent Adversarial Training in the Wild + + +
+ Adversarial training is an important topic in robust deep learning, but the +community lacks attention to its practical usage. In this paper, we aim to +resolve a real-world challenge, i.e., training a model on an imbalanced and +noisy dataset to achieve high clean accuracy and adversarial robustness, with +our proposed Omnipotent Adversarial Training (OAT) strategy. OAT consists of +two innovative methodologies to address the imperfection in the training set. +We first introduce an oracle into the adversarial training process to help the +model learn a correct data-label conditional distribution. This +carefully-designed oracle can provide correct label annotations for adversarial +training. We further propose logits adjustment adversarial training to overcome +the data imbalance issue, which can help the model learn a Bayes-optimal +distribution. Our comprehensive evaluation results show that OAT outperforms +other baselines by more than 20% clean accuracy improvement and 10% robust +accuracy improvement under complex combinations of data imbalance and label +noise scenarios. The code can be found in https://github.com/GuanlinLee/OAT. + +
+
+
+
+
+ + ♻ ☆ Alleviating the Effect of Data Imbalance on Adversarial Training + + +
+ In this paper, we study adversarial training on datasets that obey the +long-tailed distribution, which is practical but rarely explored in previous +works. Compared with conventional adversarial training on balanced datasets, +this process falls into the dilemma of generating uneven adversarial examples +(AEs) and an unbalanced feature embedding space, causing the resulting model to +exhibit low robustness and accuracy on tail data. To combat that, we +theoretically analyze the lower bound of the robust risk to train a model on a +long-tailed dataset to obtain the key challenges in addressing the +aforementioned dilemmas. Based on it, we propose a new adversarial training +framework -- Re-balancing Adversarial Training (REAT). This framework consists +of two components: (1) a new training strategy inspired by the effective number +to guide the model to generate more balanced and informative AEs; (2) a +carefully constructed penalty function to force a satisfactory feature space. +Evaluation results on different datasets and model structures prove that REAT +can effectively enhance the model's robustness and preserve the model's clean +accuracy. The code can be found in https://github.com/GuanlinLee/REAT. + +
+
+
+
+
+ + ♻ ☆ Learning Structure-from-Motion with Graph Attention Networks + + +
+ In this paper we tackle the problem of learning Structure-from-Motion (SfM) +through the use of graph attention networks. SfM is a classic computer vision +problem that is solved though iterative minimization of reprojection errors, +referred to as Bundle Adjustment (BA), starting from a good initialization. In +order to obtain a good enough initialization to BA, conventional methods rely +on a sequence of sub-problems (such as pairwise pose estimation, pose averaging +or triangulation) which provides an initial solution that can then be refined +using BA. In this work we replace these sub-problems by learning a model that +takes as input the 2D keypoints detected across multiple views, and outputs the +corresponding camera poses and 3D keypoint coordinates. Our model takes +advantage of graph neural networks to learn SfM-specific primitives, and we +show that it can be used for fast inference of the reconstruction for new and +unseen sequences. The experimental results show that the proposed model +outperforms competing learning-based methods, and challenges COLMAP while +having lower runtime. + +
+
+ comment: Added additional metrics +
+
+
+
+
+ + ♻ ☆ MineGAN++: Mining Generative Models for Efficient Knowledge Transfer to + Limited Data Domains + + +
+ Reusing pretrained GANs largely increases the potential impact of generative models. We therefore propose a novel knowledge transfer method for generative models based on mining the knowledge that is most beneficial to a specific target domain, either from a single or multiple pretrained GANs. This is done using a miner network that identifies which part of the generative distribution of each pretrained GAN outputs samples closest to the target domain. Mining effectively steers GAN sampling towards suitable regions of the latent space, which facilitates the subsequent finetuning and avoids pathologies of other methods, such as mode collapse and lack of flexibility. Furthermore, to prevent overfitting on small target domains, we introduce sparse subnetwork selection, which restricts the set of trainable neurons to those that are relevant for the target dataset. We perform comprehensive experiments on several challenging datasets using various GAN architectures (BigGAN, Progressive GAN, and StyleGAN) and show that the proposed method, called MineGAN, effectively transfers knowledge to domains with few target images, outperforming existing methods. In addition, MineGAN can successfully transfer knowledge from multiple pretrained GANs.
+
+
+ comment: accepted at IJCV. arXiv admin note: substantial text overlap with + arXiv:1912.05270 +
+
+
+
+
+ + ♻ ☆ Learning Physics-Inspired Regularization for Medical Image Registration + with Hypernetworks SP + + +
+ Medical image registration aims at identifying the spatial deformation +between images of the same anatomical region and is fundamental to image-based +diagnostics and therapy. To date, the majority of the deep learning-based +registration methods employ regularizers that enforce global spatial +smoothness, e.g., the diffusion regularizer. However, such regularizers are not +tailored to the data and might not be capable of reflecting the complex +underlying deformation. In contrast, physics-inspired regularizers promote +physically plausible deformations. One such regularizer is the linear elastic +regularizer which models the deformation of elastic material. These +regularizers are driven by parameters that define the material's physical +properties. For biological tissue, a wide range of estimations of such +parameters can be found in the literature and it remains an open challenge to +identify suitable parameter values for successful registration. To overcome +this problem and to incorporate physical properties into learning-based +registration, we propose to use a hypernetwork that learns the effect of the +physical parameters of a physics-inspired regularizer on the resulting spatial +deformation field. In particular, we adapt the HyperMorph framework to learn +the effect of the two elasticity parameters of the linear elastic regularizer. +Our approach enables the efficient discovery of suitable, data-specific +physical parameters at test time. + +
+
+ comment: Manuscript accepted at SPIE Medical Imaging 2024 +
+
+
+
+
+ + ♻ ☆ Tissue Segmentation of Thick-Slice Fetal Brain MR Scans with Guidance + from High-Quality Isotropic Volumes + + +
+ Accurate tissue segmentation of thick-slice fetal brain magnetic resonance +(MR) scans is crucial for both reconstruction of isotropic brain MR volumes and +the quantification of fetal brain development. However, this task is +challenging due to the use of thick-slice scans in clinically-acquired fetal +brain data. To address this issue, we propose to leverage high-quality +isotropic fetal brain MR volumes (and also their corresponding annotations) as +guidance for segmentation of thick-slice scans. Due to existence of significant +domain gap between high-quality isotropic volume (i.e., source data) and +thick-slice scans (i.e., target data), we employ a domain adaptation technique +to achieve the associated knowledge transfer (from high-quality +volumes to thick-slice scans). Specifically, we first register the +available high-quality isotropic fetal brain MR volumes across different +gestational weeks to construct longitudinally-complete source data. To capture +domain-invariant information, we then perform Fourier decomposition to extract +image content and style codes. Finally, we propose a novel Cycle-Consistent +Domain Adaptation Network (C2DA-Net) to efficiently transfer the knowledge +learned from high-quality isotropic volumes for accurate tissue segmentation of +thick-slice scans. Our C2DA-Net can fully utilize a small set of annotated +isotropic volumes to guide tissue segmentation on unannotated thick-slice +scans. Extensive experiments on a large-scale dataset of 372 clinically +acquired thick-slice MR scans demonstrate that our C2DA-Net achieves much +better performance than cutting-edge methods quantitatively and qualitatively. + +
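+ For readers unfamiliar with Fourier-based content/style splits: a common convention (used, e.g., in Fourier Domain Adaptation) treats the phase spectrum as content and the amplitude spectrum as style. The snippet below is a generic sketch of that convention, not necessarily the exact decomposition used by C2DA-Net.
+
+ import numpy as np
+
+ def fourier_content_style(image):
+     # image: 2D array (a single slice/channel)
+     spectrum = np.fft.fft2(image)
+     amplitude = np.abs(spectrum)   # "style" / appearance code
+     phase = np.angle(spectrum)     # "content" / structure code
+     return amplitude, phase
+
+ def recombine(amplitude, phase):
+     # rebuild an image from a (possibly swapped) amplitude code and a phase code
+     return np.real(np.fft.ifft2(amplitude * np.exp(1j * phase)))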
+
+ comment: 10 pages, 9 figures, 5 tables, Fetal MRI, Brain tissue segmentation, + Unsupervised domain adaptation, Cycle-consistency +
+
+
+
+
+ + ♻ ☆ Typhoon Intensity Prediction with Vision Transformer NeurIPS 2023 + + +
+ Predicting typhoon intensity accurately across space and time is crucial for +issuing timely disaster warnings and facilitating emergency response. This has +vast potential for minimizing life losses and property damages as well as +reducing economic and environmental impacts. Leveraging satellite imagery for +scenario analysis is effective but also introduces additional challenges due to +the complex relations among clouds and the highly dynamic context. Existing +deep learning methods in this domain rely on convolutional neural networks +(CNNs), which suffer from limited per-layer receptive fields. This limitation +hinders their ability to capture long-range dependencies and global contextual +knowledge during inference. In response, we introduce a novel approach, namely +"Typhoon Intensity Transformer" (Tint), which leverages self-attention +mechanisms with global receptive fields per layer. Tint adopts a +sequence-to-sequence feature representation learning perspective. It begins by +cutting a given satellite image into a sequence of patches and recursively +employs self-attention operations to extract both local and global contextual +relations between all patch pairs simultaneously, thereby enhancing per-patch +feature representation learning. Extensive experiments on a publicly available +typhoon benchmark validate the efficacy of Tint in comparison with both +state-of-the-art deep learning and conventional meteorological methods. Our +code is available at https://github.com/chen-huanxin/Tint. + +
+
+ comment: 8 pages, 2 figures, accepted by Tackling Climate Change with Machine + Learning: workshop at NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ LightGaussian: Unbounded 3D Gaussian Compression with 15x Reduction and + 200+ FPS + + +
+ Recent advancements in real-time neural rendering using point-based +techniques have paved the way for the widespread adoption of 3D +representations. However, foundational approaches like 3D Gaussian Splatting +come with a substantial storage overhead caused by growing the SfM points to +millions, often demanding gigabyte-level disk space for a single unbounded +scene, posing significant scalability challenges and hindering the splatting +efficiency. + To address this challenge, we introduce LightGaussian, a novel method +designed to transform 3D Gaussians into a more efficient and compact format. +Drawing inspiration from the concept of Network Pruning, LightGaussian +identifies Gaussians that are insignificant in contributing to the scene +reconstruction and adopts a pruning and recovery process, effectively reducing +redundancy in Gaussian counts while preserving visual effects. Additionally, +LightGaussian employs distillation and pseudo-view augmentation to distill +spherical harmonics to a lower degree, allowing knowledge transfer to more +compact representations while maintaining reflectance. Furthermore, we propose +a hybrid scheme, VecTree Quantization, to quantize all attributes, resulting in +lower bitwidth representations with minimal accuracy losses. + In summary, LightGaussian achieves an averaged compression rate over 15x +while boosting the FPS from 139 to 215, enabling an efficient representation of +complex scenes on Mip-NeRF 360, Tank and Temple datasets. + Project website: https://lightgaussian.github.io/ + +
+
+ comment: 16pages, 8figures +
+
+
+
+
+ + ♻ ☆ 3D Face Reconstruction with the Geometric Guidance of Facial Part + Segmentation + + +
+ 3D Morphable Models (3DMMs) provide promising 3D face reconstructions in +various applications. However, existing methods struggle to reconstruct faces +with extreme expressions due to deficiencies in supervisory signals, such as +sparse or inaccurate landmarks. Segmentation information contains effective +geometric contexts for face reconstruction. Certain attempts intuitively depend +on differentiable renderers to compare the rendered silhouettes of +reconstruction with segmentation, which is prone to issues like local optima +and gradient instability. In this paper, we fully utilize the facial part +segmentation geometry by introducing Part Re-projection Distance Loss (PRDL). +Specifically, PRDL transforms facial part segmentation into 2D points and +re-projects the reconstruction onto the image plane. Subsequently, by +introducing grid anchors and computing different statistical distances from +these anchors to the point sets, PRDL establishes geometry descriptors to +optimize the distribution of the point sets for face reconstruction. PRDL +exhibits a clear gradient compared to the renderer-based methods and presents +state-of-the-art reconstruction performance in extensive quantitative and +qualitative experiments. The project will be publicly available. + +
+
+
+
+
+ + ♻ ☆ Binary Radiance Fields NeurIPS 2023 + + +
+ In this paper, we propose \textit{binary radiance fields} (BiRF), a +storage-efficient radiance field representation employing binary feature +encoding that encodes local features using binary encoding parameters in a +format of either $+1$ or $-1$. This binarization strategy lets us represent the +feature grid with highly compact feature encoding and a dramatic reduction in +storage size. Furthermore, our 2D-3D hybrid feature grid design enhances the +compactness of feature encoding as the 3D grid includes main components while +2D grids capture details. In our experiments, binary radiance field +representation successfully outperforms the reconstruction performance of +state-of-the-art (SOTA) efficient radiance field models with lower storage +allocation. In particular, our model achieves impressive results in static +scene reconstruction, with a PSNR of 32.03 dB for Synthetic-NeRF scenes, 34.48 +dB for Synthetic-NSVF scenes, 28.20 dB for Tanks and Temples scenes while only +utilizing 0.5 MB of storage space, respectively. We hope the proposed binary +radiance field representation will make radiance fields more accessible without +a storage bottleneck. + +
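+ Training parameters constrained to +1/-1 typically relies on a straight-through estimator. The PyTorch sketch below illustrates that generic trick only; BiRF's 2D-3D hybrid grid design is not reproduced here.
+
+ import torch
+
+ class BinarizeSTE(torch.autograd.Function):
+     @staticmethod
+     def forward(ctx, x):
+         ctx.save_for_backward(x)
+         return torch.where(x >= 0, torch.ones_like(x), -torch.ones_like(x))
+
+     @staticmethod
+     def backward(ctx, grad_output):
+         (x,) = ctx.saved_tensors
+         # clipped straight-through gradient: pass gradients only where |x| <= 1
+         return grad_output * (x.abs() <= 1).to(grad_output.dtype)
+
+ # real-valued parameters are kept for optimization, but the field is queried
+ # (and stored) with their 1-bit, +1/-1 versions
+ grid = torch.nn.Parameter(torch.randn(64, 64, 16))
+ binary_grid = BinarizeSTE.apply(grid)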
+
+ comment: Accepted to NeurIPS 2023. Project page: + https://seungjooshin.github.io/BiRF +
+
+
+
+
+ + ♻ ☆ Learning Triangular Distribution in Visual World + + +
+ Convolutional neural networks are successful in pervasive vision tasks, including label distribution learning, which usually takes the form of learning an injection from the non-linear visual features to the well-defined labels. However, how the discrepancy between features is mapped to the label discrepancy is ambiguous, and its correctness is not guaranteed. To address these problems, we study the mathematical connection between a feature and its label, presenting a general and simple framework for label distribution learning. We propose a so-called Triangular Distribution Transform (TDT) to build an injective function between feature and label, guaranteeing that any symmetric feature discrepancy linearly reflects the difference between labels. The proposed TDT can be used as a plug-in in mainstream backbone networks to address different label distribution learning tasks. Experiments on Facial Age Recognition, Illumination Chromaticity Estimation, and Aesthetics Assessment show that TDT achieves on-par or better results than prior art.
+
+
+ comment: 11 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Knowledge Diffusion for Distillation NeurIPS 2023 + + +
+ The representation gap between teacher and student is an emerging topic in +knowledge distillation (KD). To reduce the gap and improve the performance, +current methods often resort to complicated training schemes, loss functions, +and feature alignments, which are task-specific and feature-specific. In this +paper, we state that the essence of these methods is to discard the noisy +information and distill the valuable information in the feature, and propose a +novel KD method dubbed DiffKD, to explicitly denoise and match features using +diffusion models. Our approach is based on the observation that student +features typically contain more noises than teacher features due to the smaller +capacity of student model. To address this, we propose to denoise student +features using a diffusion model trained by teacher features. This allows us to +perform better distillation between the refined clean feature and teacher +feature. Additionally, we introduce a light-weight diffusion model with a +linear autoencoder to reduce the computation cost and an adaptive noise +matching module to improve the denoising performance. Extensive experiments +demonstrate that DiffKD is effective across various types of features and +achieves state-of-the-art performance consistently on image classification, +object detection, and semantic segmentation tasks. Code is available at +https://github.com/hunto/DiffKD. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Efficient Multimodal Semantic Segmentation via Dual-Prompt Learning + + +
+ Multimodal (e.g., RGB-Depth/RGB-Thermal) fusion has shown great potential for improving semantic segmentation in complex scenes (e.g., indoor/low-light conditions). Existing approaches often fully fine-tune a dual-branch encoder-decoder framework with a complicated feature fusion strategy for achieving multimodal semantic segmentation, which is training-costly due to the massive parameter updates in feature extraction and fusion. To address this issue, we propose a surprisingly simple yet effective dual-prompt learning network (dubbed DPLNet) for training-efficient multimodal (e.g., RGB-D/T) semantic segmentation. The core of DPLNet is to directly adapt a frozen pre-trained RGB model to multimodal semantic segmentation, reducing parameter updates. For this purpose, we present two prompt learning modules, comprising a multimodal prompt generator (MPG) and a multimodal feature adapter (MFA). MPG works to fuse the features from different modalities in a compact manner and is inserted from shallow to deep stages to generate the multi-level multimodal prompts that are injected into the frozen backbone, while MFA adapts the prompted multimodal features in the frozen backbone for better multimodal semantic segmentation. Since both the MPG and MFA are lightweight, only a few trainable parameters (3.88M, 4.4% of the pre-trained backbone parameters) are introduced for multimodal feature fusion and learning. Using a simple decoder (3.27M parameters), DPLNet achieves new state-of-the-art performance or is on a par with other complex approaches on four RGB-D/T semantic segmentation datasets while satisfying parameter efficiency. Moreover, we show that DPLNet is general and applicable to other multimodal tasks such as salient object detection and video semantic segmentation. Without special design, DPLNet outperforms many complicated models. Our code will be available at github.com/ShaohuaDong2021/DPLNet.
+
+
+ comment: 11 pages, 4 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ Invariance is Key to Generalization: Examining the Role of + Representation in Sim-to-Real Transfer for Visual Navigation SP + + +
+ The data-driven approach to robot control has been gathering pace rapidly, +yet generalization to unseen task domains remains a critical challenge. We +argue that the key to generalization is representations that are (i) rich +enough to capture all task-relevant information and (ii) invariant to +superfluous variability between the training and the test domains. We +experimentally study such a representation -- containing both depth and +semantic information -- for visual navigation and show that it enables a +control policy trained entirely in simulated indoor scenes to generalize to +diverse real-world environments, both indoors and outdoors. Further, we show +that our representation reduces the A-distance between the training and test +domains, improving the generalization error bound as a result. Our proposed +approach is scalable: the learned policy improves continuously, as the +foundation models that it exploits absorb more diverse data during +pre-training. + +
+
+ comment: 11 pages, accepted by the 18th International Symposium on + Experimental Robotics (ISER 2023) and published within the Springer + Proceedings in Advanced Robotics (SPAR) +
+
+
+
+
+ + ♻ ☆ Open-Set Object Detection Using Classification-free Object Proposal and + Instance-level Contrastive Learning + + +
+ Detecting both known and unknown objects is a fundamental skill for robot +manipulation in unstructured environments. Open-set object detection (OSOD) is +a promising direction to handle the problem consisting of two subtasks: objects +and background separation, and open-set object classification. In this paper, +we present Openset RCNN to address the challenging OSOD. To disambiguate +unknown objects and background in the first subtask, we propose to use +classification-free region proposal network (CF-RPN) which estimates the +objectness score of each region purely using cues from object's location and +shape preventing overfitting to the training categories. To identify unknown +objects in the second subtask, we propose to represent them using the +complementary region of known categories in a latent space which is +accomplished by a prototype learning network (PLN). PLN performs instance-level +contrastive learning to encode proposals to a latent space and builds a compact +region centering with a prototype for each known category. Further, we note +that the detection performance of unknown objects can not be unbiasedly +evaluated on the situation that commonly used object detection datasets are not +fully annotated. Thus, a new benchmark is introduced by reorganizing +GraspNet-1billion, a robotic grasp pose detection dataset with complete +annotation. Extensive experiments demonstrate the merits of our method. We +finally show that our Openset RCNN can endow the robot with an open-set +perception ability to support robotic rearrangement tasks in cluttered +environments. More details can be found in +https://sites.google.com/view/openset-rcnn/ + +
+
+ comment: IEEE Robotics and Automation Letters +
+
+
+
+
+ + ♻ ☆ What User Behaviors Make the Differences During the Process of Visual + Analytics? + + +
+ The understanding of visual analytics process can benefit visualization +researchers from multiple aspects, including improving visual designs and +developing advanced interaction functions. However, the log files of user +behaviors are still hard to analyze due to the complexity of sensemaking and +our lack of knowledge on the related user behaviors. This work presents a study +on a comprehensive data collection of user behaviors, and our analysis approach +with time-series classification methods. We have chosen a classical +visualization application, Covid-19 data analysis, with common analysis tasks +covering geo-spatial, time-series and multi-attributes. Our user study collects +user behaviors on a diverse set of visualization tasks with two comparable +systems, desktop and immersive visualizations. We summarize the classification +results with three time-series machine learning algorithms at two scales, and +explore the influences of behavior features. Our results reveal that user +behaviors can be distinguished during the process of visual analytics and there +is a potentially strong association between the physical behaviors of users and +the visualization tasks they perform. We also demonstrate the usage of our +models by interpreting open sessions of visual analytics, which provides an +automatic way to study sensemaking without tedious manual annotations. + +
+
+ comment: This version corrects the issues of previous versions +
+
+
+
+
+ + ♻ ☆ Deep Unlearning: Fast and Efficient Training-free Approach to Controlled + Forgetting + + +
+ Machine unlearning has emerged as a prominent and challenging area of +interest, driven in large part by the rising regulatory demands for industries +to delete user data upon request and the heightened awareness of privacy. +Existing approaches either retrain models from scratch or use several +finetuning steps for every deletion request, often constrained by computational +resource limitations and restricted access to the original training data. In +this work, we introduce a novel class unlearning algorithm designed to +strategically eliminate an entire class or a group of classes from the learned +model. To that end, our algorithm first estimates the Retain Space and the +Forget Space, representing the feature or activation spaces for samples from +classes to be retained and unlearned, respectively. To obtain these spaces, we +propose a novel singular value decomposition-based technique that requires +layer wise collection of network activations from a few forward passes through +the network. We then compute the shared information between these spaces and +remove it from the forget space to isolate class-discriminatory feature space +for unlearning. Finally, we project the model weights in the orthogonal +direction of the class-discriminatory space to obtain the unlearned model. We +demonstrate our algorithm's efficacy on ImageNet using a Vision Transformer +with only $\sim$1.5% drop in retain accuracy compared to the original model +while maintaining under 1% accuracy on the unlearned class samples. Further, +our algorithm consistently performs well when subject to Membership Inference +Attacks showing 7.8% improvement on average across a variety of image +classification datasets and network architectures, as compared to other +baselines while being $\sim$6x more computationally efficient. + +
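+ To make the projection idea concrete, here is a heavily simplified sketch (PyTorch; the paper's per-layer procedure, importance scaling, and shared-information removal are more elaborate): estimate retain/forget activation subspaces with SVD, keep only the part of the forget space not shared with the retain space, and project the layer weights away from it.
+
+ import torch
+
+ def class_discriminatory_basis(acts_retain, acts_forget, k=32):
+     # acts_*: (num_samples, feature_dim) activations from a few forward passes
+     U_r = torch.linalg.svd(acts_retain.T, full_matrices=False).U[:, :k]  # Retain Space
+     U_f = torch.linalg.svd(acts_forget.T, full_matrices=False).U[:, :k]  # Forget Space
+     shared = U_r @ (U_r.T @ U_f)          # part of the forget space shared with the retain space
+     residual = U_f - shared               # class-discriminatory directions
+     return torch.linalg.svd(residual, full_matrices=False).U[:, :k]
+
+ def unlearn_layer(weight, basis):
+     # project a linear layer's weights orthogonally to the class-discriminatory space
+     return weight - weight @ basis @ basis.T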
+
+
+
+
+ + ♻ ☆ Efficient and Effective Deep Multi-view Subspace Clustering + + +
+ Recent multi-view subspace clustering achieves impressive results utilizing +deep networks, where the self-expressive correlation is typically modeled by a +fully connected (FC) layer. However, they still suffer from two limitations. i) +The parameter scale of the FC layer is quadratic to sample numbers, resulting +in high time and memory costs that significantly degrade their feasibility in +large-scale datasets. ii) It is under-explored to extract a unified +representation that simultaneously satisfies minimal sufficiency and +discriminability. To this end, we propose a novel deep framework, termed +Efficient and Effective deep Multi-View Subspace Clustering (E$^2$MVSC). +Instead of a parameterized FC layer, we design a Relation-Metric Net that +decouples network parameter scale from sample numbers for greater computational +efficiency. Most importantly, the proposed method devises a multi-type +auto-encoder to explicitly decouple consistent, complementary, and superfluous +information from every view, which is supervised by a soft clustering +assignment similarity constraint. Following information bottleneck theory and +the maximal coding rate reduction principle, a sufficient yet minimal unified +representation can be obtained, as well as pursuing intra-cluster aggregation +and inter-cluster separability within it. Extensive experiments show that +E$^2$MVSC yields comparable results to existing methods and achieves +state-of-the-art performance in various types of multi-view datasets. + +
+
+
+
+
+ + ♻ ☆ Quantitative Analysis of Primary Attribution Explainable Artificial + Intelligence Methods for Remote Sensing Image Classification + + +
+ We present a comprehensive analysis of quantitatively evaluating explainable +artificial intelligence (XAI) techniques for remote sensing image +classification. Our approach leverages state-of-the-art machine learning +approaches to perform remote sensing image classification across multiple +modalities. We investigate the results of the models qualitatively through XAI +methods. Additionally, we compare the XAI methods quantitatively through +various categories of desired properties. Through our analysis, we offer +insights and recommendations for selecting the most appropriate XAI method(s) +to gain a deeper understanding of the models' decision-making processes. The +code for this work is publicly available. + +
+
+ comment: 4 pages, 3 figures, Accepted to 2023 IGARSS Community-Contributed + Sessions - Opening the Black Box: Explainable AI/ML in Remote Sensing + Analysis +
+
+
+
+
+
+
+
+ + Information Retrieval 11 + +
+
+
+ + ☆ Virtual Quantum Markov Chains + + +
+ Quantum Markov chains generalize classical Markov chains for random variables +to the quantum realm and exhibit unique inherent properties, making them an +important feature in quantum information theory. In this work, we propose the +concept of virtual quantum Markov chains (VQMCs), focusing on scenarios where +subsystems retain classical information about global systems from measurement +statistics. As a generalization of quantum Markov chains, VQMCs characterize +states where arbitrary global shadow information can be recovered from +subsystems through local quantum operations and measurements. We present an +algebraic characterization for virtual quantum Markov chains and show that the +virtual quantum recovery is fully determined by the block matrices of a quantum +state on its subsystems. Notably, we find a distinction between two classes of +tripartite entanglement by showing that the W state is a VQMC while the GHZ +state is not. Furthermore, we establish semidefinite programs to determine the +optimal sampling overhead and the robustness of virtual quantum Markov chains. +We demonstrate the optimal sampling overhead is additive, indicating no free +lunch to further reduce the sampling cost of recovery from parallel calls of +the VQMC states. Our findings elucidate distinctions between quantum Markov +chains and virtual quantum Markov chains, extending our understanding of +quantum recovery to scenarios prioritizing classical information from +measurement statistics. + +
+
+ comment: 19 pages including appendix +
+
+
+
+
+ + ☆ Language-only Efficient Training of Zero-shot Composed Image Retrieval + + +
+ Composed image retrieval (CIR) task takes a composed query of image and text, +aiming to search relative images for both conditions. Conventional CIR +approaches need a training dataset composed of triplets of query image, query +text, and target image, which is very expensive to collect. Several recent +works have worked on the zero-shot (ZS) CIR paradigm to tackle the issue +without using pre-collected triplets. However, the existing ZS-CIR methods show +limited backbone scalability and generalizability due to the lack of diversity +of the input texts during training. We propose a novel CIR framework, only +using language for its training. Our LinCIR (Language-only training for CIR) +can be trained only with text datasets by a novel self-supervision named +self-masking projection (SMP). We project the text latent embedding to the +token embedding space and construct a new text by replacing the keyword tokens +of the original text. Then, we let the new and original texts have the same +latent embedding vector. With this simple strategy, LinCIR is surprisingly +efficient and highly effective; LinCIR with CLIP ViT-G backbone is trained in +48 minutes and shows the best ZS-CIR performances on four different CIR +benchmarks, CIRCO, GeneCIS, FashionIQ, and CIRR, even outperforming supervised +method on FashionIQ. Code is available at https://github.com/navervision/lincir + +
+
+ comment: First two authors contributed equally; 16 pages, 2.9MB +
+
+
+
+
+ + ☆ PEACE: Prototype lEarning Augmented transferable framework for + Cross-domain rEcommendation WSDM 2024 + + +
+ To help merchants and customers provide and access a variety of services through miniapps, online service platforms have come to occupy a critical position in effective content delivery, making it increasingly urgent to recommend items to customers in new domains launched by the service provider. However, the non-negligible gap between the source and diversified target domains poses a considerable challenge to cross-domain recommendation systems, which often leads to performance bottlenecks in industrial settings. While entity graphs have the potential to serve as a bridge between domains, rudimentary utilization still fails to distill useful knowledge and can even induce negative transfer. To this end, we propose PEACE, a Prototype lEarning Augmented transferable framework for Cross-domain rEcommendation. For domain gap bridging, PEACE is built upon a multi-interest and entity-oriented pre-training architecture which could not only benefit the learning of generalized knowledge in a multi-granularity manner, but also help leverage more structural information in the entity graph. Then, we bring prototype learning into the pre-training over source domains, so that representations of users and items are greatly improved by the contrastive prototype learning module and the prototype enhanced attention mechanism for adaptive knowledge utilization. To ease the pressure of online serving, PEACE is carefully deployed in a lightweight manner, and significant performance improvements are observed in both online and offline environments.
+
+
+ comment: Accepted by WSDM 2024 +
+
+
+
+
+ + ☆ An AI-based solution for the cold start and data sparsity problems in + the recommendation systems + + +
+ In recent years, the amount of data available on the Internet and the number of users who access it have grown at an unparalleled pace. This exponential growth in accessible digital information has created the possibility of information overload, impeding fast access to items of interest. Information retrieval systems such as Google, DevilFinder, and Altavista have partly overcome this challenge, but prioritization and personalization of information (where a system maps accessible material to a user's interests and preferences) were lacking. This has resulted in a higher-than-ever need for recommender systems. Recommender systems are information filtering systems that address the issue of information overload by filtering important information fragments from a huge volume of dynamically produced data based on the user's interests, preferences, and ratings of desired items. Recommender systems can predict whether a user would like an item based on the user's profile.
+
+
+
+
+
+ + ☆ On Gradient Boosted Decision Trees and Neural Rankers: A Case-Study on + Short-Video Recommendations at ShareChat + + +
+ Practitioners who wish to build real-world applications that rely on ranking models need to decide which modelling paradigm to follow. This is not an easy choice to make, as the research literature on this topic has been shifting in recent years. In particular, whilst Gradient Boosted Decision Trees (GBDTs) have reigned supreme for more than a decade, the flexibility of neural networks has allowed them to catch up, and recent works report accuracy metrics that are on par. Nevertheless, practical systems require considerations beyond mere accuracy metrics to decide on a modelling approach.
+ This work describes our experiences in balancing some of the trade-offs that arise, presenting a case study on a short-video recommendation application. We highlight that (1) neural networks' ability to handle large training data sizes and user- and item-embeddings allows for more accurate models than GBDTs in this setting, and (2) because GBDTs are less reliant on specialised hardware, they can provide an equally accurate model at a lower cost. We believe these findings are of relevance to researchers in both academia and industry, and hope they can inspire practitioners who need to make similar modelling choices in the future.
+
+
+ comment: Appearing in the Industry Track Proceedings of the Forum for + Information Retrieval Evaluation (FIRE '23) +
+
+
+
+
+ + ☆ The Contemporary Art of Image Search: Iterative User Intent Expansion + via Vision-Language Model SC + + +
+ Image search is an essential and user-friendly method to explore vast galleries of digital images. However, existing image search methods heavily rely on proximity measurements like tag matching or image similarity, requiring precise user inputs for satisfactory results. To meet the growing demand for a contemporary image search engine that enables accurate comprehension of users' search intentions, we introduce an innovative user intent expansion framework. Our framework leverages visual-language models to parse and compose multi-modal user inputs to provide more accurate and satisfying results. It comprises a two-stage process: 1) a parsing stage that incorporates a language parsing module with large language models to enhance the comprehension of textual inputs, along with a visual parsing module that integrates an interactive segmentation module to swiftly identify detailed visual elements within images; and 2) a logic composition stage that combines multiple user search intents into a unified logic expression for more sophisticated operations in complex searching scenarios. Moreover, the intent expansion framework enables users to perform flexible contextualized interactions with the search results to further specify or adjust their detailed search intents iteratively. We implemented the framework into an image search system for NFT (non-fungible token) search and conducted a user study to evaluate its usability and novel properties. The results indicate that the proposed framework significantly improves users' image search experience. In particular, the parsing and contextualized interactions prove useful in allowing users to express their search intents more accurately and engage in a more enjoyable iterative search experience.
+
+
+ comment: Accepted by The 2024 ACM SIGCHI Conference on Computer-Supported + Cooperative Work & Social Computing (CSCW) (Proc. CSCW 2024) +
+
+
+
+
+ + ☆ Searching Dense Representations with Inverted Indexes + + +
+ Nearly all implementations of top-$k$ retrieval with dense vector +representations today take advantage of hierarchical navigable small-world +network (HNSW) indexes. However, the generation of vector representations and +efficiently searching large collections of vectors are distinct challenges that +can be decoupled. In this work, we explore the contrarian approach of +performing top-$k$ retrieval on dense vector representations using inverted +indexes. We present experiments on the MS MARCO passage ranking dataset, +evaluating three dimensions of interest: output quality, speed, and index size. +Results show that searching dense representations using inverted indexes is +possible. Our approach exhibits reasonable effectiveness with compact indexes, +but is impractically slow. Thus, while workable, our solution does not provide +a compelling tradeoff and is perhaps best characterized today as a "technical +curiosity". + +
+
+
+
+
+ + ♻ ☆ Towards Open-World Recommendation with Knowledge Augmentation from Large + Language Models + + +
+ Recommender systems play a vital role in various online services. However, +the insulated nature of training and deploying separately within a specific +domain limits their access to open-world knowledge. Recently, the emergence of +large language models (LLMs) has shown promise in bridging this gap by encoding +extensive world knowledge and demonstrating reasoning capability. Nevertheless, +previous attempts to directly use LLMs as recommenders have not achieved +satisfactory results. In this work, we propose an Open-World Knowledge +Augmented Recommendation Framework with Large Language Models, dubbed KAR, to +acquire two types of external knowledge from LLMs -- the reasoning knowledge on +user preferences and the factual knowledge on items. We introduce factorization +prompting to elicit accurate reasoning on user preferences. The generated +reasoning and factual knowledge are effectively transformed and condensed into +augmented vectors by a hybrid-expert adaptor in order to be compatible with the +recommendation task. The obtained vectors can then be directly used to enhance +the performance of any recommendation model. We also ensure efficient inference +by preprocessing and prestoring the knowledge from the LLM. Extensive +experiments show that KAR significantly outperforms the state-of-the-art +baselines and is compatible with a wide range of recommendation algorithms. We +deploy KAR to Huawei's news and music recommendation platforms and gain a 7\% +and 1.7\% improvement in the online A/B test, respectively. + +
+
+
+
+
+ + ♻ ☆ Event-driven Real-time Retrieval in Web Search + + +
+ Information retrieval in real-time search presents unique challenges distinct +from those encountered in classical web search. These challenges are +particularly pronounced due to the rapid change of user search intent, which is +influenced by the occurrence and evolution of breaking news events, such as +earthquakes, elections, and wars. Previous dense retrieval methods, which +primarily focused on static semantic representation, lack the capacity to +capture immediate search intent, leading to inferior performance in retrieving +the most recent event-related documents in time-sensitive scenarios. To address +this issue, this paper expands the query with event information that represents +real-time search intent. The Event information is then integrated with the +query through a cross-attention mechanism, resulting in a time-context query +representation. We further enhance the model's capacity for event +representation through multi-task training. Since publicly available datasets +such as MS-MARCO do not contain any event information on the query side and +have few time-sensitive queries, we design an automatic data collection and +annotation pipeline to address this issue, which includes ModelZoo-based Coarse +Annotation and LLM-driven Fine Annotation processes. In addition, we share the +training tricks such as two-stage training and hard negative sampling. Finally, +we conduct a set of offline experiments on a million-scale production dataset +to evaluate our approach and deploy an A/B testing in a real online system to +verify the performance. Extensive experimental results demonstrate that our +proposed approach significantly outperforms existing state-of-the-art baseline +methods. + +
+
+
+
+
+ + ♻ ☆ A Matrix Decomposition Model Based on Feature Factors in Movie + Recommendation System + + +
+ Currently, matrix decomposition is one of the most widely used collaborative filtering algorithms; it uses factor decomposition to deal effectively with large-scale rating matrices. It mainly uses the interaction records between users and items to predict ratings. Based on the characteristic attributes of items and users, this paper proposes a new UISVD++ model that fuses the type attributes of movies and the age attributes of users into the SVD++ framework. By projecting the age attribute into the user's implicit space and the type attribute into the item's implicit space, the model enriches the side information of the users and items. Finally, we conduct comparative experiments on two public data sets, Movielens-100K and Movielens-1M. Experimental results show that the prediction accuracy of this model is better than that of other baselines in the rating prediction task. In addition, these results also show that UISVD++ can effectively alleviate the cold start problem.
+
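+ The abstract describes an SVD++-style model with extra side-information embeddings. A schematic prediction rule in that spirit (symbol names are illustrative, not the paper's notation) would be:
+
+ import numpy as np
+
+ def predict_rating(mu, b_u, b_i, p_u, q_i, implicit_sum, age_emb, type_emb):
+     # mu: global mean rating; b_u, b_i: user/item biases
+     # p_u, q_i: user/item latent factors; implicit_sum: normalized sum of the factors
+     # of items the user interacted with (the SVD++ term)
+     # age_emb / type_emb: user-age and movie-type embeddings in the same latent space
+     user_vec = p_u + implicit_sum + age_emb
+     item_vec = q_i + type_emb
+     return mu + b_u + b_i + float(np.dot(user_vec, item_vec))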
+
+ comment: 11 pages, 8 figures, 9 tables +
+
+
+
+
+ + ♻ ☆ Adaptive Latent Entity Expansion for Document Retrieval + + +
+ Despite considerable progress in neural relevance ranking techniques, search engines still struggle to process complex queries effectively - both in terms of precision and recall. Sparse and dense Pseudo-Relevance Feedback (PRF) approaches have the potential to overcome limitations in recall, but are only effective with high precision in the top ranks. In this work, we tackle the problem of search over complex queries using three complementary techniques. First, we demonstrate that applying a strong neural re-ranker before sparse or dense PRF can improve the retrieval effectiveness by 5-8%. This improvement in PRF effectiveness can be attributed directly to improving the precision of the feedback set. Second, we propose an enhanced expansion model, Latent Entity Expansion (LEE), which applies fine-grained word and entity-based relevance modelling incorporating localized features. Specifically, we find that including both words and entities for expansion achieves a further 2-8% improvement in NDCG. Our analysis also demonstrates that LEE is largely robust to its parameters across datasets and performs well on entity-centric queries. Third, we include an 'adaptive' component in the retrieval process, which iteratively refines the re-ranking pool during scoring using the expansion model and avoids re-ranking additional documents. We find that this combination of techniques achieves the best NDCG, MAP and R@1000 results on the TREC Robust 2004 and CODEC document datasets, demonstrating a significant advancement in expansion effectiveness.
+
+
+
+
+
+
+
+
+ + Machine Learning 137 + +
+
+
+ + ☆ Guarding Barlow Twins Against Overfitting with Mixed Samples + + +
+ Self-supervised Learning (SSL) aims to learn transferable feature +representations for downstream applications without relying on labeled data. +The Barlow Twins algorithm, renowned for its widespread adoption and +straightforward implementation compared to its counterparts like contrastive +learning methods, minimizes feature redundancy while maximizing invariance to +common corruptions. Optimizing for the above objective forces the network to +learn useful representations, while avoiding noisy or constant features, +resulting in improved downstream task performance with limited adaptation. +Despite Barlow Twins' proven effectiveness in pre-training, the underlying SSL +objective can inadvertently cause feature overfitting due to the lack of strong +interaction between the samples unlike the contrastive learning approaches. +From our experiments, we observe that optimizing for the Barlow Twins objective +doesn't necessarily guarantee sustained improvements in representation quality +beyond a certain pre-training phase, and can potentially degrade downstream +performance on some datasets. To address this challenge, we introduce Mixed +Barlow Twins, which aims to improve sample interaction during Barlow Twins +training via linearly interpolated samples. This results in an additional +regularization term to the original Barlow Twins objective, assuming linear +interpolation in the input space translates to linearly interpolated features +in the feature space. Pre-training with this regularization effectively +mitigates feature overfitting and further enhances the downstream performance +on CIFAR-10, CIFAR-100, TinyImageNet, STL-10, and ImageNet datasets. The code +and checkpoints are available at: https://github.com/wgcban/mix-bt.git + +
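+ For orientation, the sketch below (PyTorch) shows the standard Barlow Twins cross-correlation loss plus one plausible mixup-based regularizer in the spirit described above (mixed inputs with an interpolated cross-correlation target, relying on the abstract's linearity assumption). It is a hedged illustration; the authors' exact regularizer is in the linked repository.
+
+ import torch
+
+ def cross_corr(z1, z2):
+     z1 = (z1 - z1.mean(0)) / (z1.std(0) + 1e-6)
+     z2 = (z2 - z2.mean(0)) / (z2.std(0) + 1e-6)
+     return (z1.T @ z2) / z1.shape[0]
+
+ def barlow_twins_loss(z1, z2, lambd=5e-3):
+     c = cross_corr(z1, z2)
+     on_diag = (torch.diagonal(c) - 1).pow(2).sum()
+     off_diag = (c - torch.diag(torch.diagonal(c))).pow(2).sum()
+     return on_diag + lambd * off_diag
+
+ def mixed_samples_regularizer(encoder, x1, x2, z1, z2, alpha=1.0):
+     # interpolate the two augmented views and ask the cross-correlation of the
+     # mixed embedding to match the interpolated target
+     lam = torch.distributions.Beta(alpha, alpha).sample().item()
+     z_m = encoder(lam * x1 + (1.0 - lam) * x2)
+     target = lam * cross_corr(z1, z1) + (1.0 - lam) * cross_corr(z2, z1)
+     return (cross_corr(z_m, z1) - target).pow(2).sum()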
+
+ comment: Code and checkpoints are available at: + https://github.com/wgcban/mix-bt.git +
+
+
+
+
+ + ☆ Learning Polynomial Problems with $SL(2,\mathbb{R})$ Equivariance + + +
+ Optimizing and certifying the positivity of polynomials are fundamental +primitives across mathematics and engineering applications, from dynamical +systems to operations research. However, solving these problems in practice +requires large semidefinite programs, with poor scaling in dimension and +degree. In this work, we demonstrate for the first time that neural networks +can effectively solve such problems in a data-driven fashion, achieving tenfold +speedups while retaining high accuracy. Moreover, we observe that these +polynomial learning problems are equivariant to the non-compact group +$SL(2,\mathbb{R})$, which consists of area-preserving linear transformations. +We therefore adapt our learning pipelines to accommodate this structure, +including data augmentation, a new $SL(2,\mathbb{R})$-equivariant architecture, +and an architecture equivariant with respect to its maximal compact subgroup, +$SO(2, \mathbb{R})$. Surprisingly, the most successful approaches in practice +do not enforce equivariance to the entire group, which we prove arises from an +unusual lack of architecture universality for $SL(2,\mathbb{R})$ in particular. +A consequence of this result, which is of independent interest, is that there +exists an equivariant function for which there is no sequence of equivariant +polynomials multiplied by arbitrary invariants that approximates the original +function. This is a rare example of a symmetric problem where data augmentation +outperforms a fully equivariant architecture, and provides interesting lessons +in both theory and practice for other problems with non-compact symmetries. + +
+
+
+
+
+ + ☆ DiffiT: Diffusion Vision Transformers for Image Generation + + +
+ Diffusion models with their powerful expressivity and high sample quality +have enabled many new applications and use-cases in various domains. For sample +generation, these models rely on a denoising neural network that generates +images by iterative denoising. Yet, the role of denoising network architecture +is not well-studied with most efforts relying on convolutional residual U-Nets. +In this paper, we study the effectiveness of vision transformers in +diffusion-based generative learning. Specifically, we propose a new model, +denoted as Diffusion Vision Transformers (DiffiT), which consists of a hybrid +hierarchical architecture with a U-shaped encoder and decoder. We introduce a +novel time-dependent self-attention module that allows attention layers to +adapt their behavior at different stages of the denoising process in an +efficient manner. We also introduce latent DiffiT which consists of transformer +model with the proposed self-attention layers, for high-resolution image +generation. Our results show that DiffiT is surprisingly effective in +generating high-fidelity images, and it achieves state-of-the-art (SOTA) +benchmarks on a variety of class-conditional and unconditional synthesis tasks. +In the latent space, DiffiT achieves a new SOTA FID score of 1.73 on +ImageNet-256 dataset. Repository: https://github.com/NVlabs/DiffiT + +
+
+ comment: Tech report +
+
+
+
+
+ + ☆ Style Aligned Image Generation via Shared Attention + + +
+ Large-scale Text-to-Image (T2I) models have rapidly gained prominence across +creative fields, generating visually compelling outputs from textual prompts. +However, controlling these models to ensure consistent style remains +challenging, with existing methods necessitating fine-tuning and manual +intervention to disentangle content and style. In this paper, we introduce +StyleAligned, a novel technique designed to establish style alignment among a +series of generated images. By employing minimal `attention sharing' during the +diffusion process, our method maintains style consistency across images within +T2I models. This approach allows for the creation of style-consistent images +using a reference style through a straightforward inversion operation. Our +method's evaluation across diverse styles and text prompts demonstrates +high-quality synthesis and fidelity, underscoring its efficacy in achieving +consistent style across various inputs. + +
+
+ comment: Project page at style-aligned-gen.github.io +
+
+
+
+
+ + ☆ Hot PATE: Private Aggregation of Distributions for Diverse Task + + +
+ The Private Aggregation of Teacher Ensembles (PATE) +framework~\cite{PapernotAEGT:ICLR2017} is a versatile approach to +privacy-preserving machine learning. In PATE, teacher models are trained on +distinct portions of sensitive data, and their predictions are privately +aggregated to label new training examples for a student model. + Until now, PATE has primarily been explored with classification-like tasks, +where each example possesses a ground-truth label, and knowledge is transferred +to the student by labeling public examples. Generative AI models, however, +excel in open ended \emph{diverse} tasks with multiple valid responses and +scenarios that may not align with traditional labeled examples. Furthermore, +the knowledge of models is often encapsulated in the response distribution +itself and may be transferred from teachers to student in a more fluid way. We +propose \emph{hot PATE}, tailored for the diverse setting. In hot PATE, each +teacher model produces a response distribution and the aggregation method must +preserve both privacy and diversity of responses. We demonstrate, analytically +and empirically, that hot PATE achieves privacy-utility tradeoffs that are +comparable to, and in diverse settings, significantly surpass, the baseline +``cold'' PATE. + +
+
+ comment: 19 pages +
+
+
+
+
+ + ☆ TPPoet: Transformer-Based Persian Poem Generation using Minimal Data and + Advanced Decoding Techniques + + +
+ Recent advances in language models (LMs) have demonstrated significant efficacy in tasks related to the arts and humanities. While LMs have exhibited exceptional performance across a wide range of natural language processing tasks, there are notable challenges associated with their utilization on small datasets and their ability to replicate more creative human capacities. In this study, we aim to address these challenges by training a Persian classical poetry generation model using a transformer architecture on a specialized dataset with no pretraining. Additionally, we propose a novel decoding method to enhance coherence and meaningfulness in the generated poetry, effectively managing the tradeoff between diversity and quality. Furthermore, the results of our training approach and the proposed decoding method are evaluated through a comprehensive set of automatic and human evaluations, which show a superior capability to generate coherent and meaningful poetry compared to other decoding methods and an existing Persian large language model (LLM).
+
+
+
+
+
+ + ☆ VerA: Versatile Anonymization Fit for Clinical Facial Images + + +
+ The escalating legislative demand for data privacy in facial image +dissemination has underscored the significance of image anonymization. Recent +advancements in the field surpass traditional pixelation or blur methods, yet +they predominantly address regular single images. This leaves clinical image +anonymization -- a necessity for illustrating medical interventions -- largely +unaddressed. We present VerA, a versatile facial image anonymization that is +fit for clinical facial images where: (1) certain semantic areas must be +preserved to show medical intervention results, and (2) anonymizing image pairs +is crucial for showing before-and-after results. VerA outperforms or is on par +with state-of-the-art methods in de-identification and photorealism for regular +images. In addition, we validate our results on paired anonymization, and on +the anonymization of both single and paired clinical images with extensive +quantitative and qualitative evaluation. + +
+
+
+
+
+ + ☆ Tree of Attacks: Jailbreaking Black-Box LLMs Automatically + + +
+ While Large Language Models (LLMs) display versatile functionality, they +continue to generate harmful, biased, and toxic content, as demonstrated by the +prevalence of human-designed jailbreaks. In this work, we present Tree of +Attacks with Pruning (TAP), an automated method for generating jailbreaks that +only requires black-box access to the target LLM. TAP utilizes an LLM to +iteratively refine candidate (attack) prompts using tree-of-thoughts reasoning +until one of the generated prompts jailbreaks the target. Crucially, before +sending prompts to the target, TAP assesses them and prunes the ones unlikely +to result in jailbreaks. Using tree-of-thought reasoning allows TAP to navigate +a large search space of prompts and pruning reduces the total number of queries +sent to the target. In empirical evaluations, we observe that TAP generates +prompts that jailbreak state-of-the-art LLMs (including GPT4 and GPT4-Turbo) +for more than 80% of the prompts using only a small number of queries. This +significantly improves upon the previous state-of-the-art black-box method for +generating jailbreaks. + +
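+ The attack loop can be summarized structurally as below. All four callables (attacker, target, on-topic evaluator, scoring evaluator) stand in for LLM calls and are hypothetical placeholders; the branching factor, width, depth, and the score threshold follow the paper's description only loosely.
+
+ def tree_of_attacks(goal, attacker, target, on_topic, score,
+                     branching=4, width=10, depth=10):
+     leaves = [[]]  # each leaf keeps the (prompt, response) history of one attack branch
+     for _ in range(depth):
+         candidates = []
+         for history in leaves:
+             for _ in range(branching):                    # branch: refine the attack prompt
+                 prompt = attacker(goal, history)
+                 if not on_topic(goal, prompt):            # prune 1: drop off-topic prompts
+                     continue                              #   before querying the target
+                 response = target(prompt)                 # black-box query to the target LLM
+                 s = score(goal, prompt, response)
+                 if s >= 10:                               # evaluator deems it a jailbreak
+                     return prompt, response
+                 candidates.append((s, history + [(prompt, response)]))
+         candidates.sort(key=lambda c: c[0], reverse=True)
+         leaves = [h for _, h in candidates[:width]]       # prune 2: keep the best branches
+         if not leaves:
+             break
+     return None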
+
+ comment: An implementation of the presented method is available at + https://github.com/RICommunity/TAP +
+
+
+
+
+ + ☆ TriDeNT: Triple Deep Network Training for Privileged Knowledge + Distillation in Histopathology + + +
+ Computational pathology models rarely utilise data that will not be available +for inference. This means most models cannot learn from highly informative data +such as additional immunohistochemical (IHC) stains and spatial +transcriptomics. We present TriDeNT, a novel self-supervised method for +utilising privileged data that is not available during inference to improve +performance. We demonstrate the efficacy of this method for a range of +different paired data including immunohistochemistry, spatial transcriptomics +and expert nuclei annotations. In all settings, TriDeNT outperforms other +state-of-the-art methods in downstream tasks, with observed improvements of up +to 101%. Furthermore, we provide qualitative and quantitative measurements of +the features learned by these models and how they differ from baselines. +TriDeNT offers a novel method to distil knowledge from scarce or costly data +during training, to create significantly better models for routine inputs. + +
+
+
+
+
+ + ☆ Mitigating Data Injection Attacks on Federated Learning + + +
+ Federated learning is a technique that allows multiple entities to collaboratively train models using their data without compromising data privacy. However, despite its advantages, federated learning can be susceptible to false data injection attacks. In these scenarios, a malicious entity with control over specific agents in the network can manipulate the learning process, leading to a suboptimal model. Consequently, addressing these data injection attacks presents a significant research challenge in federated learning systems. In this paper, we propose a novel technique to detect and mitigate data injection attacks on federated learning systems. Our mitigation method is a local scheme, performed by the coordinating node during a single instance of training, allowing mitigation while the algorithm converges. Whenever an agent is suspected of being an attacker, its data are ignored for a certain period, and this decision is periodically re-evaluated. We prove that with probability 1, after a finite time, all attackers will be ignored, while the probability of ignoring a truthful agent becomes 0, provided that there is a majority of truthful agents. Simulations show that when the coordinating node detects and isolates all the attackers, the model recovers and converges to the truthful model.
+
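+ As a rough illustration of what the coordinating node could do (the paper's detection statistic and its probability-1 guarantee are specific to its scheme; the deviation test, threshold, and cool-down below are placeholders):
+
+ import torch
+
+ def aggregate_with_mitigation(updates, ignored, threshold=3.0, cooldown=10):
+     # updates: dict agent_id -> flattened model update; ignored: dict agent_id -> rounds left
+     stacked = torch.stack(list(updates.values()))
+     median = stacked.median(dim=0).values
+     scale = (stacked - median).abs().median(dim=0).values + 1e-8
+     for agent_id, upd in updates.items():
+         deviation = ((upd - median).abs() / scale).mean().item()
+         if deviation > threshold:
+             ignored[agent_id] = cooldown          # suspected attacker: ignore for a while
+     trusted = [u for a, u in updates.items() if ignored.get(a, 0) == 0]
+     for a in list(ignored):                       # the decision is re-evaluated over time
+         ignored[a] = max(0, ignored[a] - 1)
+     return torch.stack(trusted).mean(dim=0) if trusted else median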
+
+
+
+
+ + ☆ Single-sample versus case-control sampling scheme for Positive Unlabeled + data: the story of two scenarios + + +
+ In this paper we argue that the performance of classifiers based on Empirical Risk Minimization (ERM) for positive unlabeled data, which are designed for the case-control sampling scheme, may significantly deteriorate when applied to a single-sample scenario. We reveal why their behavior depends, in all but very specific cases, on the scenario. Also, we introduce a single-sample analogue of the popular non-negative risk classifier designed for case-control data and compare its performance with the original proposal. We show that significant differences occur between them, especially when half or more of the positive observations are labeled. The opposite case, in which an ERM minimizer designed for the case-control scenario is applied to single-sample data, is also considered and similar conclusions are drawn. Taking the difference between scenarios into account requires a single, but crucial, change in the definition of the empirical risk.
+
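+ For context, the case-control non-negative risk estimator that the abstract refers to (Kiryo et al., 2017) is written out below for the logistic surrogate loss; the paper's contribution is a single-sample analogue obtained by changing this definition of the empirical risk.
+
+ import torch
+ import torch.nn.functional as F
+
+ def nn_pu_risk(scores_pos, scores_unl, prior):
+     # softplus(-z) is the logistic loss for the positive class, softplus(z) for the negative
+     risk_pos = F.softplus(-scores_pos).mean()          # labeled positives treated as positive
+     risk_pos_as_neg = F.softplus(scores_pos).mean()    # labeled positives treated as negative
+     risk_unl_as_neg = F.softplus(scores_unl).mean()    # unlabeled data treated as negative
+     negative_part = risk_unl_as_neg - prior * risk_pos_as_neg
+     return prior * risk_pos + torch.clamp(negative_part, min=0.0)  # non-negative correction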
+
+
+
+
+ + ☆ Deep Set Neural Networks for forecasting asynchronous bioprocess + timeseries + + +
+ Cultivation experiments often produce sparse and irregular time series. +Classical approaches based on mechanistic models, like Maximum Likelihood +fitting or Monte-Carlo Markov chain sampling, can easily account for sparsity +and time-grid irregularities, but most statistical and Machine Learning tools +are not designed for handling sparse data out-of-the-box. Among popular +approaches there are various schemes for filling missing values (imputation) +and interpolation into a regular grid (alignment). However, such methods +transfer the biases of the interpolation or imputation models to the target +model. We show that Deep Set Neural Networks equipped with triplet encoding of +the input data can successfully handle bio-process data without any need for +imputation or alignment procedures. The method is agnostic to the particular +nature of the time series and can be adapted for any task, for example, online +monitoring, predictive control, design of experiments, etc. In this work, we +focus on forecasting. We argue that such an approach is especially suitable for +typical cultivation processes, demonstrate the performance of the method on +several forecasting tasks using data generated from macrokinetic growth models +under realistic conditions, and compare the method to a conventional fitting +procedure and methods based on imputation and alignment. + +
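+ A minimal sketch of the triplet-encoding idea described above: each observation enters as a (time, variable, value) triplet and is pooled by a permutation-invariant Deep Set encoder, so no imputation or grid alignment is needed. Layer sizes, the embedding choice, and mean pooling are illustrative assumptions, not the authors' exact architecture.

```python
import torch
import torch.nn as nn

class DeepSetForecaster(nn.Module):
    """Encode a set of (time, variable, value) triplets and predict a scalar."""

    def __init__(self, n_vars, d_hidden=64):
        super().__init__()
        self.var_embed = nn.Embedding(n_vars, d_hidden)
        self.phi = nn.Sequential(nn.Linear(2 + d_hidden, d_hidden), nn.ReLU(),
                                 nn.Linear(d_hidden, d_hidden))
        self.rho = nn.Sequential(nn.Linear(d_hidden, d_hidden), nn.ReLU(),
                                 nn.Linear(d_hidden, 1))

    def forward(self, times, var_ids, values):
        # times, values: (batch, n_obs) floats; var_ids: (batch, n_obs) long
        x = torch.cat([times.unsqueeze(-1), values.unsqueeze(-1),
                       self.var_embed(var_ids)], dim=-1)
        pooled = self.phi(x).mean(dim=1)   # permutation-invariant pooling over observations
        return self.rho(pooled)
```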
+
+ comment: 9 pages, 3 figures +
+
+
+
+
+ + ☆ Integrating AI into CCTV Systems: A Comprehensive Evaluation of Smart + Video Surveillance in Community Space + + +
+ This article presents an AI-enabled Smart Video Surveillance (SVS) designed +to enhance safety in community spaces such as educational and recreational +areas, and small businesses. The proposed system innovatively integrates with +existing CCTV and wired camera networks, simplifying its adoption across +various community cases to leverage recent AI advancements. Our SVS system, +focusing on privacy, uses metadata instead of pixel data for activity +recognition, aligning with ethical standards. It features cloud-based +infrastructure and a mobile app for real-time, privacy-conscious alerts in +communities. + This article notably pioneers a comprehensive real-world evaluation of the +SVS system, covering AI-driven visual processing, statistical analysis, +database management, cloud communication, and user notifications. It's also the +first to assess an end-to-end anomaly detection system's performance, vital for +identifying potential public safety incidents. + For our evaluation, we implemented the system in a community college, serving +as an ideal model to exemplify the proposed system's capabilities. Our findings +in this setting demonstrate the system's robustness, with throughput, latency, +and scalability effectively managing 16 CCTV cameras. The system maintained a +consistent 16.5 frames per second (FPS) over a 21-hour operation. The average +end-to-end latency for detecting behavioral anomalies and alerting users was +26.76 seconds. + +
+
+
+
+
+ + ☆ Federated Learning is Better with Non-Homomorphic Encryption + + +
+ Traditional AI methodologies necessitate centralized data collection, which +becomes impractical when facing problems with network communication, data +privacy, or storage capacity. Federated Learning (FL) offers a paradigm that +empowers distributed AI model training without collecting raw data. There are +different choices for providing privacy during FL training. One of the popular +methodologies is employing Homomorphic Encryption (HE) - a breakthrough in +privacy-preserving computation from Cryptography. However, these methods have a +price in the form of extra computation and memory footprint. To resolve these +issues, we propose an innovative framework that synergizes permutation-based +compressors with Classical Cryptography, even though employing Classical +Cryptography was assumed to be impossible in the past in the context of FL. Our +framework offers a way to replace HE with cheaper Classical Cryptography +primitives which provides security for the training process. It fosters +asynchronous communication and provides flexible deployment options in various +communication topologies. + +
+
+ comment: 56 pages, 10 figures, Accepted to presentation and proceedings to 4th + ACM International Workshop on Distributed Machine Learning +
+
+
+
+
+ + ☆ The GPU Phase Folding and Deep Learning Method for Detecting Exoplanet + Transits + + +
+ This paper presents GPFC, a novel Graphics Processing Unit (GPU) Phase +Folding and Convolutional Neural Network (CNN) system to detect exoplanets +using the transit method. We devise a fast folding algorithm parallelized on a +GPU to amplify low signal-to-noise ratio transit signals, allowing a search at +high precision and speed. A CNN trained on two million synthetic light curves +reports a score indicating the likelihood of a planetary signal at each period. +GPFC improves on speed by three orders of magnitude over the predominant +Box-fitting Least Squares (BLS) method. Our simulation results show GPFC +achieves 97% training accuracy, higher true positive rate at the same false +positive rate of detection, and higher precision at the same recall rate when +compared to BLS. GPFC recovers 100% of known ultra-short-period planets in +Kepler light curves from a blind search. These results highlight the promise of +GPFC as an alternative approach to the traditional BLS algorithm for finding +new transiting exoplanets in data taken with Kepler and other space transit +missions such as K2, TESS and future PLATO and Earth 2.0. + +
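+ The core folding step can be written in a few lines of NumPy, shown below purely as an illustration; the paper's implementation is GPU-parallel and sweeps many trial periods, and the bin count here is an arbitrary choice.

```python
import numpy as np

def phase_fold(times, flux, period, n_bins=256):
    """Fold a light curve at a trial period and bin it in phase.

    A transit appears as a dip in the binned profile; its depth relative to
    the out-of-transit scatter provides a detection statistic.
    """
    phase = (times % period) / period                       # phase in [0, 1)
    bins = np.minimum((phase * n_bins).astype(int), n_bins - 1)
    sums = np.bincount(bins, weights=flux, minlength=n_bins)
    counts = np.maximum(np.bincount(bins, minlength=n_bins), 1)
    return sums / counts

# A full search sweeps trial periods, folds at each one, and scores each profile.
```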
+
+ comment: 12 pages, 17 figures; To be published in the Monthly Notices of the + Royal Astronomical Society (MNRAS) +
+
+
+
+
+ + ☆ DUCK: Distance-based Unlearning via Centroid Kinematics + + +
+ Machine Unlearning is rising as a new field, driven by the pressing necessity +of ensuring privacy in modern artificial intelligence models. This technique +primarily aims to eradicate any residual influence of a specific subset of data +from the knowledge acquired by a neural model during its training. This work +introduces a novel unlearning algorithm, denoted as Distance-based Unlearning +via Centroid Kinematics (DUCK), which employs metric learning to guide the +removal of samples matching the nearest incorrect centroid in the embedding +space. Evaluation of the algorithm's performance is conducted across various +benchmark datasets in two distinct scenarios, class removal, and homogeneous +sampling removal, obtaining state-of-the-art performance. We introduce a novel +metric, called Adaptive Unlearning Score (AUS), encompassing not only the +efficacy of the unlearning process in forgetting target data but also +quantifying the performance loss relative to the original model. Moreover, we +propose a novel membership inference attack to assess the algorithm's capacity +to erase previously acquired knowledge, designed to be adaptable to future +methodologies. + +
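+ A hedged sketch of the central mechanism described here: forget-set embeddings are pushed toward the nearest *incorrect* class centroid in the embedding space. The squared-distance objective below is an illustrative stand-in and omits the retain-set terms a complete unlearning procedure would also need.

```python
import torch

def duck_style_forget_loss(embeddings, labels, centroids):
    """Drive forget-set embeddings toward the closest incorrect class centroid.

    embeddings: (n, d); labels: (n,) long, true classes; centroids: (C, d).
    """
    dists = torch.cdist(embeddings, centroids)               # (n, C) pairwise distances
    dists.scatter_(1, labels.unsqueeze(1), float("inf"))     # mask out the true class
    nearest_wrong = dists.argmin(dim=1)                      # nearest incorrect centroid
    target = centroids[nearest_wrong]
    return ((embeddings - target) ** 2).sum(dim=1).mean()
```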
+
+
+
+
+ + ☆ GFS: Graph-based Feature Synthesis for Prediction over Relational + Databases VLDB 2024 + + +
+ Relational databases are extensively utilized in a variety of modern +information system applications, and they always carry valuable data patterns. +There are a huge number of data mining or machine learning tasks conducted on +relational databases. However, it is worth noting that there are limited +machine learning models specifically designed for relational databases, as most +models are primarily tailored for single table settings. Consequently, the +prevalent approach for training machine learning models on data stored in +relational databases involves performing feature engineering to merge the data +from multiple tables into a single table and subsequently applying single table +models. This approach not only requires significant effort in feature +engineering but also destroys the inherent relational structure present in the +data. To address these challenges, we propose a novel framework called +Graph-based Feature Synthesis (GFS). GFS formulates the relational database as +a heterogeneous graph, thereby preserving the relational structure within the +data. By leveraging the inductive bias from single table models, GFS +effectively captures the intricate relationships inherent in each table. +Additionally, the whole framework eliminates the need for manual feature +engineering. In the extensive experiment over four real-world multi-table +relational databases, GFS outperforms previous methods designed for relational +databases, demonstrating its superior performance. + +
+
+ comment: 13 pages, 5 figures, VLDB 2024 under review +
+
+
+
+
+ + ☆ Stochastic Optimal Control Matching + + +
+ Stochastic optimal control, which has the goal of driving the behavior of +noisy systems, is broadly applicable in science, engineering and artificial +intelligence. Our work introduces Stochastic Optimal Control Matching (SOCM), a +novel Iterative Diffusion Optimization (IDO) technique for stochastic optimal +control that stems from the same philosophy as the conditional score matching +loss for diffusion models. That is, the control is learned via a least squares +problem by trying to fit a matching vector field. The training loss, which is +closely connected to the cross-entropy loss, is optimized with respect to both +the control function and a family of reparameterization matrices which appear +in the matching vector field. The optimization with respect to the +reparameterization matrices aims at minimizing the variance of the matching +vector field. Experimentally, our algorithm achieves lower error than all the +existing IDO techniques for stochastic optimal control for four different +control settings. The key idea underlying SOCM is the path-wise +reparameterization trick, a novel technique that is of independent interest, +e.g., for generative modeling. + +
+
+
+
+
+ + ☆ VLTSeg: Simple Transfer of CLIP-Based Vision-Language Representations + for Domain Generalized Semantic Segmentation + + +
+ Domain generalization (DG) remains a significant challenge for perception
+based on deep neural networks (DNN), where domain shifts occur due to lighting,
+weather, or geolocation changes. In this work, we propose VLTSeg to enhance
+domain generalization in semantic segmentation, where the network is solely
+trained on the source domain and evaluated on unseen target domains. Our method
+leverages the inherent semantic robustness of vision-language models. First, by
+substituting traditional vision-only backbones with pre-trained encoders from
+CLIP and EVA-CLIP in a transfer learning setting, we find that in the field of
+DG, vision-language pre-training significantly outperforms supervised and
+self-supervised vision pre-training. We thus propose a new vision-language
+approach for domain generalized segmentation, which improves the domain
+generalization SOTA by 7.6% mIoU when training on the synthetic GTA5 dataset.
+We further show the superior generalization capabilities of vision-language
+segmentation models by reaching 76.48% mIoU on the popular Cityscapes-to-ACDC
+benchmark, outperforming the previous SOTA approach by 6.9% mIoU on the test
+set at the time of writing. Additionally, our approach shows strong in-domain
+generalization capabilities indicated by 86.1% mIoU on the Cityscapes test set,
+resulting in a shared first place with the previous SOTA on the current
+leaderboard at the time of submission.
+
+
+
+
+ + ☆ Action Inference by Maximising Evidence: Zero-Shot Imitation from + Observation with World Models NeurIPS 2023 + + +
+ Unlike most reinforcement learning agents which require an unrealistic amount +of environment interactions to learn a new behaviour, humans excel at learning +quickly by merely observing and imitating others. This ability highly depends +on the fact that humans have a model of their own embodiment that allows them +to infer the most likely actions that led to the observed behaviour. In this +paper, we propose Action Inference by Maximising Evidence (AIME) to replicate +this behaviour using world models. AIME consists of two distinct phases. In the +first phase, the agent learns a world model from its past experience to +understand its own body by maximising the ELBO. While in the second phase, the +agent is given some observation-only demonstrations of an expert performing a +novel task and tries to imitate the expert's behaviour. AIME achieves this by +defining a policy as an inference model and maximising the evidence of the +demonstration under the policy and world model. Our method is "zero-shot" in +the sense that it does not require further training for the world model or +online interactions with the environment after given the demonstration. We +empirically validate the zero-shot imitation performance of our method on the +Walker and Cheetah embodiment of the DeepMind Control Suite and find it +outperforms the state-of-the-art baselines. Code is available at: +https://github.com/argmax-ai/aime. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ☆ Optimal Data Generation in Multi-Dimensional Parameter Spaces, using + Bayesian Optimization + + +
+ Acquiring a substantial number of data points for training accurate machine +learning (ML) models is a big challenge in scientific fields where data +collection is resource-intensive. Here, we propose a novel approach for +constructing a minimal yet highly informative database for training ML models +in complex multi-dimensional parameter spaces. To achieve this, we mimic the +underlying relation between the output and input parameters using Gaussian +process regression (GPR). Using a set of known data, GPR provides predictive +means and standard deviation for the unknown data. Given the predicted standard +deviation by GPR, we select data points using Bayesian optimization to obtain +an efficient database for training ML models. We compare the performance of ML +models trained on databases obtained through this method, with databases +obtained using traditional approaches. Our results demonstrate that the ML +models trained on the database obtained using Bayesian optimization approach +consistently outperform the other two databases, achieving high accuracy with a +significantly smaller number of data points. Our work contributes to the +resource-efficient collection of data in high-dimensional complex parameter +spaces, to achieve high precision machine learning predictions. + +
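+ A compact version of the acquisition loop described above, using scikit-learn's Gaussian process regressor and querying the candidate with the largest predictive standard deviation. The function names, RBF kernel, and budget values are illustrative assumptions, and this pure uncertainty-sampling rule is only one variant of the acquisition the paper may use.

```python
import numpy as np
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF

def build_database(simulate, candidates, n_init=10, n_total=50, seed=0):
    """Grow a training database by repeatedly querying the most uncertain point.

    simulate: expensive function mapping a parameter vector to a scalar output.
    candidates: (m, d) array of admissible parameter vectors.
    """
    rng = np.random.default_rng(seed)
    idx = list(rng.choice(len(candidates), size=n_init, replace=False))
    X = candidates[idx]
    y = np.array([simulate(x) for x in X])

    gpr = GaussianProcessRegressor(kernel=RBF(), normalize_y=True)
    while len(X) < n_total:
        gpr.fit(X, y)
        _, std = gpr.predict(candidates, return_std=True)
        std[idx] = -np.inf                      # never re-query known points
        new = int(np.argmax(std))               # most uncertain candidate
        idx.append(new)
        X = np.vstack([X, candidates[new]])
        y = np.append(y, simulate(candidates[new]))
    return X, y
```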
+
+ comment: 9 pages, 3 figures +
+
+
+
+
+ + ☆ A Generative Self-Supervised Framework using Functional Connectivity in + fMRI Data NeurIPS 2023 + + +
+ Deep neural networks trained on Functional Connectivity (FC) networks +extracted from functional Magnetic Resonance Imaging (fMRI) data have gained +popularity due to the increasing availability of data and advances in model +architectures, including Graph Neural Network (GNN). Recent research on the +application of GNN to FC suggests that exploiting the time-varying properties +of the FC could significantly improve the accuracy and interpretability of the +model prediction. However, the high cost of acquiring high-quality fMRI data +and corresponding phenotypic labels poses a hurdle to their application in +real-world settings, such that a model na\"ively trained in a supervised +fashion can suffer from insufficient performance or a lack of generalization on +a small number of data. In addition, most Self-Supervised Learning (SSL) +approaches for GNNs to date adopt a contrastive strategy, which tends to lose +appropriate semantic information when the graph structure is perturbed or does +not leverage both spatial and temporal information simultaneously. In light of +these challenges, we propose a generative SSL approach that is tailored to +effectively harness spatio-temporal information within dynamic FC. Our +empirical results, experimented with large-scale (>50,000) fMRI datasets, +demonstrate that our approach learns valuable representations and enables the +construction of accurate and robust models when fine-tuned for downstream +tasks. + +
+
+ comment: NeurIPS 2023 Temporal Graph Learning Workshop +
+
+
+
+
+ + ☆ Information Modified K-Nearest Neighbor + + +
+ In this research paper, we introduce a novel classification method aimed at
+improving the performance of the K-Nearest Neighbors (KNN) algorithm. Our
+approach leverages Mutual Information (MI) to enhance the significance of
+weights and draws inspiration from Shapley values, a concept originating from
+cooperative game theory, to refine value allocation. The fundamental concept
+underlying KNN is the classification of samples based on a majority vote of
+their k-nearest neighbors. While both the distances and labels of these
+neighbors are crucial, traditional KNN assigns equal weight to all samples and
+neglects the varying importance of each neighbor based on their distances and
+labels.
+ In the proposed method, known as Information-Modified KNN (IMKNN), we address
+this issue by introducing a straightforward algorithm. To evaluate the
+effectiveness of our approach, it is compared with 7 contemporary variants of
+KNN, as well as the traditional KNN. Each of these variants exhibits its unique
+advantages and limitations. We conduct experiments on 12 widely-used datasets,
+assessing the methods' performance in terms of accuracy, precision and recall.
+ Our study demonstrates that IMKNN consistently outperforms other methods
+across different datasets and criteria, highlighting its superior performance
+in various classification tasks. These findings underscore the potential of
+IMKNN as a valuable tool for enhancing the capabilities of the KNN algorithm in
+diverse applications.
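+ As a rough illustration of the MI-weighting ingredient (not the full IMKNN algorithm, whose exact weighting and Shapley-inspired allocation are not specified in this abstract), the snippet below weights each feature by its mutual information with the label and lets closer neighbors cast larger votes.

```python
import numpy as np
from sklearn.feature_selection import mutual_info_classif

def mi_weighted_knn_predict(X_train, y_train, X_test, k=5):
    """Distance-weighted KNN with mutual-information feature weighting."""
    w = mutual_info_classif(X_train, y_train)
    w = w / (w.sum() + 1e-12)                 # normalized per-feature MI weights
    preds = []
    for x in X_test:
        d = np.sqrt((((X_train - x) ** 2) * w).sum(axis=1))
        nn = np.argsort(d)[:k]
        votes = {}
        for i in nn:
            # Closer neighbors contribute larger (inverse-distance) votes.
            votes[y_train[i]] = votes.get(y_train[i], 0.0) + 1.0 / (d[i] + 1e-12)
        preds.append(max(votes, key=votes.get))
    return np.array(preds)
```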
+
+
+
+
+ + ☆ Learning-Based Approaches to Predictive Monitoring with Conformal + Statistical Guarantees + + +
+ This tutorial focuses on efficient methods for predictive monitoring (PM),
+the problem of detecting at runtime future violations of a given requirement
+from the current state of a system. While performing model checking at runtime
+would offer a precise solution to the PM problem, it is generally
+computationally expensive. To address this scalability issue, several
+lightweight approaches based on machine learning have recently been proposed.
+These approaches work by learning an approximate yet efficient surrogate (deep
+learning) model of the expensive model checker. A key challenge remains to
+ensure reliable predictions, especially in safety-critical applications. We
+review our recent work on predictive monitoring, one of the first to propose
+learning-based approximations for CPS verification of temporal logic
+specifications and the first in this context to apply conformal prediction
+(CP) for rigorous uncertainty quantification. These CP-based uncertainty
+estimators offer statistical guarantees regarding the generalization error of
+the learning model, and they can be used to determine unreliable predictions
+that should be rejected. In this tutorial, we present a general and
+comprehensive framework summarizing our approach to the predictive monitoring
+of CPSs, examining in detail several variants determined by three main
+dimensions: system dynamics (deterministic, non-deterministic, stochastic),
+state observability, and semantics of requirements' satisfaction (Boolean or
+quantitative).
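+ The split-conformal construction behind such guarantees is short enough to sketch. Below is a generic version for a probabilistic classifier acting as the surrogate monitor, where a non-singleton prediction set flags an unreliable prediction to be rejected; the variable names and the particular nonconformity score are illustrative choices, not the tutorial's exact setup.

```python
import numpy as np

def conformal_threshold(cal_scores, alpha=0.1):
    """Split conformal prediction: return the ceil((n+1)(1-alpha))-th smallest
    calibration nonconformity score (e.g. 1 - p_model(true label)).
    Prediction sets built from it cover the truth with probability >= 1 - alpha."""
    n = len(cal_scores)
    k = int(np.ceil((n + 1) * (1 - alpha))) - 1     # 0-based rank of the quantile
    return np.sort(cal_scores)[min(k, n - 1)]

def prediction_set(class_probs, threshold):
    """Labels whose nonconformity 1 - p stays below the threshold; a set with
    more than one label (or an empty one) marks the prediction as unreliable."""
    return [c for c, p in enumerate(class_probs) if 1.0 - p <= threshold]
```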
+
+
+
+
+ + ☆ Distilled Self-Critique of LLMs with Synthetic Data: a Bayesian + Perspective ICLR 2024 + + +
+ This paper proposes an interpretation of RLAIF as Bayesian inference by +introducing distilled Self-Critique (dSC), which refines the outputs of a LLM +through a Gibbs sampler that is later distilled into a fine-tuned model. Only +requiring synthetic data, dSC is exercised in experiments regarding safety, +sentiment, and privacy control, showing it can be a viable and cheap +alternative to align LLMs. Code released at +\url{https://github.com/vicgalle/distilled-self-critique}. + +
+
+ comment: Submitted to ICLR 2024 (TinyPapers track) +
+
+
+
+
+ + ☆ Maximising Quantum-Computing Expressive Power through Randomised + Circuits + + +
+ In the noisy intermediate-scale quantum era, variational quantum algorithms +(VQAs) have emerged as a promising avenue to obtain quantum advantage. However, +the success of VQAs depends on the expressive power of parameterised quantum +circuits, which is constrained by the limited gate number and the presence of +barren plateaus. In this work, we propose and numerically demonstrate a novel +approach for VQAs, utilizing randomised quantum circuits to generate the +variational wavefunction. We parameterize the distribution function of these +random circuits using artificial neural networks and optimize it to find the +solution. This random-circuit approach presents a trade-off between the +expressive power of the variational wavefunction and time cost, in terms of the +sampling cost of quantum circuits. Given a fixed gate number, we can +systematically increase the expressive power by extending the quantum-computing +time. With a sufficiently large permissible time cost, the variational +wavefunction can approximate any quantum state with arbitrary accuracy. +Furthermore, we establish explicit relationships between expressive power, time +cost, and gate number for variational quantum eigensolvers. These results +highlight the promising potential of the random-circuit approach in achieving a +high expressive power in quantum computing. + +
+
+ comment: 19 pages, 10 figures +
+
+
+
+
+ + ☆ Intrusion Detection System with Machine Learning and Multiple Datasets + + +
+ As Artificial Intelligence (AI) technologies continue to gain traction in the +modern-day world, they ultimately pose an immediate threat to current +cybersecurity systems via exploitative methods. Prompt engineering is a +relatively new field that explores various prompt designs that can hijack large +language models (LLMs). If used by an unethical attacker, it can enable an AI +system to offer malicious insights and code to them. In this paper, an enhanced +intrusion detection system (IDS) that utilizes machine learning (ML) and +hyperparameter tuning is explored, which can improve a model's performance in +terms of accuracy and efficacy. Ultimately, this improved system can be used to +combat the attacks made by unethical hackers. A standard IDS is solely +configured with pre-configured rules and patterns; however, with the +utilization of machine learning, implicit and different patterns can be +generated through the models' hyperparameter settings and parameters. In +addition, the IDS will be equipped with multiple datasets so that the accuracy +of the models improves. We evaluate the performance of multiple ML models and +their respective hyperparameter settings through various metrics to compare +their results to other models and past research work. The results of the +proposed multi-dataset integration method yielded an accuracy score of 99.9% +when equipped with the XGBoost and random forest classifiers and +RandomizedSearchCV hyperparameter technique. + +
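+ The hyperparameter-tuning step described above follows a standard scikit-learn pattern; the sketch below uses a random forest with RandomizedSearchCV and made-up search ranges, and assumes a feature matrix X and labels y already assembled from the merged intrusion-detection datasets.

```python
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

def tune_random_forest(X, y):
    """Randomized hyperparameter search in the spirit of the pipeline above."""
    search = RandomizedSearchCV(
        RandomForestClassifier(random_state=0),
        param_distributions={
            "n_estimators": randint(100, 500),
            "max_depth": randint(4, 32),
            "min_samples_leaf": randint(1, 10),
        },
        n_iter=25, cv=5, scoring="accuracy", n_jobs=-1, random_state=0,
    )
    search.fit(X, y)
    return search.best_estimator_, search.best_params_
```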
+
+ comment: 12 pages, 2 figures, 2 tables +
+
+
+
+
+ + ☆ Foundations for Transfer in Reinforcement Learning: A Taxonomy of + Knowledge Modalities + + +
+ Contemporary artificial intelligence systems exhibit rapidly growing +abilities accompanied by the growth of required resources, expansive datasets +and corresponding investments into computing infrastructure. Although earlier +successes predominantly focus on constrained settings, recent strides in +fundamental research and applications aspire to create increasingly general +systems. This evolving landscape presents a dual panorama of opportunities and +challenges in refining the generalisation and transfer of knowledge - the +extraction from existing sources and adaptation as a comprehensive foundation +for tackling new problems. Within the domain of reinforcement learning (RL), +the representation of knowledge manifests through various modalities, including +dynamics and reward models, value functions, policies, and the original data. +This taxonomy systematically targets these modalities and frames its discussion +based on their inherent properties and alignment with different objectives and +mechanisms for transfer. Where possible, we aim to provide coarse guidance +delineating approaches which address requirements such as limiting environment +interactions, maximising computational efficiency, and enhancing generalisation +across varying axes of change. Finally, we analyse reasons contributing to the +prevalence or scarcity of specific forms of transfer, the inherent potential +behind pushing these frontiers, and underscore the significance of +transitioning from designed to learned transfer. + +
+
+
+
+
+ + ☆ Unsupervised Anomaly Detection using Aggregated Normative Diffusion + + +
+ Early detection of anomalies in medical images such as brain MRI is highly +relevant for diagnosis and treatment of many conditions. Supervised machine +learning methods are limited to a small number of pathologies where there is +good availability of labeled data. In contrast, unsupervised anomaly detection +(UAD) has the potential to identify a broader spectrum of anomalies by spotting +deviations from normal patterns. Our research demonstrates that existing +state-of-the-art UAD approaches do not generalise well to diverse types of +anomalies in realistic multi-modal MR data. To overcome this, we introduce a +new UAD method named Aggregated Normative Diffusion (ANDi). ANDi operates by +aggregating differences between predicted denoising steps and ground truth +backwards transitions in Denoising Diffusion Probabilistic Models (DDPMs) that +have been trained on pyramidal Gaussian noise. We validate ANDi against three +recent UAD baselines, and across three diverse brain MRI datasets. We show that +ANDi, in some cases, substantially surpasses these baselines and shows +increased robustness to varying types of anomalies. Particularly in detecting +multiple sclerosis (MS) lesions, ANDi achieves improvements of up to 178% in +terms of AUPRC. + +
+
+
+
+
+ + ☆ Unlocking optimal batch size schedules using continuous-time control and + perturbation theory + + +
+ Stochastic Gradient Descent (SGD) and its variants are almost universally +used to train neural networks and to fit a variety of other parametric models. +An important hyperparameter in this context is the batch size, which determines +how many samples are processed before an update of the parameters occurs. +Previous studies have demonstrated the benefits of using variable batch sizes. +In this work, we will theoretically derive optimal batch size schedules for SGD +and similar algorithms, up to an error that is quadratic in the learning rate. +To achieve this, we approximate the discrete process of parameter updates using +a family of stochastic differential equations indexed by the learning rate. To +better handle the state-dependent diffusion coefficient, we further expand the +solution of this family into a series with respect to the learning rate. Using +this setup, we derive a continuous-time optimal batch size schedule for a large +family of diffusion coefficients and then apply the results in the setting of +linear regression. + +
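+ The continuous-time object behind such analyses is typically the SDE approximation of SGD; the formulation below (our notation, stated as an assumption rather than the paper's exact setup) makes explicit why the batch size enters only through the diffusion term.

```latex
% SDE approximation of SGD with learning rate \eta and batch-size schedule B(t):
\[
  \mathrm{d}\theta_t \;=\; -\nabla L(\theta_t)\,\mathrm{d}t
  \;+\; \sqrt{\frac{\eta}{B(t)}\,\Sigma(\theta_t)}\;\mathrm{d}W_t ,
\]
% where \Sigma(\theta) is the covariance of the per-sample gradients and W_t a
% standard Wiener process. Increasing B(t) damps the diffusion term, and an
% optimal schedule balances this noise reduction against per-step compute cost.
```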
+
+
+
+
+ + ☆ Non-Intrusive Load Monitoring for Feeder-Level EV Charging Detection: + Sliding Window-based Approaches to Offline and Online Detection + + +
+ Understanding electric vehicle (EV) charging on the distribution network is +key to effective EV charging management and aiding decarbonization across the +energy and transport sectors. Advanced metering infrastructure has allowed +distribution system operators and utility companies to collect high-resolution +load data from their networks. These advancements enable the non-intrusive load +monitoring (NILM) technique to detect EV charging using load measurement data. +While existing studies primarily focused on NILM for EV charging detection in +individual households, there is a research gap on EV charging detection at the +feeder level, presenting unique challenges due to the combined load measurement +from multiple households. In this paper, we develop a novel and effective +approach for EV detection at the feeder level, involving sliding-window feature +extraction and classical machine learning techniques, specifically models like +XGBoost and Random Forest. Our developed method offers a lightweight and +efficient solution, capable of quick training. Moreover, our developed method +is versatile, supporting both offline and online EV charging detection. Our +experimental results demonstrate high-accuracy EV charging detection at the +feeder level, achieving an F-Score of 98.88% in offline detection and 93.01% in +online detection. + +
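+ The sliding-window feature extraction step lends itself to a short sketch; the window length, step size, and summary statistics below are illustrative placeholders, with the resulting rows fed to an XGBoost or Random Forest classifier as described.

```python
import numpy as np

def sliding_window_features(load, window=60, step=10):
    """Turn a feeder-level load series into per-window feature vectors
    (mean, standard deviation, largest jump), one row per window, which a
    classifier can then label as 'EV charging' / 'no EV charging'."""
    feats = []
    for start in range(0, len(load) - window + 1, step):
        w = load[start:start + window]
        feats.append([w.mean(), w.std(), np.abs(np.diff(w)).max()])
    return np.array(feats)
```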
+
+ comment: The 7th IEEE Conference on Energy Internet and Energy System + Integration (EI2 2023) +
+
+
+
+
+ + ☆ Correlation and Unintended Biases on Univariate and Multivariate + Decision Trees + + +
+ Decision Trees are accessible, interpretable, and well-performing +classification models. A plethora of variants with increasing expressiveness +has been proposed in the last forty years. We contrast the two families of +univariate DTs, whose split functions partition data through axis-parallel +hyperplanes, and multivariate DTs, whose splits instead partition data through +oblique hyperplanes. The latter include the former, hence multivariate DTs are +in principle more powerful. Surprisingly enough, however, univariate DTs +consistently show comparable performances in the literature. We analyze the +reasons behind this, both with synthetic and real-world benchmark datasets. Our +research questions test whether the pre-processing phase of removing +correlation among features in datasets has an impact on the relative +performances of univariate vs multivariate DTs. We find that existing benchmark +datasets are likely biased towards favoring univariate DTs. + +
+
+
+
+
+ + ☆ HGPROMPT: Bridging Homogeneous and Heterogeneous Graphs for Few-shot + Prompt Learning + + +
+ Graph neural networks (GNNs) and heterogeneous graph neural networks (HGNNs)
+are prominent techniques for homogeneous and heterogeneous graph representation
+learning, yet their performance in an end-to-end supervised framework greatly
+depends on the availability of task-specific supervision. To reduce the
+labeling cost, pre-training on self-supervised pretext tasks has become a
+popular paradigm, but there is often a gap between the pre-trained model and
+downstream tasks, stemming from the divergence in their objectives. To bridge
+the gap, prompt learning has risen as a promising direction especially in
+few-shot settings, without the need to fully fine-tune the pre-trained model.
+While there has been some early exploration of prompt-based learning on graphs,
+these studies primarily deal with homogeneous graphs, ignoring the
+heterogeneous graphs that are prevalent in downstream applications. In this
+paper, we propose HGPROMPT, a novel pre-training and prompting framework to
+unify not only pre-training and downstream tasks but also homogeneous and
+heterogeneous graphs via a dual-template design. Moreover, we propose
+dual-prompt in HGPROMPT to assist a downstream task in locating the most
+relevant prior to bridge the gaps caused by not only feature variations but
+also heterogeneity differences across tasks. Finally, we thoroughly evaluate
+and analyze HGPROMPT through extensive experiments on three public datasets.
+
+ comment: Under review +
+
+
+
+
+ + ☆ Robot Synesthesia: In-Hand Manipulation with Visuotactile Sensing + + +
+ Executing contact-rich manipulation tasks necessitates the fusion of tactile +and visual feedback. However, the distinct nature of these modalities poses +significant challenges. In this paper, we introduce a system that leverages +visual and tactile sensory inputs to enable dexterous in-hand manipulation. +Specifically, we propose Robot Synesthesia, a novel point cloud-based tactile +representation inspired by human tactile-visual synesthesia. This approach +allows for the simultaneous and seamless integration of both sensory inputs, +offering richer spatial information and facilitating better reasoning about +robot actions. The method, trained in a simulated environment and then deployed +to a real robot, is applicable to various in-hand object rotation tasks. +Comprehensive ablations are performed on how the integration of vision and +touch can improve reinforcement learning and Sim2Real performance. Our project +page is available at https://yingyuan0414.github.io/visuotactile/ . + +
+
+ comment: Project page: https://yingyuan0414.github.io/visuotactile/ +
+
+
+
+
+ + ☆ Generalization by Adaptation: Diffusion-Based Domain Extension for + Domain-Generalized Semantic Segmentation WACV 2024 + + +
+ When models, e.g., for semantic segmentation, are applied to images that are
+vastly different from training data, the performance will drop significantly.
+Domain adaptation methods try to overcome this issue, but need samples from the
+target domain. However, this might not always be feasible for various reasons
+and therefore domain generalization methods are useful as they do not require
+any target data. We present a new diffusion-based domain extension (DIDEX)
+method and employ a diffusion model to generate a pseudo-target domain with
+diverse text prompts. In contrast to existing methods, this allows controlling
+the style and content of the generated images and introducing a high
+diversity. In a second step, we train a generalizing model by adapting towards
+this pseudo-target domain. We outperform previous approaches by a large margin
+across various datasets and architectures without using any real data. For the
+generalization from GTA5, we improve state-of-the-art mIoU performance by 3.8%
+absolute on average and for SYNTHIA by 11.8% absolute, marking a big step for
+the generalization performance on these benchmarks. Code is available at
+https://github.com/JNiemeijer/DIDEX
+
+ comment: Accepted to WACV 2024 +
+
+
+
+
+ + ☆ Learning Machine Morality through Experience and Interaction + + +
+ Increasing interest in ensuring safety of next-generation Artificial +Intelligence (AI) systems calls for novel approaches to embedding morality into +autonomous agents. Traditionally, this has been done by imposing explicit +top-down rules or hard constraints on systems, for example by filtering system +outputs through pre-defined ethical rules. Recently, instead, entirely +bottom-up methods for learning implicit preferences from human behavior have +become increasingly popular, such as those for training and fine-tuning Large +Language Models. In this paper, we provide a systematization of existing +approaches to the problem of introducing morality in machines - modeled as a +continuum, and argue that the majority of popular techniques lie at the +extremes - either being fully hard-coded, or entirely learned, where no +explicit statement of any moral principle is required. Given the relative +strengths and weaknesses of each type of methodology, we argue that more hybrid +solutions are needed to create adaptable and robust, yet more controllable and +interpretable agents. + In particular, we present three case studies of recent works which use +learning from experience (i.e., Reinforcement Learning) to explicitly provide +moral principles to learning agents - either as intrinsic rewards, moral +logical constraints or textual principles for language models. For example, +using intrinsic rewards in Social Dilemma games, we demonstrate how it is +possible to represent classical moral frameworks for agents. We also present an +overview of the existing work in this area in order to provide empirical +evidence for the potential of this hybrid approach. We then discuss strategies +for evaluating the effectiveness of moral learning agents. Finally, we present +open research questions and implications for the future of AI safety and ethics +which are emerging from this framework. + +
+
+
+
+
+ + ☆ Class Symbolic Regression: Gotta Fit 'Em All + + +
+ We introduce "Class Symbolic Regression" a first framework for automatically +finding a single analytical functional form that accurately fits multiple +datasets - each governed by its own (possibly) unique set of fitting +parameters. This hierarchical framework leverages the common constraint that +all the members of a single class of physical phenomena follow a common +governing law. Our approach extends the capabilities of our earlier Physical +Symbolic Optimization ($\Phi$-SO) framework for Symbolic Regression, which +integrates dimensional analysis constraints and deep reinforcement learning for +symbolic analytical function discovery from data. We demonstrate the efficacy +of this novel approach by applying it to a panel of synthetic toy case datasets +and showcase its practical utility for astrophysics by successfully extracting +an analytic galaxy potential from a set of simulated orbits approximating +stellar streams. + +
+
+ comment: 7 pages, 1 figure, 2 tables. Submitted to ApJL +
+
+
+
+
+ + ☆ Energy-based Potential Games for Joint Motion Forecasting and Control + + +
+ This work uses game theory as a mathematical framework to address interaction +modeling in multi-agent motion forecasting and control. Despite its +interpretability, applying game theory to real-world robotics, like automated +driving, faces challenges such as unknown game parameters. To tackle these, we +establish a connection between differential games, optimal control, and +energy-based models, demonstrating how existing approaches can be unified under +our proposed Energy-based Potential Game formulation. Building upon this, we +introduce a new end-to-end learning application that combines neural networks +for game-parameter inference with a differentiable game-theoretic optimization +layer, acting as an inductive bias. The analysis provides empirical evidence +that the game-theoretic layer adds interpretability and improves the predictive +performance of various neural network backbones using two simulations and two +real-world driving datasets. + +
+
+ comment: Conference on Robot Learning, CoRL 2023 +
+
+
+
+
+ + ☆ Distributed Continual Learning with CoCoA in High-dimensional Linear + Regression + + +
+ We consider estimation under scenarios where the signals of interest exhibit +change of characteristics over time. In particular, we consider the continual +learning problem where different tasks, e.g., data with different +distributions, arrive sequentially and the aim is to perform well on the newly +arrived task without performance degradation on the previously seen tasks. In +contrast to the continual learning literature focusing on the centralized +setting, we investigate the problem from a distributed estimation perspective. +We consider the well-established distributed learning algorithm COCOA, which +distributes the model parameters and the corresponding features over the +network. We provide exact analytical characterization for the generalization +error of COCOA under continual learning for linear regression in a range of +scenarios, where overparameterization is of particular interest. These +analytical results characterize how the generalization error depends on the +network structure, the task similarity and the number of tasks, and show how +these dependencies are intertwined. In particular, our results show that the +generalization error can be significantly reduced by adjusting the network +size, where the most favorable network size depends on task similarity and the +number of tasks. We present numerical results verifying the theoretical +analysis and illustrate the continual learning performance of COCOA with a +digit classification task. + +
+
+
+
+
+ + ☆ Wild-Tab: A Benchmark For Out-Of-Distribution Generalization In Tabular + Regression + + +
+ Out-of-Distribution (OOD) generalization, a cornerstone for building robust +machine learning models capable of handling data diverging from the training +set's distribution, is an ongoing challenge in deep learning. While significant +progress has been observed in computer vision and natural language processing, +its exploration in tabular data, ubiquitous in many industrial applications, +remains nascent. To bridge this gap, we present Wild-Tab, a large-scale +benchmark tailored for OOD generalization in tabular regression tasks. The +benchmark incorporates 3 industrial datasets sourced from fields like weather +prediction and power consumption estimation, providing a challenging testbed +for evaluating OOD performance under real-world conditions. Our extensive +experiments, evaluating 10 distinct OOD generalization methods on Wild-Tab, +reveal nuanced insights. We observe that many of these methods often struggle +to maintain high-performance levels on unseen data, with OOD performance +showing a marked drop compared to in-distribution performance. At the same +time, Empirical Risk Minimization (ERM), despite its simplicity, delivers +robust performance across all evaluations, rivaling the results of +state-of-the-art methods. Looking forward, we hope that the release of Wild-Tab +will facilitate further research on OOD generalization and aid in the +deployment of machine learning models in various real-world contexts where +handling distribution shifts is a crucial requirement. + +
+
+
+
+
+ + ☆ Long-Tail Learning with Rebalanced Contrastive Loss + + +
+ Integrating supervised contrastive loss to cross entropy-based communication +has recently been proposed as a solution to address the long-tail learning +problem. However, when the class imbalance ratio is high, it requires adjusting +the supervised contrastive loss to support the tail classes, as the +conventional contrastive learning is biased towards head classes by default. To +this end, we present Rebalanced Contrastive Learning (RCL), an efficient means +to increase the long tail classification accuracy by addressing three main +aspects: 1. Feature space balancedness - Equal division of the feature space +among all the classes, 2. Intra-Class compactness - Reducing the distance +between same-class embeddings, 3. Regularization - Enforcing larger margins for +tail classes to reduce overfitting. RCL adopts class frequency-based SoftMax +loss balancing to supervised contrastive learning loss and exploits scalar +multiplied features fed to the contrastive learning loss to enforce +compactness. We implement RCL on the Balanced Contrastive Learning (BCL) +Framework, which has the SOTA performance. Our experiments on three benchmark +datasets demonstrate the richness of the learnt embeddings and increased top-1 +balanced accuracy RCL provides to the BCL framework. We further demonstrate +that the performance of RCL as a standalone loss also achieves state-of-the-art +level accuracy. + +
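+ One concrete form of the class-frequency-based balancing mentioned here is the balanced-softmax correction, sketched below as a stand-alone illustration; the full RCL loss additionally acts on the supervised contrastive term and on feature scaling, which this snippet does not reproduce.

```python
import torch
import torch.nn.functional as F

def balanced_softmax_loss(logits, targets, class_counts):
    """Class-frequency-balanced softmax: add log-priors to the logits so that
    head classes no longer dominate the normalization term.

    logits: (n, C); targets: (n,) long; class_counts: (C,) training frequencies.
    """
    log_prior = torch.log(class_counts.float() / class_counts.sum())
    return F.cross_entropy(logits + log_prior, targets)
```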
+
+
+
+
+ + ☆ Divide-and-Conquer Strategy for Large-Scale Dynamic Bayesian Network + Structure Learning + + +
+ Dynamic Bayesian Networks (DBNs), renowned for their interpretability, have
+become increasingly vital in representing complex stochastic processes in
+various domains such as gene expression analysis, healthcare, and traffic
+prediction. Structure learning of DBNs from data is challenging, particularly
+for datasets with thousands of variables. Most current algorithms for DBN
+structure learning are adaptations from those used in static Bayesian Networks
+(BNs), and are typically focused on small-scale problems. In order to solve
+large-scale problems while taking full advantage of existing algorithms, this
+paper introduces a novel divide-and-conquer strategy, originally developed for
+static BNs, and adapts it for large-scale DBN structure learning. In this work,
+we specifically concentrate on 2 Time-sliced Bayesian Networks (2-TBNs), a
+special class of DBNs. Furthermore, we leverage the prior knowledge of 2-TBNs
+to enhance the performance of the strategy we introduce. Our approach
+significantly improves the scalability and accuracy of 2-TBN structure
+learning. Experimental results demonstrate the effectiveness of our method,
+showing substantial improvements over existing algorithms in both computational
+efficiency and structure learning accuracy. On problem instances with more than
+1,000 variables, our approach improves two accuracy metrics by 74.45% and
+110.94% on average, respectively, while reducing runtime by 93.65% on average.
+
+
+
+
+ + ☆ EdgeConvFormer: Dynamic Graph CNN and Transformer based Anomaly + Detection in Multivariate Time Series + + +
+ Transformer-based models for anomaly detection in multivariate time series +can benefit from the self-attention mechanism due to its advantage in modeling +long-term dependencies. However, Transformer-based anomaly detection models +have problems such as a large amount of data being required for training, +standard positional encoding is not suitable for multivariate time series data, +and the interdependence between time series is not considered. To address these +limitations, we propose a novel anomaly detection method, named EdgeConvFormer, +which integrates Time2vec embedding, stacked dynamic graph CNN, and Transformer +to extract global and local spatial-time information. This design of +EdgeConvFormer empowers it with decomposition capacities for complex time +series, progressive spatiotemporal correlation discovery between time series, +and representation aggregation of multi-scale features. Experiments demonstrate +that EdgeConvFormer can learn the spatial-temporal correlations from +multivariate time series data and achieve better anomaly detection performance +than the state-of-the-art approaches on many real-world datasets of different +scales. + +
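+ Of the three components named above, the Time2vec embedding is simple enough to show directly; a minimal PyTorch module is given below (the dimension k and initialization are arbitrary choices).

```python
import torch
import torch.nn as nn

class Time2Vec(nn.Module):
    """Time2vec embedding: one linear component plus k-1 periodic components,
    giving the model a learnable notion of (possibly irregular) time."""

    def __init__(self, k=16):
        super().__init__()
        self.w = nn.Parameter(torch.randn(k))
        self.b = nn.Parameter(torch.zeros(k))

    def forward(self, t):
        # t: (..., 1) timestamps; output: (..., k)
        v = self.w * t + self.b
        return torch.cat([v[..., :1], torch.sin(v[..., 1:])], dim=-1)
```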
+
+
+
+
+ + ☆ ImputeFormer: Graph Transformers for Generalizable Spatiotemporal + Imputation + + +
+ This paper focuses on the multivariate time series imputation problem using +deep neural architectures. The ubiquitous issue of missing data in both +scientific and engineering tasks necessitates the development of an effective +and general imputation model. Leveraging the wisdom and expertise garnered from +low-rank imputation methods, we power the canonical Transformers with three key +knowledge-driven enhancements, including projected temporal attention, global +adaptive graph convolution, and Fourier imputation loss. These task-agnostic +inductive biases exploit the inherent structures of incomplete time series, and +thus make our model versatile for a variety of imputation problems. We +demonstrate its superiority in terms of accuracy, efficiency, and flexibility +on heterogeneous datasets, including traffic speed, traffic volume, solar +energy, smart metering, and air quality. Comprehensive case studies are +performed to further strengthen the interpretability. Promising empirical +results provide strong conviction that incorporating time series primitives, +such as low-rank properties, can substantially facilitate the development of a +generalizable model to approach a wide range of spatiotemporal imputation +problems. + +
+
+
+
+
+ + ☆ The Self-Loop Paradox: Investigating the Impact of Self-Loops on Graph + Neural Networks + + +
+ Many Graph Neural Networks (GNNs) add self-loops to a graph to include +feature information about a node itself at each layer. However, if the GNN +consists of more than one layer, this information can return to its origin via +cycles in the graph topology. Intuition suggests that this "backflow" of +information should be larger in graphs with self-loops compared to graphs +without. In this work, we counter this intuition and show that for certain GNN +architectures, the information a node gains from itself can be smaller in +graphs with self-loops compared to the same graphs without. We adopt an +analytical approach for the study of statistical graph ensembles with a given +degree sequence and show that this phenomenon, which we call the self-loop +paradox, can depend both on the number of GNN layers $k$ and whether $k$ is +even or odd. We experimentally validate our theoretical findings in a synthetic +node classification task and investigate its practical relevance in 23 +real-world graphs. + +
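+ A simple numerical probe of the quantity discussed here (how much of a node's own signal returns to it after k rounds of degree-normalized propagation, with and without self-loops) can be written as follows; this is our own illustrative check, not the paper's analytical ensemble treatment, and it assumes every node has at least one edge.

```python
import numpy as np

def self_contribution(adj, k, add_self_loops):
    """Diagonal of the k-step propagation matrix: the share of a node's own
    signal that returns to it after k rounds of row-normalized message passing."""
    A = adj + np.eye(len(adj)) if add_self_loops else adj.astype(float).copy()
    P = np.diag(1.0 / A.sum(axis=1)) @ A        # row-normalized propagation matrix
    return np.diag(np.linalg.matrix_power(P, k))

# Comparing self_contribution(adj, k, True) against self_contribution(adj, k, False)
# for even and odd k on a given graph is one way to probe the self-loop paradox.
```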
+
+ comment: Presented at the Second Learning on Graphs Conference (LoG 2023) as + extended abstract +
+
+
+
+
+ + ☆ Rethinking Urban Mobility Prediction: A Super-Multivariate Time Series + Forecasting Approach + + +
+ Long-term urban mobility predictions play a crucial role in the effective +management of urban facilities and services. Conventionally, urban mobility +data has been structured as spatiotemporal videos, treating longitude and +latitude grids as fundamental pixels. Consequently, video prediction methods, +relying on Convolutional Neural Networks (CNNs) and Vision Transformers (ViTs), +have been instrumental in this domain. In our research, we introduce a fresh +perspective on urban mobility prediction. Instead of oversimplifying urban +mobility data as traditional video data, we regard it as a complex multivariate +time series. This perspective involves treating the time-varying values of each +grid in each channel as individual time series, necessitating a thorough +examination of temporal dynamics, cross-variable correlations, and +frequency-domain insights for precise and reliable predictions. To address this +challenge, we present the Super-Multivariate Urban Mobility Transformer +(SUMformer), which utilizes a specially designed attention mechanism to +calculate temporal and cross-variable correlations and reduce computational +costs stemming from a large number of time series. SUMformer also employs +low-frequency filters to extract essential information for long-term +predictions. Furthermore, SUMformer is structured with a temporal patch merge +mechanism, forming a hierarchical framework that enables the capture of +multi-scale correlations. Consequently, it excels in urban mobility pattern +modeling and long-term prediction, outperforming current state-of-the-art +methods across three real-world datasets. + +
+
+ comment: 14 pages,9 figures +
+
+
+
+
+ + ☆ Risk-Controlling Model Selection via Guided Bayesian Optimization + + +
+ Adjustable hyperparameters of machine learning models typically impact +various key trade-offs such as accuracy, fairness, robustness, or inference +cost. Our goal in this paper is to find a configuration that adheres to +user-specified limits on certain risks while being useful with respect to other +conflicting metrics. We solve this by combining Bayesian Optimization (BO) with +rigorous risk-controlling procedures, where our core idea is to steer BO +towards an efficient testing strategy. Our BO method identifies a set of Pareto +optimal configurations residing in a designated region of interest. The +resulting candidates are statistically verified and the best-performing +configuration is selected with guaranteed risk levels. We demonstrate the +effectiveness of our approach on a range of tasks with multiple desiderata, +including low error rates, equitable predictions, handling spurious +correlations, managing rate and distortion in generative models, and reducing +computational costs. + +
+
+
+
+
+ + ☆ Estimating Coronal Mass Ejection Mass and Kinetic Energy by Fusion of + Multiple Deep-learning Models + + +
+ Coronal mass ejections (CMEs) are massive solar eruptions, which have a +significant impact on Earth. In this paper, we propose a new method, called +DeepCME, to estimate two properties of CMEs, namely, CME mass and kinetic +energy. Being able to estimate these properties helps better understand CME +dynamics. Our study is based on the CME catalog maintained at the Coordinated +Data Analysis Workshops (CDAW) Data Center, which contains all CMEs manually +identified since 1996 using the Large Angle and Spectrometric Coronagraph +(LASCO) on board the Solar and Heliospheric Observatory (SOHO). We use LASCO C2 +data in the period between January 1996 and December 2020 to train, validate +and test DeepCME through 10-fold cross validation. The DeepCME method is a +fusion of three deep learning models, including ResNet, InceptionNet, and +InceptionResNet. Our fusion model extracts features from LASCO C2 images, +effectively combining the learning capabilities of the three component models +to jointly estimate the mass and kinetic energy of CMEs. Experimental results +show that the fusion model yields a mean relative error (MRE) of 0.013 (0.009, +respectively) compared to the MRE of 0.019 (0.017, respectively) of the best +component model InceptionResNet (InceptionNet, respectively) in estimating the +CME mass (kinetic energy, respectively). To our knowledge, this is the first +time that deep learning has been used for CME mass and kinetic energy +estimations. + +
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ☆ Optimizing Bus Travel: A Novel Approach to Feature Mining with P-KMEANS + and P-LDA Algorithms + + +
+ Customizing services for bus travel can bolster its attractiveness, optimize
+usage, alleviate traffic congestion, and diminish carbon emissions. This
+potential is realized by harnessing recent advancements in positioning
+communication facilities, the Internet of Things, and artificial intelligence
+for feature mining in public transportation. However, the inherent complexities
+of disorganized and unstructured public transportation data introduce
+substantial challenges to travel feature extraction. This study presents a bus
+travel feature extraction method rooted in Point of Interest (POI) data,
+employing enhanced P-KMEANS and P-LDA algorithms to overcome these limitations.
+While the KMEANS algorithm adeptly segments passenger travel paths into
+distinct clusters, its outcomes can be influenced by the initial K value. On
+the other hand, Latent Dirichlet Allocation (LDA) excels at feature
+identification and probabilistic interpretations yet encounters difficulties
+with feature intermingling and nuanced sub-feature interactions. Incorporating
+the POI dimension enhances our understanding of travel behavior, aligning it
+more closely with passenger attributes and facilitating easier data analysis.
+By incorporating POI data, our refined P-KMEANS and P-LDA algorithms grant a
+holistic insight into travel behaviors and attributes, effectively mitigating
+the limitations above. Consequently, this POI-centric algorithm effectively
+amalgamates diverse POI attributes, delineates varied travel contexts, and
+imparts probabilistic metrics to feature properties. Our method successfully
+mines the diverse aspects of bus travel, such as age, occupation, gender,
+sports, cost, safety, and personality traits. It effectively calculates
+relationships between individual travel behaviors and assigns explanatory and
+evaluative probabilities to POI labels, thereby enhancing bus travel
+optimization.
+
+
+
+
+ + ☆ Adversarial Medical Image with Hierarchical Feature Hiding + + +
+ Deep learning based methods for medical images can be easily compromised by
+adversarial examples (AEs), posing a great security flaw in clinical
+decision-making. It has been discovered that conventional adversarial attacks
+like PGD, which optimize the classification logits, are easy to distinguish in
+the feature space, resulting in accurate reactive defenses. To better
+understand this phenomenon and reassess the reliability of the reactive
+defenses for medical AEs, we thoroughly investigate the characteristics of
+conventional medical AEs. Specifically, we first theoretically prove that
+conventional adversarial attacks change the outputs by continuously optimizing
+vulnerable features in a fixed direction, thereby leading to outlier
+representations in the feature space. Then, a stress test is conducted to
+reveal the vulnerability of medical images, by comparing with natural images.
+Interestingly, this vulnerability is a double-edged sword, which can be
+exploited to hide AEs. We then propose a simple-yet-effective hierarchical
+feature constraint (HFC), a novel add-on to conventional white-box attacks,
+which helps to hide the adversarial features in the target feature
+distribution. The proposed method is evaluated on three medical datasets, both
+2D and 3D, with different modalities. The experimental results demonstrate the
+superiority of HFC, \emph{i.e.,} it bypasses an array of state-of-the-art
+adversarial medical AE detectors more efficiently than competing adaptive
+attacks, which reveals the deficiencies of medical reactive defenses and allows
+more robust defenses to be developed in the future.
+
+ comment: Our code is available at + \url{https://github.com/qsyao/Hierarchical_Feature_Constraint} +
+
+
+
+
+ + ☆ Jellyfish: A Large Language Model for Data Preprocessing + + +
+ In this paper, we present Jellyfish, an open-source LLM that serves as a
+universal task solver for data preprocessing (DP). Built on the Llama 2 13B
+model, Jellyfish is instruction-tuned on the datasets of several typical DP
+tasks, including error detection, data imputation, schema matching, and entity
+matching, and delivers generalizability to other tasks. Remarkably, Jellyfish
+can operate on a single, local, low-cost GPU with its 13 billion parameters,
+ensuring data security and enabling further tuning. Its proficiency in
+understanding natural language allows users to manually craft instructions for
+DP tasks. Unlike many existing methods that heavily rely on prior knowledge,
+Jellyfish acquires domain knowledge during its tuning process and integrates
+optional knowledge injection during inference. A distinctive feature of
+Jellyfish is its interpreter, which elucidates its output decisions. To
+construct Jellyfish, we develop a series of pre-tuning and DP-tuning
+techniques. Jellyfish is equipped with an instance serializer, which
+automatically translates raw data into model prompts, and a knowledge
+injector, which optionally introduces task- and dataset-specific knowledge to
+enhance DP performance. Our evaluation of Jellyfish, using a range of real
+datasets, shows its competitiveness compared to state-of-the-art methods and
+its strong generalizability to unseen tasks. Jellyfish's performance rivals
+that of GPT series models, and its interpreter offers enhanced reasoning
+capabilities compared to GPT-3.5. Furthermore, our evaluation highlights the
+effectiveness of the techniques employed in constructing Jellyfish. Our model
+is available at Hugging Face:
+https://huggingface.co/NECOUDBFM/Jellyfish .
+
+
+
+
+
+ + ☆ EDALearn: A Comprehensive RTL-to-Signoff EDA Benchmark for Democratized + and Reproducible ML for EDA Research + + +
+ The application of Machine Learning (ML) in Electronic Design Automation +(EDA) for Very Large-Scale Integration (VLSI) design has garnered significant +research attention. Despite the requirement for extensive datasets to build +effective ML models, most studies are limited to smaller, internally generated +datasets due to the lack of comprehensive public resources. In response, we +introduce EDALearn, the first holistic, open-source benchmark suite +specifically for ML tasks in EDA. This benchmark suite presents an end-to-end +flow from synthesis to physical implementation, enriching data collection +across various stages. It fosters reproducibility and promotes research into ML +transferability across different technology nodes. Accommodating a wide range +of VLSI design instances and sizes, our benchmark aptly represents the +complexity of contemporary VLSI designs. Additionally, we provide an in-depth +data analysis, enabling users to fully comprehend the attributes and +distribution of our data, which is essential for creating efficient ML models. +Our contributions aim to encourage further advances in the ML-EDA domain. + +
+
+ comment: 8 pages +
+
+
+
+
+ + ☆ Universal Deoxidation of Semiconductor Substrates Assisted by + Machine-Learning and Real-Time-Feedback-Control + + +
+ Thin film deposition is an essential step in the semiconductor process.
+During preparation or loading, the substrate is unavoidably exposed to air,
+which has motivated studies of process control to remove the surface oxide
+before thin film deposition. Optimizing the deoxidation process in molecular
+beam epitaxy (MBE) for a random substrate is a multidimensional challenge and
+sometimes controversial. Due to variations in semiconductor materials and
+growth processes, the determination of substrate deoxidation temperature is
+highly dependent on the grower's expertise; the same substrate may yield
+inconsistent results when evaluated by different growers. Here, we employ a
+machine learning (ML) hybrid convolution and vision transformer (CNN-ViT)
+model. This model utilizes reflection high-energy electron diffraction (RHEED)
+video as input to determine the deoxidation status of the substrate as output,
+enabling automated substrate deoxidation under a controlled architecture. The
+approach also extends successfully to deoxidation processes on other
+substrates. Furthermore, we showcase the potential of models trained on data
+from a single MBE system to achieve high-accuracy deployment on other
+equipment. In contrast to traditional methods, our approach holds exceptional
+practical value. It standardizes deoxidation temperatures across various
+equipment and substrate materials, advancing the standardization research
+process in semiconductor preparation, a significant milestone in thin film
+growth technology. The concepts and methods demonstrated in this work are
+anticipated to revolutionize semiconductor manufacturing in the optoelectronics
+and microelectronics industries by applying them to diverse material growth
+processes.
+
+
+ comment: 5 figures +
+
+
+
+
+ + ☆ RiskBench: A Scenario-based Benchmark for Risk Identification + + +
+ Intelligent driving systems aim to achieve a zero-collision mobility +experience, requiring interdisciplinary efforts to enhance safety performance. +This work focuses on risk identification, the process of identifying and +analyzing risks stemming from dynamic traffic participants and unexpected +events. While significant advances have been made in the community, the current +evaluation of different risk identification algorithms uses independent +datasets, leading to difficulty in direct comparison and hindering collective +progress toward safety performance enhancement. To address this limitation, we +introduce \textbf{RiskBench}, a large-scale scenario-based benchmark for risk +identification. We design a scenario taxonomy and augmentation pipeline to +enable a systematic collection of ground truth risks under diverse scenarios. +We assess the ability of ten algorithms to (1) detect and locate risks, (2) +anticipate risks, and (3) facilitate decision-making. We conduct extensive +experiments and summarize future research on risk identification. Our aim is to +encourage collaborative endeavors in achieving a society with zero collisions. +We have made our dataset and benchmark toolkit publicly on the project page: +https://hcis-lab.github.io/RiskBench/ + +
+
+
+
+
+ + ☆ AGD: an Auto-switchable Optimizer using Stepwise Gradient Difference for + Preconditioning Matrix NeurIPS '23 + + +
+ Adaptive optimizers, such as Adam, have achieved remarkable success in deep +learning. A key component of these optimizers is the so-called preconditioning +matrix, providing enhanced gradient information and regulating the step size of +each gradient direction. In this paper, we propose a novel approach to +designing the preconditioning matrix by utilizing the gradient difference +between two successive steps as the diagonal elements. These diagonal elements +are closely related to the Hessian and can be perceived as an approximation of +the inner product between the Hessian row vectors and difference of the +adjacent parameter vectors. Additionally, we introduce an auto-switching +function that enables the preconditioning matrix to switch dynamically between +Stochastic Gradient Descent (SGD) and the adaptive optimizer. Based on these +two techniques, we develop a new optimizer named AGD that enhances the +generalization performance. We evaluate AGD on public datasets of Natural +Language Processing (NLP), Computer Vision (CV), and Recommendation Systems +(RecSys). Our experimental results demonstrate that AGD outperforms the +state-of-the-art (SOTA) optimizers, achieving highly competitive or +significantly better predictive performance. Furthermore, we analyze how AGD is +able to switch automatically between SGD and the adaptive optimizer and its +actual effects on various scenarios. The code is available at +https://github.com/intelligent-machine-learning/dlrover/tree/master/atorch/atorch/optimizers. + +
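A minimal, editor-added sketch of the update described above; the paper's exact moment estimates and switching rule may differ, and the threshold `delta` plus the fallback-to-SGD behaviour below are illustrative assumptions:

```python
import torch


class AGDSketch(torch.optim.Optimizer):
    """Illustrative sketch, not the authors' implementation: the diagonal
    preconditioner is built from the difference of successive gradients, and a
    hypothetical threshold `delta` switches between SGD-like and adaptive steps."""

    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), delta=1e-5, eps=1e-8):
        super().__init__(params, dict(lr=lr, betas=betas, delta=delta, eps=eps))

    @torch.no_grad()
    def step(self):
        for group in self.param_groups:
            beta1, beta2 = group["betas"]
            for p in group["params"]:
                if p.grad is None:
                    continue
                g, state = p.grad, self.state[p]
                if len(state) == 0:
                    state["prev_grad"] = torch.zeros_like(g)
                    state["m"] = torch.zeros_like(g)  # first moment of gradients
                    state["v"] = torch.zeros_like(g)  # second moment of gradient differences
                diff = g - state["prev_grad"]         # stepwise gradient difference
                state["m"].mul_(beta1).add_(g, alpha=1 - beta1)
                state["v"].mul_(beta2).addcmul_(diff, diff, value=1 - beta2)
                precond = state["v"].sqrt().add_(group["eps"])
                # Auto-switching: where the preconditioner is tiny, fall back to an
                # SGD-with-momentum step instead of the adaptive step.
                denom = torch.where(precond > group["delta"], precond,
                                    torch.ones_like(precond))
                p.add_(state["m"] / denom, alpha=-group["lr"])
                state["prev_grad"] = g.clone()
```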
+
+ comment: 21 pages. Accepted as a conference paper at NeurIPS '23 +
+
+
+
+
+ + ☆ On Tuning Neural ODE for Stability, Consistency and Faster Convergence + + +
+ Neural ODEs parameterize a differential equation using a continuous-depth
+neural network and solve it with a numerical ODE integrator. These models
+offer a constant memory cost, in contrast to models with a discrete sequence
+of hidden layers, in which memory cost increases linearly with the number of
+layers. In addition to memory efficiency, other benefits of neural ODEs
+include adaptability of the evaluation approach to the input, and flexibility
+to choose numerical precision or fast training. However, despite all these
+benefits, they still have some limitations. We identify the ODE integrator
+(also called the ODE solver) as the weakest link in the chain, as it may have
+consistency, convergence, and stability (CCS) issues and may suffer from
+slower convergence or may not converge at all. We propose a first-order
+Nesterov accelerated gradient (NAG) based ODE solver that is provably tuned
+with respect to the CCS conditions. We empirically demonstrate the efficacy of
+our approach by training faster while achieving better or comparable
+performance against neural ODEs employing other fixed-step explicit ODE
+solvers, as well as discrete-depth models such as ResNet, on three different
+tasks: supervised classification, density estimation, and time-series
+modelling.
+
+
+
+
+
+ + ☆ An End-to-End Network Pruning Pipeline with Sparsity Enforcement + + +
+ Neural networks have emerged as a powerful tool for solving complex tasks +across various domains, but their increasing size and computational +requirements have posed significant challenges in deploying them on +resource-constrained devices. Neural network sparsification, and in particular +pruning, has emerged as an effective technique to alleviate these challenges by +reducing model size, computational complexity, and memory footprint while +maintaining competitive performance. However, many pruning pipelines modify the +standard training pipeline at only a single stage, if at all. In this work, we +look to develop an end-to-end training pipeline that befits neural network +pruning and sparsification at all stages of training. To do so, we make use of +nonstandard model parameter initialization, pre-pruning training methodologies, +and post-pruning training optimizations. We conduct experiments utilizing +combinations of these methods, in addition to different techniques used in the +pruning step, and find that our combined pipeline can achieve significant gains +over current state of the art approaches to neural network sparsification. + +
+
+
+
+
+ + ☆ Characterizing Large Language Model Geometry Solves Toxicity Detection + and Generation + + +
+ Large Language Models~(LLMs) drive current AI breakthroughs despite very +little being known about their internal representations, e.g., how to extract a +few informative features to solve various downstream tasks. To provide a +practical and principled answer, we propose to characterize LLMs from a +geometric perspective. We obtain in closed form (i) the intrinsic dimension in +which the Multi-Head Attention embeddings are constrained to exist and (ii) the +partition and per-region affine mappings of the per-layer feedforward networks. +Our results are informative, do not rely on approximations, and are actionable. +First, we show that, motivated by our geometric interpretation, we can bypass +Llama$2$'s RLHF by controlling its embedding's intrinsic dimension through +informed prompt manipulation. Second, we derive $7$ interpretable spline +features that can be extracted from any (pre-trained) LLM layer, providing a +rich abstract representation of their inputs. Those features alone ($224$ for +Mistral-7B and Llama$2$-7B) are sufficient to help solve toxicity detection, +infer the domain of the prompt, and even tackle the Jigsaw challenge, which +aims at characterizing the type of toxicity of various prompts. Our results +demonstrate how, even in large-scale regimes, exact theoretical results can +answer practical questions in language models. Code: +\url{https://github.com/RandallBalestriero/SplineLLM}. + +
+
+
+
+
+ + ☆ Robust Streaming, Sampling, and a Perspective on Online Learning + + +
+ In this work we present an overview of statistical learning, followed by a +survey of robust streaming techniques and challenges, culminating in several +rigorous results proving the relationship that we motivate and hint at +throughout the journey. Furthermore, we unify often disjoint theorems in a +shared framework and notation to clarify the deep connections that are +discovered. We hope that by approaching these results from a shared +perspective, already aware of the technical connections that exist, we can +enlighten the study of both fields and perhaps motivate new and previously +unconsidered directions of research. + +
+
+
+
+
+ + ☆ GVFs in the Real World: Making Predictions Online for Water Treatment + + +
+ In this paper we investigate the use of reinforcement-learning based +prediction approaches for a real drinking-water treatment plant. Developing +such a prediction system is a critical step on the path to optimizing and +automating water treatment. Before that, there are many questions to answer +about the predictability of the data, suitable neural network architectures, +how to overcome partial observability and more. We first describe this dataset, +and highlight challenges with seasonality, nonstationarity, partial +observability, and heterogeneity across sensors and operation modes of the +plant. We then describe General Value Function (GVF) predictions -- discounted +cumulative sums of observations -- and highlight why they might be preferable +to classical n-step predictions common in time series prediction. We discuss +how to use offline data to appropriately pre-train our temporal difference +learning (TD) agents that learn these GVF predictions, including how to select +hyperparameters for online fine-tuning in deployment. We find that the +TD-prediction agent obtains an overall lower normalized mean-squared error than +the n-step prediction agent. Finally, we show the importance of learning in +deployment, by comparing a TD agent trained purely offline with no online +updating to a TD agent that learns online. This final result is one of the +first to motivate the importance of adapting predictions in real-time, for +non-stationary high-volume systems in the real world. + +
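For readers unfamiliar with GVFs, here is a minimal, editor-added sketch of a linear TD(0) update toward a discounted cumulative sum of a cumulant; the feature construction, cumulant choice, and hyperparameters are assumptions, not the paper's:

```python
import numpy as np

def td_gvf_update(w, x, x_next, cumulant, gamma=0.99, alpha=0.01):
    """One linear TD(0) update for a General Value Function: the prediction
    v(x) = w @ x is moved toward cumulant + gamma * v(x_next), i.e. toward the
    discounted cumulative sum of the chosen cumulant (e.g., a sensor reading)."""
    td_error = cumulant + gamma * (w @ x_next) - (w @ x)
    return w + alpha * td_error * x

# Hypothetical usage on a stream of (features, next features, cumulant) tuples:
# w = np.zeros(num_features)
# for x, x_next, c in stream:
#     w = td_gvf_update(w, x, x_next, c)
```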
+
+ comment: Published in Machine Learning (2023) +
+
+
+
+
+ + ☆ How Many Validation Labels Do You Need? Exploring the Design Space of + Label-Efficient Model Ranking + + +
+ The paper introduces LEMR, a framework that reduces annotation costs for +model selection tasks. Our approach leverages ensemble methods to generate +pseudo-labels, employs uncertainty sampling for target acquisition, and +utilizes a Z-score mechanism for iterative committee reelection to refine model +ranks. We present a systematic study across various selection metrics, +demonstrating that LEMR achieves comparable results to fully labeled datasets +with a fraction of the labeling budget. Our findings indicate that LEMR not +only economizes the labeling effort in weak supervision and semi-supervised +learning settings but also effectively guides prompt selection for large +language models. With extensive experiments across 23 tasks, we reveal that our +framework can dramatically decrease the labeling cost without compromising the +accuracy of model selection, thereby offering a cost-effective alternative to +traditional practices. + +
+
+
+
+
+ + ☆ xNeuSM: Explainable Neural Subgraph Matching with Graph Learnable + Multi-hop Attention Networks + + +
+ Subgraph matching is a challenging problem with a wide range of applications +in database systems, biochemistry, and cognitive science. It involves +determining whether a given query graph is present within a larger target +graph. Traditional graph-matching algorithms provide precise results but face +challenges in large graph instances due to the NP-complete problem, limiting +their practical applicability. In contrast, recent neural network-based +approximations offer more scalable solutions, but often lack interpretable node +correspondences. To address these limitations, this article presents xNeuSM: +Explainable Neural Subgraph Matching which introduces Graph Learnable Multi-hop +Attention Networks (GLeMA) that adaptively learns the parameters governing the +attention factor decay for each node across hops rather than relying on fixed +hyperparameters. We provide a theoretical analysis establishing error bounds +for GLeMA's approximation of multi-hop attention as a function of the number of +hops. Additionally, we prove that learning distinct attention decay factors for +each node leads to a correct approximation of multi-hop attention. Empirical +evaluation on real-world datasets shows that xNeuSM achieves substantial +improvements in prediction accuracy of up to 34% compared to approximate +baselines and, notably, at least a seven-fold faster query time than exact +algorithms. The source code of our implementation is available at +https://github.com/martinakaduc/xNeuSM. + +
+
+ comment: 33 pages, 8 figures, 6 tables +
+
+
+
+
+ + ☆ Deep Learning-Driven Enhancement of Welding Quality Control: Predicting + Welding Depth and Pore Volume in Hairpin Welding + + +
+ To advance quality assurance in the welding process, this study presents a +robust deep learning model that enables the prediction of two critical welds +Key Performance Characteristics (KPCs): welding depth and average pore volume. +In the proposed approach, a comprehensive range of laser welding Key Input +Characteristics (KICs) is utilized, including welding beam geometries, welding +feed rates, path repetitions for weld beam geometries, and bright light weld +ratios for all paths, all of which were obtained from hairpin welding +experiments. Two deep learning networks are employed with multiple hidden dense +layers and linear activation functions to showcase the capabilities of deep +neural networks in capturing the intricate nonlinear connections inherent +within welding KPCs and KICs. Applying deep learning networks to the small +numerical experimental hairpin welding dataset has shown promising results, +achieving Mean Absolute Error (MAE) values as low as 0.1079 for predicting +welding depth and 0.0641 for average pore volume. Additionally, the validity +verification demonstrates the reliability of the proposed method. This, in +turn, promises significant advantages in controlling welding outcomes, moving +beyond the current trend of relying merely on monitoring for defect +classification. + +
+
+
+
+
+ + ☆ TextAug: Test time Text Augmentation for Multimodal Person + Re-identification + + +
+ Multimodal Person Reidentification is gaining popularity in the research +community due to its effectiveness compared to counter-part unimodal +frameworks. However, the bottleneck for multimodal deep learning is the need +for a large volume of multimodal training examples. Data augmentation +techniques such as cropping, flipping, rotation, etc. are often employed in the +image domain to improve the generalization of deep learning models. Augmenting +in other modalities than images, such as text, is challenging and requires +significant computational resources and external data sources. In this study, +we investigate the effectiveness of two computer vision data augmentation +techniques: cutout and cutmix, for text augmentation in multi-modal person +re-identification. Our approach merges these two augmentation strategies into +one strategy called CutMixOut which involves randomly removing words or +sub-phrases from a sentence (Cutout) and blending parts of two or more +sentences to create diverse examples (CutMix) with a certain probability +assigned to each operation. This augmentation was implemented at inference time +without any prior training. Our results demonstrate that the proposed technique +is simple and effective in improving the performance on multiple multimodal +person re-identification benchmarks. + +
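A small editor-added sketch of the Cutout/CutMix combination described above; the function name, probabilities, and span sizes are illustrative assumptions rather than the paper's exact settings:

```python
import random

def cutmixout(text_a, text_b, p_cutout=0.5, p_cutmix=0.5, drop_ratio=0.2):
    """Apply Cutout (drop random words from text_a) and/or CutMix (splice a
    span from text_b into text_a), each with its own probability."""
    tokens = text_a.split()
    if tokens and random.random() < p_cutout:
        keep = max(1, int(len(tokens) * (1 - drop_ratio)))
        kept_idx = sorted(random.sample(range(len(tokens)), keep))
        tokens = [tokens[i] for i in kept_idx]          # Cutout: remove words
    other = text_b.split()
    if other and random.random() < p_cutmix:
        span = max(1, len(other) // 3)
        start = random.randrange(0, len(other) - span + 1)
        insert_at = random.randrange(0, len(tokens) + 1)
        tokens[insert_at:insert_at] = other[start:start + span]  # CutMix: blend spans
    return " ".join(tokens)

# Example: cutmixout("a man in a red jacket", "a woman wearing a blue coat")
```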
+
+ comment: 10 pages, 5 figures +
+
+
+
+
+ + ☆ ActiveClean: Generating Line-Level Vulnerability Data via Active + Learning + + +
+ Deep learning vulnerability detection tools are increasing in popularity and
+have been shown to be effective. These tools rely on a large volume of
+high-quality training data, which is very hard to obtain. Most of the
+currently available datasets provide function-level labels, reporting whether
+a function is vulnerable or not vulnerable. However, for vulnerability
+detection to be useful, we also need to know the lines that are relevant to
+the vulnerability. This paper makes efforts towards developing systematic
+tools and proposes ActiveClean to generate large volumes of line-level
+vulnerability data from commits. That is, in addition to function-level
+labels, it also reports which lines in the function are likely responsible for
+the vulnerability. In the past, static analysis has been applied to clean
+commits to generate line-level data. Our approach, based on active learning,
+is easy to use and scalable, and provides a complementary approach to static
+analysis. We design semantic and syntactic properties from commit lines and
+use them to train the model. We evaluated our approach on both Java and C
+datasets, processing more than 4.3K commits and 119K commit lines. ActiveClean
+achieved F1 scores between 70 and 74. Further, we show that active learning is
+effective: using just 400 training examples, it reaches an F1 score of 70.23.
+Using ActiveClean, we generate line-level labels for the entire FFMpeg project
+in the Devign dataset, including 5K functions, and also detect incorrect
+function-level labels. We demonstrate that using our cleaned data, LineVul, a
+SOTA line-level vulnerability detection tool, detected 70 more vulnerable
+lines and 18 more vulnerable functions, and improved Top-10 accuracy from 66%
+to 73%.
+
+
+
+
+
+ + ☆ Scalable and Independent Learning of Nash Equilibrium Policies in + $n$-Player Stochastic Games with Unknown Independent Chains + + +
+ We study a subclass of $n$-player stochastic games, namely, stochastic games +with independent chains and unknown transition matrices. In this class of +games, players control their own internal Markov chains whose transitions do +not depend on the states/actions of other players. However, players' decisions +are coupled through their payoff functions. We assume players can receive only +realizations of their payoffs, and that the players can not observe the states +and actions of other players, nor do they know the transition probability +matrices of their own Markov chain. Relying on a compact dual formulation of +the game based on occupancy measures and the technique of confidence set to +maintain high-probability estimates of the unknown transition matrices, we +propose a fully decentralized mirror descent algorithm to learn an +$\epsilon$-NE for this class of games. The proposed algorithm has the desired +properties of independence, scalability, and convergence. Specifically, under +no assumptions on the reward functions, we show the proposed algorithm +converges in polynomial time in a weaker distance (namely, the averaged +Nikaido-Isoda gap) to the set of $\epsilon$-NE policies with arbitrarily high +probability. Moreover, assuming the existence of a variationally stable Nash +equilibrium policy, we show that the proposed algorithm converges +asymptotically to the stable $\epsilon$-NE policy with arbitrarily high +probability. In addition to Markov potential games and linear-quadratic +stochastic games, this work provides another subclass of $n$-player stochastic +games that, under some mild assumptions, admit polynomial-time learning +algorithms for finding their stationary $\epsilon$-NE policies. + +
+
+
+
+
+ + ☆ OCGEC: One-class Graph Embedding Classification for DNN Backdoor + Detection + + +
+ Deep neural networks (DNNs) have been found vulnerable to backdoor attacks,
+raising security concerns about their deployment in mission-critical
+applications. There are various approaches to detect backdoor attacks;
+however, they all make certain assumptions about the target attack to be
+detected and require equal and huge numbers of clean and backdoor samples for
+training, which renders these detection methods quite limiting in real-world
+circumstances.
+ This study proposes a novel one-class classification framework called
+One-class Graph Embedding Classification (OCGEC) that uses GNNs for model-level
+backdoor detection with only a small amount of clean data. First, we train
+thousands of tiny models as raw datasets from a small number of clean datasets.
+Following that, we design an ingenious model-to-graph method for converting the
+model's structural details and weight features into graph data. We then
+pre-train a generative self-supervised graph autoencoder (GAE) to better learn
+the features of benign models in order to detect backdoor models without
+knowing the attack strategy. After that, we dynamically combine the GAE and
+one-class classifier optimization goals to form classification boundaries that
+distinguish backdoor models from benign models.
+ Our OCGEC combines the powerful representation capabilities of graph neural
+networks with the utility of one-class classification techniques in the field
+of anomaly detection. In comparison to other baselines, it achieves AUC scores
+of more than 98% on a number of tasks, far exceeding existing detection
+methods even when they rely on a huge number of positive and negative
+samples. Our pioneering application of graph learning to generic backdoor
+detection can provide new insights that can be used to improve other backdoor
+defense tasks. Code is available at https://github.com/jhy549/OCGEC.
+
+
+ comment: 13 pages, 9 figures +
+
+
+
+
+ + ☆ Signed Binarization: Unlocking Efficiency Through Repetition-Sparsity + Trade-Off + + +
+ Efficient inference of Deep Neural Networks (DNNs) on resource-constrained +edge devices is essential. Quantization and sparsity are key algorithmic +techniques that translate to repetition and sparsity within tensors at the +hardware-software interface. This paper introduces the concept of +repetition-sparsity trade-off that helps explain computational efficiency +during inference. We propose Signed Binarization, a unified co-design framework +that synergistically integrates hardware-software systems, quantization +functions, and representation learning techniques to address this trade-off. +Our results demonstrate that Signed Binarization is more accurate than +binarization with the same number of non-zero weights. Detailed analysis +indicates that signed binarization generates a smaller distribution of +effectual (non-zero) parameters nested within a larger distribution of total +parameters, both of the same type, for a DNN block. Finally, our approach +achieves a 26% speedup on real hardware, doubles energy efficiency, and reduces +density by 2.8x compared to binary methods for ResNet 18, presenting an +alternative solution for deploying efficient models in resource-limited +environments. + +
+
+
+
+
+ + ☆ RJHMC-Tree for Exploration of the Bayesian Decision Tree Posterior + + +
+ Decision trees have found widespread application within the machine learning +community due to their flexibility and interpretability. This paper is directed +towards learning decision trees from data using a Bayesian approach, which is +challenging due to the potentially enormous parameter space required to span +all tree models. Several approaches have been proposed to combat this +challenge, with one of the more successful being Markov chain Monte Carlo +(MCMC) methods. The efficacy and efficiency of MCMC methods fundamentally rely +on the quality of the so-called proposals, which is the focus of this paper. In +particular, this paper investigates using a Hamiltonian Monte Carlo (HMC) +approach to explore the posterior of Bayesian decision trees more efficiently +by exploiting the geometry of the likelihood within a global update scheme. Two +implementations of the novel algorithm are developed and compared to existing +methods by testing against standard datasets in the machine learning and +Bayesian decision tree literature. HMC-based methods are shown to perform +favourably with respect to predictive test accuracy, acceptance rate, and tree +complexity. + +
+
+ comment: 43 pages, 7 figures +
+
+
+
+
+ + ☆ Toward Automated Quantum Variational Machine Learning + + +
+ In this work, we address the problem of automating quantum variational +machine learning. We develop a multi-locality parallelizable search algorithm, +called MUSE, to find the initial points and the sets of parameters that achieve +the best performance for quantum variational circuit learning. Simulations with +five real-world classification datasets indicate that on average, MUSE improves +the detection accuracy of quantum variational classifiers 2.3 times with +respect to the observed lowest scores. Moreover, when applied to two real-world +regression datasets, MUSE improves the quality of the predictions from negative +coefficients of determination to positive ones. Furthermore, the classification +and regression scores of the quantum variational models trained with MUSE are +on par with the classical counterparts. + +
+
+
+
+
+ + ☆ APoLLo: Unified Adapter and Prompt Learning for Vision Language Models EMNLP 2023 + + +
+ The choice of input text prompt plays a critical role in the performance of +Vision-Language Pretrained (VLP) models such as CLIP. We present APoLLo, a +unified multi-modal approach that combines Adapter and Prompt learning for +Vision-Language models. Our method is designed to substantially improve the +generalization capabilities of VLP models when they are fine-tuned in a +few-shot setting. We introduce trainable cross-attention-based adapter layers +in conjunction with vision and language encoders to strengthen the alignment +between the two modalities. We enforce consistency between the respective +encoder branches (receiving augmented inputs) to prevent overfitting in +downstream tasks. Our method is evaluated on three representative tasks: +generalization to novel classes, cross-dataset evaluation, and unseen domain +shifts. In practice, APoLLo achieves a relative gain up to 6.03% over MaPLe +(SOTA) on novel classes for 10 diverse image recognition datasets. + +
+
+ comment: Accepted at EMNLP 2023 (Main track) +
+
+
+
+
+ + ☆ Explainable AI is Responsible AI: How Explainability Creates Trustworthy + and Socially Responsible Artificial Intelligence + + +
+ Artificial intelligence (AI) has been clearly established as a technology +with the potential to revolutionize fields from healthcare to finance - if +developed and deployed responsibly. This is the topic of responsible AI, which +emphasizes the need to develop trustworthy AI systems that minimize bias, +protect privacy, support security, and enhance transparency and accountability. +Explainable AI (XAI) has been broadly considered as a building block for +responsible AI (RAI), with most of the literature considering it as a solution +for improved transparency. This work proposes that XAI and responsible AI are +significantly more deeply entwined. In this work, we explore state-of-the-art +literature on RAI and XAI technologies. Based on our findings, we demonstrate +that XAI can be utilized to ensure fairness, robustness, privacy, security, and +transparency in a wide range of contexts. Our findings lead us to conclude that +XAI is an essential foundation for every pillar of RAI. + +
+
+ comment: 35 pages, 7 figures (figures 3-6 include subfigures) +
+
+
+
+
+ + ☆ Near-Optimal Algorithms for Gaussians with Huber Contamination: Mean + Estimation and Linear Regression NeurIPS 2023 + + +
+ We study the fundamental problems of Gaussian mean estimation and linear +regression with Gaussian covariates in the presence of Huber contamination. Our +main contribution is the design of the first sample near-optimal and almost +linear-time algorithms with optimal error guarantees for both of these +problems. Specifically, for Gaussian robust mean estimation on $\mathbb{R}^d$ +with contamination parameter $\epsilon \in (0, \epsilon_0)$ for a small +absolute constant $\epsilon_0$, we give an algorithm with sample complexity $n += \tilde{O}(d/\epsilon^2)$ and almost linear runtime that approximates the +target mean within $\ell_2$-error $O(\epsilon)$. This improves on prior work +that achieved this error guarantee with polynomially suboptimal sample and time +complexity. For robust linear regression, we give the first algorithm with +sample complexity $n = \tilde{O}(d/\epsilon^2)$ and almost linear runtime that +approximates the target regressor within $\ell_2$-error $O(\epsilon)$. This is +the first polynomial sample and time algorithm achieving the optimal error +guarantee, answering an open question in the literature. At the technical +level, we develop a methodology that yields almost-linear time algorithms for +multi-directional filtering that may be of broader interest. + +
+
+ comment: To appear in NeurIPS 2023 +
+
+
+
+
+ + ☆ KEEC: Embed to Control on An Equivariant Geometry + + +
+ This paper investigates how representation learning can enable optimal +control in unknown and complex dynamics, such as chaotic and non-linear +systems, without relying on prior domain knowledge of the dynamics. The core +idea is to establish an equivariant geometry that is diffeomorphic to the +manifold defined by a dynamical system and to perform optimal control within +this corresponding geometry, which is a non-trivial task. To address this +challenge, Koopman Embed to Equivariant Control (KEEC) is introduced for model +learning and control. Inspired by Lie theory, KEEC begins by learning a +non-linear dynamical system defined on a manifold and embedding trajectories +into a Lie group. Subsequently, KEEC formulates an equivariant value function +equation in reinforcement learning on the equivariant geometry, ensuring an +invariant effect as the value function on the original manifold. By deriving +analytical-form optimal actions on the equivariant value function, KEEC +theoretically achieves quadratic convergence for the optimal equivariant value +function by leveraging the differential information on the equivariant +geometry. The effectiveness of KEEC is demonstrated in challenging dynamical +systems, including chaotic ones like Lorenz-63. Notably, our findings indicate +that isometric and isomorphic loss functions, ensuring the compactness and +smoothness of geometry, outperform loss functions without these properties. + +
+
+
+
+
+ + ☆ Unsupervised Change Detection for Space Habitats Using 3D Point Clouds + + +
+ This work presents an algorithm for scene change detection from point clouds +to enable autonomous robotic caretaking in future space habitats. Autonomous +robotic systems will help maintain future deep-space habitats, such as the +Gateway space station, which will be uncrewed for extended periods. Existing +scene analysis software used on the International Space Station (ISS) relies on +manually-labeled images for detecting changes. In contrast, the algorithm +presented in this work uses raw, unlabeled point clouds as inputs. The +algorithm first applies modified Expectation-Maximization Gaussian Mixture +Model (GMM) clustering to two input point clouds. It then performs change +detection by comparing the GMMs using the Earth Mover's Distance. The algorithm +is validated quantitatively and qualitatively using a test dataset collected by +an Astrobee robot in the NASA Ames Granite Lab comprising single frame depth +images taken directly by Astrobee and full-scene reconstructed maps built with +RGB-D and pose data from Astrobee. The runtimes of the approach are also +analyzed in depth. The source code is publicly released to promote further +development. + +
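A compact, editor-added sketch of the pipeline described above, using scikit-learn for the EM-fitted GMMs and the POT library for the Earth Mover's Distance; the released code may differ (for example, it uses a modified EM step and may compare full mixtures rather than just component means):

```python
import ot                                    # POT: Python Optimal Transport
from sklearn.mixture import GaussianMixture

def gmm_emd_change_score(cloud_before, cloud_after, n_components=8):
    """Fit a Gaussian Mixture Model to each (N, 3) point cloud, then score the
    change between scenes as an Earth Mover's Distance between the mixtures,
    here approximated over component means weighted by the mixture weights."""
    gmm_a = GaussianMixture(n_components=n_components, random_state=0).fit(cloud_before)
    gmm_b = GaussianMixture(n_components=n_components, random_state=0).fit(cloud_after)
    cost = ot.dist(gmm_a.means_, gmm_b.means_)   # pairwise squared distances between means
    return ot.emd2(gmm_a.weights_, gmm_b.weights_, cost)

# Hypothetical usage with two (N, 3) numpy arrays of XYZ points:
# score = gmm_emd_change_score(scan_t0, scan_t1)
```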
+
+ comment: 15 pages, 7 figures, Manuscript will be presented at the AIAA SciTech + Forum in Orlando, FL, USA, 8 - 12 January 2024 +
+
+
+
+
+ + ☆ Dissecting Medical Referral Mechanisms in Health Services: Role of + Physician Professional Networks + + +
+ Medical referrals between primary care physicians (PC) and specialist care +(SC) physicians profoundly impact patient care regarding quality, satisfaction, +and cost. This paper investigates the influence of professional networks among +medical doctors on referring patients from PC to SC. Using five-year +consultation data from a Portuguese private health provider, we conducted +exploratory data analysis and constructed both professional and referral +networks among physicians. We then apply Graph Neural Network (GNN) models to +learn latent representations of the referral network. Our analysis supports the +hypothesis that doctors' professional social connections can predict medical +referrals, potentially enhancing collaboration within organizations and +improving healthcare services. This research contributes to dissecting the +underlying mechanisms in primary-specialty referrals, thereby providing +valuable insights for enhancing patient care and effective healthcare +management. + +
+
+ comment: 27 pages, 9 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ AMAGO: Scalable In-Context Reinforcement Learning for Adaptive Agents + + +
+ We introduce AMAGO, an in-context Reinforcement Learning (RL) agent that uses +sequence models to tackle the challenges of generalization, long-term memory, +and meta-learning. Recent works have shown that off-policy learning can make +in-context RL with recurrent policies viable. Nonetheless, these approaches +require extensive tuning and limit scalability by creating key bottlenecks in +agents' memory capacity, planning horizon, and model size. AMAGO revisits and +redesigns the off-policy in-context approach to successfully train +long-sequence Transformers over entire rollouts in parallel with end-to-end RL. +Our agent is uniquely scalable and applicable to a wide range of problems. We +demonstrate its strong performance empirically in meta-RL and long-term memory +domains. AMAGO's focus on sparse rewards and off-policy data also allows +in-context learning to extend to goal-conditioned problems with challenging +exploration. When combined with a novel hindsight relabeling scheme, AMAGO can +solve a previously difficult category of open-world domains, where agents +complete many possible instructions in procedurally generated environments. We +evaluate our agent on three goal-conditioned domains and study how its +individual improvements connect to create a generalist policy. + +
+
+
+
+
+ + ♻ ☆ ContriMix: Unsupervised disentanglement of content and attribute for + domain generalization in microscopy image analysis + + +
+ Domain generalization is critical for real-world applications of machine +learning to microscopy images, including histopathology and fluorescence +imaging. Artifacts in these modalities arise through a complex combination of +factors relating to tissue collection and laboratory processing, as well as +factors intrinsic to patient samples. In fluorescence imaging, these artifacts +stem from variations across experimental batches. The complexity and subtlety +of these artifacts make the enumeration of data domains intractable. Therefore, +augmentation-based methods of domain generalization that require domain +identifiers and manual fine-tuning are inadequate in this setting. To overcome +this challenge, we introduce ContriMix, a domain generalization technique that +learns to generate synthetic images by disentangling and permuting the +biological content ("content") and technical variations ("attributes") in +microscopy images. ContriMix does not rely on domain identifiers or handcrafted +augmentations and makes no assumptions about the input characteristics of +images. We assess the performance of ContriMix on two pathology datasets +dealing with patch classification and Whole Slide Image label prediction tasks +respectively (Camelyon17-WILDS and RCC subtyping), and one fluorescence +microscopy dataset (RxRx1-WILDS). Without any access to domain identifiers at +train or test time, ContriMix performs similar or better than current +state-of-the-art methods in all these datasets, motivating its usage for +microscopy image analysis in real-world settings where domain information is +hard to come by. The code for ContriMix can be found at +https://gitlab.com/huutan86/contrimix + +
+
+
+
+
+ + ♻ ☆ SCOPE-RL: A Python Library for Offline Reinforcement Learning and + Off-Policy Evaluation + + +
+ This paper introduces SCOPE-RL, a comprehensive open-source Python software
+designed for offline reinforcement learning (offline RL), off-policy evaluation
+(OPE), and selection (OPS). Unlike most existing libraries that focus solely on
+either policy learning or evaluation, SCOPE-RL seamlessly integrates these two
+key aspects, facilitating flexible and complete implementations of both offline
+RL and OPE processes. SCOPE-RL puts particular emphasis on its OPE modules,
+offering a range of OPE estimators and robust evaluation-of-OPE protocols. This
+approach enables more in-depth and reliable OPE compared to other packages. For
+instance, SCOPE-RL enhances OPE by estimating the entire reward distribution
+under a policy rather than its mere point-wise expected value. Additionally,
+SCOPE-RL provides a more thorough evaluation-of-OPE by presenting the
+risk-return tradeoff in OPE results, extending beyond mere accuracy evaluations
+in existing OPE literature. SCOPE-RL is designed with user accessibility in
+mind. Its user-friendly APIs, comprehensive documentation, and a variety of
+easy-to-follow examples assist researchers and practitioners in efficiently
+implementing and experimenting with various offline RL methods and OPE
+estimators, tailored to their specific problem contexts. The documentation of
+SCOPE-RL is available at https://scope-rl.readthedocs.io/en/latest/.
+
+
+ comment: preprint, open-source software: + https://github.com/hakuhodo-technologies/scope-rl +
+
+
+
+
+ + ♻ ☆ Towards Assessing and Benchmarking Risk-Return Tradeoff of Off-Policy + Evaluation + + +
+ Off-Policy Evaluation (OPE) aims to assess the effectiveness of +counterfactual policies using only offline logged data and is often used to +identify the top-k promising policies for deployment in online A/B tests. +Existing evaluation metrics for OPE estimators primarily focus on the +"accuracy" of OPE or that of downstream policy selection, neglecting +risk-return tradeoff in the subsequent online policy deployment. To address +this issue, we draw inspiration from portfolio evaluation in finance and +develop a new metric, called SharpeRatio@k, which measures the risk-return +tradeoff of policy portfolios formed by an OPE estimator under varying online +evaluation budgets (k). We validate our metric in two example scenarios, +demonstrating its ability to effectively distinguish between low-risk and +high-risk estimators and to accurately identify the most efficient estimator. +This efficient estimator is characterized by its capability to form the most +advantageous policy portfolios, maximizing returns while minimizing risks +during online deployment, a nuance that existing metrics typically overlook. To +facilitate a quick, accurate, and consistent evaluation of OPE via +SharpeRatio@k, we have also integrated this metric into an open-source +software, SCOPE-RL. Employing SharpeRatio@k and SCOPE-RL, we conduct +comprehensive benchmarking experiments on various estimators and RL tasks, +focusing on their risk-return tradeoff. These experiments offer several +interesting directions and suggestions for future OPE research. + +
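A hedged, editor-added sketch of a SharpeRatio@k-style computation; the paper's exact definition (for example, how the portfolio return and risk are measured) may differ from this simplified version:

```python
import numpy as np

def sharpe_ratio_at_k(estimated_values, true_values, baseline_value, k):
    """Form a portfolio of the top-k policies as ranked by an OPE estimator,
    then score its return relative to a baseline policy, normalized by the
    standard deviation (risk) of the portfolio's true returns."""
    top_k = np.argsort(estimated_values)[::-1][:k]
    portfolio = np.asarray(true_values)[top_k]
    excess_return = portfolio.mean() - baseline_value
    risk = portfolio.std() + 1e-12              # avoid division by zero
    return excess_return / risk
```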
+
+ comment: preprint, under review +
+
+
+
+
+ + ♻ ☆ Accuracy Improvement in Differentially Private Logistic Regression: A + Pre-training Approach + + +
+ Machine learning (ML) models can memorize training datasets. As a result,
+training ML models over private datasets can lead to the violation of
+individuals' privacy. Differential privacy (DP) is a rigorous privacy notion to
+preserve the privacy of underlying training datasets. Yet, training ML models
+in a DP framework usually degrades the accuracy of ML models. This paper aims
+to boost the accuracy of a DP logistic regression (LR) via a pre-training
+module. In more detail, we initially pre-train our LR model on a public
+training dataset about which there are no privacy concerns. Then, we fine-tune
+our DP-LR model with the private dataset. In the numerical results, we show
+that adding a pre-training module significantly improves the accuracy of the
+DP-LR model.
+
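An editor-added sketch of the two-stage recipe: ordinary gradient descent on the public data followed by a DP-SGD-style fine-tuning pass on the private data. The clipping norm, noise multiplier, and the omitted privacy accounting are illustrative assumptions, not the paper's settings:

```python
import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def pretrain_then_dp_finetune(X_pub, y_pub, X_priv, y_priv,
                              lr=0.1, epochs=50, clip=1.0, noise_mult=1.0):
    """Stage 1: plain gradient descent on public data (no privacy cost).
    Stage 2: DP-SGD-style fine-tuning on private data with per-example gradient
    clipping and Gaussian noise. Privacy accounting is omitted in this sketch."""
    w = np.zeros(X_pub.shape[1])
    for _ in range(epochs):                                    # public pre-training
        grad = X_pub.T @ (sigmoid(X_pub @ w) - y_pub) / len(y_pub)
        w -= lr * grad
    for _ in range(epochs):                                    # private DP fine-tuning
        per_ex = X_priv * (sigmoid(X_priv @ w) - y_priv)[:, None]
        norms = np.linalg.norm(per_ex, axis=1, keepdims=True)
        per_ex = per_ex / np.maximum(1.0, norms / clip)        # clip per-example gradients
        noisy_sum = per_ex.sum(axis=0) + np.random.normal(0.0, noise_mult * clip, w.shape)
        w -= lr * noisy_sum / len(y_priv)
    return w
```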
+
+
+
+
+ + ♻ ☆ A Nonstochastic Control Approach to Optimization + + +
+ Selecting the best hyperparameters for a particular optimization instance, +such as the learning rate and momentum, is an important but nonconvex problem. +As a result, iterative optimization methods such as hypergradient descent lack +global optimality guarantees in general. + We propose an online nonstochastic control methodology for mathematical +optimization. First, we formalize the setting of meta-optimization, an online +learning formulation of learning the best optimization algorithm from a class +of methods. The meta-optimization problem over gradient-based methods can be +framed as a feedback control problem over the choice of hyperparameters, +including the learning rate, momentum, and the preconditioner. + Although the original optimal control problem is nonconvex, we show how +recent methods from online nonstochastic control using convex relaxations can +be used to overcome the challenge of nonconvexity, and obtain regret guarantees +against the best offline solution. This guarantees that in meta-optimization, +given a sequence of optimization problems, we can learn a method that attains +convergence comparable to that of the best optimization method in hindsight +from a class of methods. + +
+
+
+
+
+ + ♻ ☆ From Monte Carlo to neural networks approximations of boundary value + problems + + +
+ In this paper we study probabilistic and neural network approximations for
+solutions to the Poisson equation subject to Hölder data in general bounded
+domains of $\mathbb{R}^d$. We aim at two fundamental goals.
+ The first, and the most important: we show that the solution to the Poisson
+equation can be numerically approximated in the sup-norm by Monte Carlo
+methods, and that this can be done highly efficiently if we use a modified
+version of the walk on spheres algorithm as an acceleration method. This
+provides estimates which are efficient with respect to the prescribed
+approximation error and with polynomial complexity in the dimension and the
+reciprocal of the error. A crucial feature is that the overall number of
+samples does not depend on the point at which the approximation is performed.
+ As a second goal, we show that the obtained Monte Carlo solver renders, in a
+constructive way, ReLU deep neural network (DNN) solutions to the Poisson
+problem, whose sizes depend at most polynomially on the dimension $d$ and on
+the desired error. In fact we show that the random DNN provides with high
+probability a small approximation error and low polynomial complexity in the
+dimension.
+
+
+
+
+
+ + ♻ ☆ Improving Intrinsic Exploration by Creating Stationary Objectives NeurIPS + + +
+ Exploration bonuses in reinforcement learning guide long-horizon exploration +by defining custom intrinsic objectives. Several exploration objectives like +count-based bonuses, pseudo-counts, and state-entropy maximization are +non-stationary and hence are difficult to optimize for the agent. While this +issue is generally known, it is usually omitted and solutions remain +under-explored. The key contribution of our work lies in transforming the +original non-stationary rewards into stationary rewards through an augmented +state representation. For this purpose, we introduce the Stationary Objectives +For Exploration (SOFE) framework. SOFE requires identifying sufficient +statistics for different exploration bonuses and finding an efficient encoding +of these statistics to use as input to a deep network. SOFE is based on +proposing state augmentations that expand the state space but hold the promise +of simplifying the optimization of the agent's objective. We show that SOFE +improves the performance of several exploration objectives, including +count-based bonuses, pseudo-counts, and state-entropy maximization. Moreover, +SOFE outperforms prior methods that attempt to stabilize the optimization of +intrinsic objectives. We demonstrate the efficacy of SOFE in hard-exploration +problems, including sparse-reward tasks, pixel-based observations, 3D +navigation, and procedurally generated environments. + +
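An editor-added sketch of the state-augmentation idea for a count-based bonus, assuming a classic gym-style environment with a four-tuple `step` return; the encoding of the sufficient statistic (here, appending 1/sqrt(N(s)) to the observation) is an illustrative choice, not necessarily the paper's:

```python
import numpy as np
from collections import defaultdict

class CountAugmentedEnv:
    """Wrap an environment so a count-based bonus becomes a stationary function
    of an augmented state: the sufficient statistic (the visit count of the
    current state) is appended to the observation seen by the agent."""

    def __init__(self, env, beta=0.1):
        self.env, self.beta = env, beta
        self.counts = defaultdict(int)

    def reset(self):
        obs = self.env.reset()
        self.counts[self._key(obs)] += 1
        return self._augment(obs)

    def step(self, action):
        obs, reward, done, info = self.env.step(action)   # classic gym-style API
        self.counts[self._key(obs)] += 1
        bonus = self.beta / np.sqrt(self.counts[self._key(obs)])
        return self._augment(obs), reward + bonus, done, info

    def _key(self, obs):
        return tuple(np.round(np.asarray(obs), 2).ravel())

    def _augment(self, obs):
        stat = 1.0 / np.sqrt(self.counts[self._key(obs)])  # encoding of the visit count
        return np.concatenate([np.asarray(obs, dtype=np.float32).ravel(), [stat]])
```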
+
+ comment: Accepted at the ALOE 2023 Workshop at NeurIPS. Under Review at ICLR +
+
+
+
+
+ + ♻ ☆ How Deep Neural Networks Learn Compositional Data: The Random Hierarchy + Model + + +
+ Deep learning algorithms demonstrate a surprising ability to learn +high-dimensional tasks from limited examples. This is commonly attributed to +the depth of neural networks, enabling them to build a hierarchy of abstract, +low-dimensional data representations. However, how many training examples are +required to learn such representations remains unknown. To quantitatively study +this question, we introduce the Random Hierarchy Model: a family of synthetic +tasks inspired by the hierarchical structure of language and images. The model +is a classification task where each class corresponds to a group of high-level +features, chosen among several equivalent groups associated with the same +class. In turn, each feature corresponds to a group of sub-features chosen +among several equivalent ones and so on, following a hierarchy of composition +rules. We find that deep networks learn the task by developing internal +representations invariant to exchanging equivalent groups. Moreover, the number +of data required corresponds to the point where correlations between low-level +features and classes become detectable. Overall, our results indicate how deep +networks overcome the curse of dimensionality by building invariant +representations, and provide an estimate of the number of data required to +learn a hierarchical task. + +
+
+ comment: 9 pages, 8 figures +
+
+
+
+
+ + ♻ ☆ Generating Realistic Counterfactuals for Retinal Fundus and OCT Images + using Diffusion Models + + +
+ Counterfactual reasoning is often used in clinical settings to explain +decisions or weigh alternatives. Therefore, for imaging based specialties such +as ophthalmology, it would be beneficial to be able to create counterfactual +images, illustrating answers to questions like "If the subject had had diabetic +retinopathy, how would the fundus image have looked?". Here, we demonstrate +that using a diffusion model in combination with an adversarially robust +classifier trained on retinal disease classification tasks enables the +generation of highly realistic counterfactuals of retinal fundus images and +optical coherence tomography (OCT) B-scans. The key to the realism of +counterfactuals is that these classifiers encode salient features indicative +for each disease class and can steer the diffusion model to depict disease +signs or remove disease-related lesions in a realistic way. In a user study, +domain experts also found the counterfactuals generated using our method +significantly more realistic than counterfactuals generated from a previous +method, and even indistinguishable from real images. + +
+
+
+
+
+ + ♻ ☆ Interpreting and Improving Diffusion Models Using the Euclidean Distance + Function + + +
+ Denoising is intuitively related to projection. Indeed, under the manifold +hypothesis, adding random noise is approximately equivalent to orthogonal +perturbation. Hence, learning to denoise is approximately learning to project. +In this paper, we use this observation to reinterpret denoising diffusion +models as approximate gradient descent applied to the Euclidean distance +function. We then provide straight-forward convergence analysis of the DDIM +sampler under simple assumptions on the projection-error of the denoiser. +Finally, we propose a new sampler based on two simple modifications to DDIM +using insights from our theoretical results. In as few as 5-10 function +evaluations, our sampler achieves state-of-the-art FID scores on pretrained +CIFAR-10 and CelebA models and can generate high quality samples on latent +diffusion models. + +
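An editor-added sketch of the projection view described above, assuming a denoiser that maps a noisy sample directly to a clean estimate; this is the plain gradient-descent interpretation, not the paper's modified DDIM sampler:

```python
import torch

def distance_descent_sample(denoiser, x, num_steps=10, step_size=1.0):
    """If the denoiser approximates the projection of x onto the data manifold,
    then x - denoiser(x) approximates the gradient of 0.5 * d(x)^2, where d is
    the Euclidean distance function, so sampling becomes repeated gradient steps."""
    for _ in range(num_steps):
        with torch.no_grad():
            x_hat = denoiser(x)                 # estimated projection of x
        x = x - step_size * (x - x_hat)         # gradient step on 0.5 * d(x)^2
    return x
```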
+
+ comment: 19 pages, 8 figures, 2 tables +
+
+
+
+
+ + ♻ ☆ Space-Time Attention with Shifted Non-Local Search + + +
+ Efficiently computing attention maps for videos is challenging due to the +motion of objects between frames. While a standard non-local search is +high-quality for a window surrounding each query point, the window's small size +cannot accommodate motion. Methods for long-range motion use an auxiliary +network to predict the most similar key coordinates as offsets from each query +location. However, accurately predicting this flow field of offsets remains +challenging, even for large-scale networks. Small spatial inaccuracies +significantly impact the attention module's quality. This paper proposes a +search strategy that combines the quality of a non-local search with the range +of predicted offsets. The method, named Shifted Non-Local Search, executes a +small grid search surrounding the predicted offsets to correct small spatial +errors. Our method's in-place computation consumes 10 times less memory and is +over 3 times faster than previous work. Experimentally, correcting the small +spatial errors improves the video frame alignment quality by over 3 dB PSNR. +Our search upgrades existing space-time attention modules, which improves video +denoising results by 0.30 dB PSNR for a 7.5% increase in overall runtime. We +integrate our space-time attention module into a UNet-like architecture to +achieve state-of-the-art results on video denoising. + +
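A slow, editor-added reference sketch of the search strategy (the actual module is an optimized in-place kernel); the tensor shapes and the squared-distance similarity are assumptions for illustration:

```python
import torch

def shifted_non_local_search(queries, keys, pred_offsets, radius=1):
    """For each query location, search a (2*radius+1)^2 grid centred at the
    flow-predicted offset and keep the best-matching key location.
    queries/keys: (H, W, C) feature maps; pred_offsets: (H, W, 2) offsets."""
    H, W, _ = queries.shape
    best = torch.full((H, W), float("inf"))
    corrected = pred_offsets.clone()
    for i in range(H):
        for j in range(W):
            oi, oj = int(pred_offsets[i, j, 0]), int(pred_offsets[i, j, 1])
            for di in range(-radius, radius + 1):
                for dj in range(-radius, radius + 1):
                    ii = min(max(i + oi + di, 0), H - 1)
                    jj = min(max(j + oj + dj, 0), W - 1)
                    dist = torch.sum((queries[i, j] - keys[ii, jj]) ** 2)
                    if dist < best[i, j]:
                        best[i, j] = dist
                        corrected[i, j, 0] = oi + di
                        corrected[i, j, 1] = oj + dj
    return corrected, best
```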
+
+ comment: 15 pages, 12 figures +
+
+
+
+
+ + ♻ ☆ A Baseline Analysis of Reward Models' Ability To Accurately Analyze + Foundation Models Under Distribution Shift + + +
+ Foundation models, specifically Large Language Models (LLMs), have lately
+gained widespread attention and adoption. Reinforcement Learning with Human
+Feedback (RLHF) involves training a reward model to capture desired behaviors,
+which is then used to align LLMs. These reward models are additionally used at
+inference-time to estimate LLM responses' adherence to those desired behaviors.
+However, there is little work measuring how robust these reward models are to
+distribution shifts. In this work, we evaluate how reward model performance -
+measured via accuracy and calibration (i.e. alignment between accuracy and
+confidence) - is affected by distribution shift. We show novel calibration
+patterns and accuracy drops due to OOD prompts and responses, and that the
+reward model is more sensitive to shifts in responses than prompts.
+Additionally, we adapt an OOD detection technique commonly used in
+classification to the reward model setting to detect these distribution shifts
+in prompts and responses.
+
+
+
+
+
+ + ♻ ☆ SMT 2.0: A Surrogate Modeling Toolbox with a focus on Hierarchical and + Mixed Variables Gaussian Processes + + +
+ The Surrogate Modeling Toolbox (SMT) is an open-source Python package that
+offers a collection of surrogate modeling methods, sampling techniques, and a
+set of sample problems. This paper presents SMT 2.0, a major new release of SMT
+that introduces significant upgrades and new features to the toolbox. This
+release adds the capability to handle mixed-variable surrogate models and
+hierarchical variables. These types of variables are becoming increasingly
+important in several surrogate modeling applications. SMT 2.0 also improves SMT
+by extending sampling methods, adding new surrogate models, and computing
+variance and kernel derivatives for Kriging. This release also includes new
+functions to handle noisy data and to use multifidelity data. To the best of
+our knowledge, SMT 2.0 is the first open-source surrogate library to propose
+surrogate models for hierarchical and mixed inputs. This open-source software
+is distributed under the New BSD license.
+
+
+ comment: version 4 +
+
+
+
+
+ + ♻ ☆ Heuristic Optimal Transport in Branching Networks + + +
+ Optimal transport aims to learn a mapping of sources to targets by minimizing
+the cost, which is typically defined as a function of distance. The solution to
+this problem consists of straight line segments optimally connecting sources to
+targets, and it does not exhibit branching. These optimal solutions are in
+stark contrast with both natural and man-made transportation networks, where
+branching structures are prevalent. Here we discuss a fast heuristic branching
+method for optimal transport in networks, and we provide several applications.
+
+
+
+ comment: 11 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Improving the Robustness of Summarization Models by Detecting and + Removing Input Noise EMNLP + + +
+ The evaluation of abstractive summarization models typically uses test data +that is identically distributed as training data. In real-world practice, +documents to be summarized may contain input noise caused by text extraction +artifacts or data pipeline bugs. The robustness of model performance under +distribution shift caused by such noise is relatively under-studied. We present +a large empirical study quantifying the sometimes severe loss in performance +(up to 12 ROUGE-1 points) from different types of input noise for a range of +datasets and model sizes. We then propose a light-weight method for detecting +and removing such noise in the input during model inference without requiring +any extra training, auxiliary models, or even prior knowledge of the type of +noise. Our proposed approach effectively mitigates the loss in performance, +recovering a large fraction of the performance drop, sometimes as large as 11 +ROUGE-1 points. + +
+
+ comment: EMNLP Findings 2023 Camera Ready +
+
+
+
+
+ + ♻ ☆ BioCLIP: A Vision Foundation Model for the Tree of Life + + +
+ Images of the natural world, collected by a variety of cameras, from drones +to individual phones, are increasingly abundant sources of biological +information. There is an explosion of computational methods and tools, +particularly computer vision, for extracting biologically relevant information +from images for science and conservation. Yet most of these are bespoke +approaches designed for a specific task and are not easily adaptable or +extendable to new questions, contexts, and datasets. A vision model for general +organismal biology questions on images is of timely need. To approach this, we +curate and release TreeOfLife-10M, the largest and most diverse ML-ready +dataset of biology images. We then develop BioCLIP, a foundation model for the +tree of life, leveraging the unique properties of biology captured by +TreeOfLife-10M, namely the abundance and variety of images of plants, animals, +and fungi, together with the availability of rich structured biological +knowledge. We rigorously benchmark our approach on diverse fine-grained biology +classification tasks, and find that BioCLIP consistently and substantially +outperforms existing baselines (by 17% to 20% absolute). Intrinsic evaluation +reveals that BioCLIP has learned a hierarchical representation conforming to +the tree of life, shedding light on its strong generalizability. Our code, +models and data will be made available at +https://github.com/Imageomics/bioclip. + +
+
+ comment: 18 pages; updated title +
+
+
+
+
+ + ♻ ☆ Never Train from Scratch: Fair Comparison of Long-Sequence Models + Requires Data-Driven Priors + + +
+ Modeling long-range dependencies across sequences is a longstanding goal in +machine learning and has led to architectures, such as state space models, that +dramatically outperform Transformers on long sequences. However, these +impressive empirical gains have been by and large demonstrated on benchmarks +(e.g. Long Range Arena), where models are randomly initialized and trained to +predict a target label from an input sequence. In this work, we show that +random initialization leads to gross overestimation of the differences between +architectures and that pretraining with standard denoising objectives, using +$\textit{only the downstream task data}$, leads to dramatic gains across +multiple architectures and to very small gaps between Transformers and state +space models (SSMs). In stark contrast to prior works, we find vanilla +Transformers to match the performance of S4 on Long Range Arena when properly +pretrained, and we improve the best reported results of SSMs on the PathX-256 +task by 20 absolute points. Subsequently, we analyze the utility of +previously-proposed structured parameterizations for SSMs and show they become +mostly redundant in the presence of data-driven initialization obtained through +pretraining. Our work shows that, when evaluating different architectures on +supervised tasks, incorporation of data-driven priors via pretraining is +essential for reliable performance estimation, and can be done efficiently. + +
+
+
+
+
+ + ♻ ☆ Optimization dependent generalization bound for ReLU networks based on + sensitivity in the tangent bundle NeurIPS 2023 + + +
+ Recent advances in deep learning have given us some very promising results on
+the generalization ability of deep neural networks; however, the literature
+still lacks a comprehensive theory explaining why heavily over-parametrized
+models are able to generalize well while fitting the training data. In this
+paper we propose a PAC-type bound on the generalization error of feedforward
+ReLU networks via estimating the Rademacher complexity of the set of networks
+available from an initial parameter vector via gradient descent. The key idea
+is to bound the sensitivity of the network's gradient to perturbation of the
+input data along the optimization trajectory. The obtained bound does not
+explicitly depend on the depth of the network. Our results are experimentally
+verified on the MNIST and CIFAR-10 datasets.
+
+
+
+ comment: 17 pages, 5 figures, OPT2023: 15th Annual Workshop on Optimization + for Machine Learning at the 37th NeurIPS 2023, New Orleans, LA, USA +
+
+
+
+
+ + ♻ ☆ USB: A Unified Summarization Benchmark Across Tasks and Domains EMNLP + + +
+ While the NLP community has produced numerous summarization benchmarks, none +provide the rich annotations required to simultaneously address many important +problems related to control and reliability. We introduce a Wikipedia-derived +benchmark, complemented by a rich set of crowd-sourced annotations, that +supports $8$ interrelated tasks: (i) extractive summarization; (ii) abstractive +summarization; (iii) topic-based summarization; (iv) compressing selected +sentences into a one-line summary; (v) surfacing evidence for a summary +sentence; (vi) predicting the factual accuracy of a summary sentence; (vii) +identifying unsubstantiated spans in a summary sentence; (viii) correcting +factual errors in summaries. We compare various methods on this benchmark and +discover that on multiple tasks, moderately-sized fine-tuned models +consistently outperform much larger few-shot prompted language models. For +factuality-related tasks, we also evaluate existing heuristics to create +training data and find that training on them results in worse performance than +training on $20\times$ less human-labeled data. Our articles draw from $6$ +domains, facilitating cross-domain analysis. On some tasks, the amount of +training data matters more than the domain where it comes from, while for other +tasks training specifically on data from the target domain, even if limited, is +more beneficial. + +
+
+ comment: EMNLP Findings 2023 Camera Ready +
+
+
+
+
+ + ♻ ☆ Enabling Non-Linear Quantum Operations through Variational Quantum + Splines + + +
+ The postulates of quantum mechanics impose only unitary transformations on
+quantum states, which is a severe limitation for quantum machine learning
+algorithms. Quantum Splines (QSplines) have recently been proposed to
+approximate quantum activation functions to introduce non-linearity in quantum
+algorithms. However, QSplines make use of the HHL algorithm as a subroutine and
+require a fault-tolerant quantum computer to be correctly implemented. This
+work proposes the Generalised Hybrid Quantum Splines (GHQSplines), a novel
+method for approximating non-linear quantum activation functions using hybrid
+quantum-classical computation. The GHQSplines overcome the highly demanding
+hardware requirements of the original QSplines and can be implemented using
+near-term quantum computers. Furthermore, the proposed method relies on a
+flexible problem representation for non-linear approximation and is suitable
+for embedding in existing quantum neural network architectures. In addition,
+we provide a practical implementation of the GHQSplines using Pennylane and
+show that our model outperforms the original QSplines in terms of quality of
+fitting.
+
+
+
+
+
+
+ + ♻ ☆ Efficient Deep Speech Understanding at the Edge + + +
+ In contemporary speech understanding (SU), a sophisticated pipeline is +employed, encompassing the ingestion of streaming voice input. The pipeline +executes beam search iteratively, invoking a deep neural network to generate +tentative outputs (referred to as hypotheses) in an autoregressive manner. +Periodically, the pipeline assesses attention and Connectionist Temporal +Classification (CTC) scores. + This paper aims to enhance SU performance on edge devices with limited +resources. Adopting a hybrid strategy, our approach focuses on accelerating +on-device execution and offloading inputs surpassing the device's capacity. +While this approach is established, we tackle SU's distinctive challenges +through innovative techniques: (1) Late Contextualization: This involves the +parallel execution of a model's attentive encoder during input ingestion. (2) +Pilot Inference: Addressing temporal load imbalances in the SU pipeline, this +technique aims to mitigate them effectively. (3) Autoregression Offramps: +Decisions regarding offloading are made solely based on hypotheses, presenting +a novel approach. + These techniques are designed to seamlessly integrate with existing speech +models, pipelines, and frameworks, offering flexibility for independent or +combined application. Collectively, they form a hybrid solution for edge SU. +Our prototype, named XYZ, has undergone testing on Arm platforms featuring 6 to +8 cores, demonstrating state-of-the-art accuracy. Notably, it achieves a 2x +reduction in end-to-end latency and a corresponding 2x decrease in offloading +requirements. + +
+
+
+
+
+ + ♻ ☆ On skip connections and normalisation layers in deep optimisation NeurIPS 2023 + + +
+ We introduce a general theoretical framework, designed for the study of +gradient optimisation of deep neural networks, that encompasses ubiquitous +architecture choices including batch normalisation, weight normalisation and +skip connections. Our framework determines the curvature and regularity +properties of multilayer loss landscapes in terms of their constituent layers, +thereby elucidating the roles played by normalisation layers and skip +connections in globalising these properties. We then demonstrate the utility of +this framework in two respects. First, we give the only proof of which we are +aware that a class of deep neural networks can be trained using gradient +descent to global optima even when such optima only exist at infinity, as is +the case for the cross-entropy cost. Second, we identify a novel causal +mechanism by which skip connections accelerate training, which we verify +predictively with ResNets on MNIST, CIFAR10, CIFAR100 and ImageNet. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Trading-off price for data quality to achieve fair online allocation + + +
+ We consider the problem of online allocation subject to a long-term fairness
+penalty. Contrary to existing works, however, we do not assume that the
+decision-maker observes the protected attributes -- which is often unrealistic
+in practice. Instead, they can purchase data from sources of different quality
+to help estimate these attributes, and hence reduce the fairness penalty at
+some cost. We model this problem as a multi-armed bandit problem where each arm
+corresponds to the choice of a data source, coupled with the online allocation
+problem. We propose an algorithm that jointly solves both problems and show
+that it has a regret bounded by $\mathcal{O}(\sqrt{T})$. A key difficulty is
+that the rewards received by selecting a source are correlated through the
+fairness penalty, which leads to a need for randomization (despite a stochastic
+setting). Our algorithm takes into account contextual information available
+before the source selection, and can adapt to many different fairness notions.
+We also show that in some instances, the estimates used can be learned on the
+fly.
+
+
+
+
+
+
+ + ♻ ☆ Joint Prompt Optimization of Stacked LLMs using Variational Inference NeurIPS 2023 + + +
+ Large language models (LLMs) can be seen as atomic units of computation +mapping sequences to a distribution over sequences. Thus, they can be seen as +stochastic language layers in a language network, where the learnable +parameters are the natural language prompts at each layer. By stacking two such +layers and feeding the output of one layer to the next, we obtain a Deep +Language Network (DLN). We first show how to effectively perform prompt +optimization for a 1-Layer language network (DLN-1). Then, we present an +extension that applies to 2-layer DLNs (DLN-2), where two prompts must be +learned. The key idea is to consider the output of the first layer as a latent +variable, which requires inference, and prompts to be learned as the parameters +of the generative distribution. We first test the effectiveness of DLN-1 in +multiple reasoning and natural language understanding tasks. Then, we show that +DLN-2 can reach higher performance than a single layer, showing promise that we +might reach comparable performance to GPT-4, even when each LLM in the network +is smaller and less powerful. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Skill Reinforcement Learning and Planning for Open-World Long-Horizon + Tasks NeurIPS 2023 + + +
+ We study building multi-task agents in open-world environments. Without human
+demonstrations, learning to accomplish long-horizon tasks in a large open-world
+environment with reinforcement learning (RL) is extremely inefficient. To
+tackle this challenge, we convert the multi-task learning problem into learning
+basic skills and planning over the skills. Using the popular open-world game
+Minecraft as the testbed, we propose three types of fine-grained basic skills,
+and use RL with intrinsic rewards to acquire skills. A novel Finding-skill that
+performs exploration to find diverse items provides better initialization for
+other skills, improving the sample efficiency for skill learning. In skill
+planning, we leverage the prior knowledge in Large Language Models to find the
+relationships between skills and build a skill graph. When the agent is solving
+a task, our skill search algorithm walks on the skill graph and generates the
+proper skill plans for the agent. In experiments, our method accomplishes 40
+diverse Minecraft tasks, where many tasks require sequentially executing more
+than 10 skills. Our method outperforms baselines by a large margin and is the
+most sample-efficient demonstration-free RL method to solve Minecraft Tech Tree
+tasks. The project's website and code can be found at
+https://sites.google.com/view/plan4mc.
+
+
+
+ comment: 24 pages, presented in Foundation Models for Decision Making Workshop + at NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Continual Learning with Dynamic Sparse Training: Exploring Algorithms + for Effective Model Updates + + +
+ Continual learning (CL) refers to the ability of an intelligent system to
+sequentially acquire and retain knowledge from a stream of data with as little
+computational overhead as possible. To this end, regularization, replay,
+architecture, and parameter-isolation approaches have been introduced in the
+literature. Parameter isolation uses a sparse network, which makes it possible
+to allocate distinct parts of the neural network to different tasks and to
+share parameters between tasks if they are similar. Dynamic Sparse Training
+(DST) is a prominent way to find these sparse networks and isolate them for
+each task. This paper is the first empirical study investigating the effect of
+different DST components under the CL paradigm, filling a critical research gap
+and shedding light on the optimal configuration of DST for CL, if one exists.
+We therefore perform a comprehensive study in which we investigate various DST
+components to find the best topology per task on the well-known CIFAR100 and
+miniImageNet benchmarks in a task-incremental CL setup, since our primary focus
+is to evaluate the performance of various DST criteria rather than the process
+of mask selection. We find that, at a low sparsity level, Erdos-R\'enyi Kernel
+(ERK) initialization utilizes the backbone more efficiently and allows task
+increments to be learned effectively. At a high sparsity level, unless it is
+extreme, uniform initialization demonstrates a more reliable and robust
+performance. In terms of growth strategy, performance depends on the defined
+initialization strategy and the extent of sparsity. Finally, adaptivity within
+DST components is a promising way towards better continual learners.
+
+
+
+
+
+
+ + ♻ ☆ FedEmb: A Vertical and Hybrid Federated Learning Algorithm using Network + And Feature Embedding Aggregation + + +
+ Federated learning (FL) is an emerging paradigm for decentralized training of
+machine learning models on distributed clients, without revealing the data to
+the central server. The learning scheme may be horizontal, vertical or hybrid
+(both vertical and horizontal). Most existing research work with deep neural
+network (DNN) modelling is focused on horizontal data distributions, while
+vertical and hybrid schemes are much less studied. In this paper, we propose a
+generalized algorithm, FedEmb, for modelling vertical and hybrid DNN-based
+learning. Compared with existing work, our algorithm achieves higher inference
+accuracy, stronger privacy-preserving properties, and lower client-server
+communication bandwidth demands. The experimental results show that FedEmb is
+an effective method to tackle both split-feature and split-subject-space
+decentralized problems: it improves inference accuracy by 0.3% to 4.2% while
+revealing only limited private information about datasets stored in local
+clients, and reduces time complexity by 88.9% over the vertical baseline
+method.
+
+
+
+ comment: Accepted by Proceedings on Engineering Sciences +
+
+
+
+
+ + ♻ ☆ SVDinsTN: A Tensor Network Paradigm for Efficient Structure Search from + Regularized Modeling Perspective + + +
+ Tensor network (TN) representation is a powerful technique for computer +vision and machine learning. TN structure search (TN-SS) aims to search for a +customized structure to achieve a compact representation, which is a +challenging NP-hard problem. Recent "sampling-evaluation-based" methods require +sampling an extensive collection of structures and evaluating them one by one, +resulting in prohibitively high computational costs. To address this issue, we +propose a novel TN paradigm, named SVD-inspired TN decomposition (SVDinsTN), +which allows us to efficiently solve the TN-SS problem from a regularized +modeling perspective, eliminating the repeated structure evaluations. To be +specific, by inserting a diagonal factor for each edge of the fully-connected +TN, SVDinsTN allows us to calculate TN cores and diagonal factors +simultaneously, with the factor sparsity revealing a compact TN structure. In +theory, we prove a convergence guarantee for the proposed method. Experimental +results demonstrate that the proposed method achieves approximately 100 to 1000 +times acceleration compared to the state-of-the-art TN-SS methods while +maintaining a comparable representation ability. + +
+
+
+
+
+ + ♻ ☆ Data-efficient operator learning for solving high Mach number fluid flow + problems + + +
+ We consider the problem of using SciML to predict solutions of high Mach +fluid flows over irregular geometries. In this setting, data is limited, and so +it is desirable for models to perform well in the low-data setting. We show +that Neural Basis Functions (NBF), which learns a basis of behavior modes from +the data and then uses this basis to make predictions, is more effective than a +basis-unaware baseline model. In addition, we identify continuing challenges in +the space of predicting solutions for this type of problem. + +
+
+
+
+
+ + ♻ ☆ Benchpress: A Scalable and Versatile Workflow for Benchmarking Structure + Learning Algorithms + + +
+ Describing the relationship between the variables in a study domain and
+modelling the data generating mechanism is a fundamental problem in many
+empirical sciences. Probabilistic graphical models are one common approach to
+tackle the problem. Learning the graphical structure for such models is
+computationally challenging and a fervent area of current research with a
+plethora of algorithms being developed. To facilitate the benchmarking of
+different methods, we present a novel Snakemake workflow, called Benchpress,
+for producing scalable, reproducible, and platform-independent benchmarks of
+structure learning algorithms for probabilistic graphical models. Benchpress is
+interfaced via a simple JSON file, which makes it accessible to all users,
+while the code is designed in a fully modular fashion to enable researchers to
+contribute additional methodologies. Benchpress currently provides an interface
+to a large number of state-of-the-art algorithms from libraries such as
+BDgraph, BiDAG, bnlearn, causal-learn, gCastle, GOBNILP, pcalg, r.blip,
+scikit-learn, TETRAD, and trilearn as well as a variety of methods for data
+generating models and performance evaluation. Alongside user-defined models and
+randomly generated datasets, the workflow also includes a number of standard
+datasets and graphical models from the literature, which may be included in a
+benchmarking study. We demonstrate the applicability of this workflow for
+learning Bayesian networks in five typical data scenarios. The source code and
+documentation are publicly available from http://benchpressdocs.readthedocs.io.
+
+
+
+ comment: 41 pages, 8 figures
+
+
+
+
+
+
+ ♻ ☆ BatteryML: An Open-source platform for Machine Learning on Battery
+ Degradation
+
+
+
+ Battery degradation remains a pivotal concern in the energy storage domain,
+with machine learning emerging as a potent tool to drive forward insights and
+solutions. However, this intersection of electrochemical science and machine
+learning poses complex challenges. Machine learning experts often grapple with
+the intricacies of battery science, while battery researchers face hurdles in
+adapting intricate models tailored to specific datasets. Beyond this, a
+cohesive standard for battery degradation modeling, inclusive of data formats
+and evaluative benchmarks, is conspicuously absent. Recognizing these
+impediments, we present BatteryML - a one-step, all-encompassing, open-source
+platform designed to unify data preprocessing, feature extraction, and the
+implementation of both traditional and state-of-the-art models. This
+streamlined approach promises to enhance the practicality and efficiency of
+research applications. BatteryML seeks to fill this void, fostering an
+environment where experts from diverse specializations can collaboratively
+contribute, thus elevating the collective understanding and advancement of
+battery research. The code for our project is publicly available on GitHub at
+https://github.com/microsoft/BatteryML.
+
+
+
+
+
+
+ + ♻ ☆ Pareto Probing: Trading Off Accuracy for Complexity EMNLP 2020 + + +
+ The question of how to probe contextual word representations for linguistic +structure in a way that is both principled and useful has seen significant +attention recently in the NLP literature. In our contribution to this +discussion, we argue for a probe metric that reflects the fundamental trade-off +between probe complexity and performance: the Pareto hypervolume. To measure +complexity, we present a number of parametric and non-parametric metrics. Our +experiments using Pareto hypervolume as an evaluation metric show that probes +often do not conform to our expectations -- e.g., why should the non-contextual +fastText representations encode more morpho-syntactic information than the +contextual BERT representations? These results suggest that common, simplistic +probing tasks, such as part-of-speech labeling and dependency arc labeling, are +inadequate to evaluate the linguistic structure encoded in contextual word +representations. This leads us to propose full dependency parsing as a probing +task. In support of our suggestion that harder probing tasks are necessary, our +experiments with dependency parsing reveal a wide gap in syntactic knowledge +between contextual and non-contextual representations. + +
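For readers unfamiliar with the proposed metric, the sketch below illustrates the Pareto-hypervolume idea on probes represented as (complexity, accuracy) points, where lower complexity and higher accuracy are both better and the area is measured up to a reference complexity. The objectives and reference point are assumptions, not the paper's exact setup.

```python
# Minimal sketch (assumptions throughout) of a 2-D Pareto hypervolume:
# keep the Pareto front of (complexity, accuracy) probes and measure the
# area it dominates up to ref = (ref_complexity, 0).
import numpy as np

def pareto_hypervolume(points, ref_complexity):
    pts = sorted(points)                        # ascending complexity
    front, best_acc = [], -np.inf
    for c, a in pts:
        if a > best_acc:                        # not dominated by a cheaper probe
            front.append((c, a))
            best_acc = a
    hv = 0.0
    for i, (c, a) in enumerate(front):
        right = front[i + 1][0] if i + 1 < len(front) else ref_complexity
        hv += (right - c) * a                   # dominated strip of this point
    return hv

print(pareto_hypervolume([(1.0, 0.5), (2.0, 0.8), (2.5, 0.6)], ref_complexity=3.0))
# 1.3: the (2.5, 0.6) probe is dominated and contributes nothing
```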
+
+ comment: Tiago Pimentel and Naomi Saphra contributed equally to this work. + Camera ready version of EMNLP 2020 publication. In this new version, we fixed + some notation issues in the appendix, and added a new appendix section + describing our MLP. Code available in + https://github.com/rycolab/pareto-probing +
+
+
+
+
+ + ♻ ☆ BioCoder: A Benchmark for Bioinformatics Code Generation with Contextual + Pragmatic Knowledge + + +
+ Pre-trained large language models have significantly improved code +generation. As these models scale up, there is an increasing need for the +output to handle more intricate tasks and to be appropriately specialized to +particular domains. Here, we target bioinformatics due to the amount of +specialized domain knowledge, algorithms, and data operations this discipline +requires. We present BioCoder, a benchmark developed to evaluate large language +models (LLMs) in generating bioinformatics-specific code. BioCoder spans a +broad spectrum of the field and covers cross-file dependencies, class +declarations, and global variables. It incorporates 1026 Python functions and +1243 Java methods extracted from GitHub, along with 253 examples from the +Rosalind Project, all pertaining to bioinformatics. Using topic modeling we +show that overall coverage of the included code is representative of the full +spectrum of bioinformatics calculations. BioCoder incorporates a fuzz-testing +framework for evaluation. We have applied it to evaluate many models including +InCoder, CodeGen, CodeGen2, SantaCoder, StarCoder, StarCoder+, InstructCodeT5+, +GPT-3.5, and GPT-4. Furthermore, we finetuned StarCoder, demonstrating how our +dataset can effectively enhance the performance of LLMs on our benchmark (by +>15% in terms of Pass@K in certain prompt configurations and always >3%). The +results highlight two key aspects of successful models: (1) Successful models +accommodate a long prompt (> ~2600 tokens) with full context, for functional +dependencies. (2) They contain specific domain knowledge of bioinformatics, +beyond just general coding knowledge. This is evident from the performance gain +of GPT-3.5/4 compared to the smaller models on the benchmark (50% vs up to +~25%). Our dataset, benchmark, Docker images, and scripts required for testing +are all available at https://github.com/gersteinlab/biocoder. + +
+
+
+
+
+ + ♻ ☆ Provably Efficient Iterated CVaR Reinforcement Learning with Function + Approximation and Human Feedback + + +
+ Risk-sensitive reinforcement learning (RL) aims to optimize policies that +balance the expected reward and risk. In this paper, we present a novel +risk-sensitive RL framework that employs an Iterated Conditional Value-at-Risk +(CVaR) objective under both linear and general function approximations, +enriched by human feedback. These new formulations provide a principled way to +guarantee safety in each decision making step throughout the control process. +Moreover, integrating human feedback into risk-sensitive RL framework bridges +the gap between algorithmic decision-making and human participation, allowing +us to also guarantee safety for human-in-the-loop systems. We propose provably +sample-efficient algorithms for this Iterated CVaR RL and provide rigorous +theoretical analysis. Furthermore, we establish a matching lower bound to +corroborate the optimality of our algorithms in a linear context. + +
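As a reminder of the risk measure underlying the objective above, here is a small illustration of estimating CVaR from samples of a return distribution. This is standard background, not the paper's algorithm.

```python
# Hedged illustration: CVaR_alpha of a return distribution is the mean of its
# worst alpha-fraction of outcomes, estimated here from samples.
import numpy as np

def cvar(returns, alpha=0.1):
    r = np.sort(np.asarray(returns, dtype=float))   # ascending: worst first
    k = max(1, int(np.ceil(alpha * len(r))))
    return r[:k].mean()

# Example: a risk-sensitive agent compares two policies by CVaR, not mean return.
rng = np.random.default_rng(0)
safe = rng.normal(1.0, 0.2, 10_000)
risky = rng.normal(1.2, 2.0, 10_000)
print(cvar(safe, 0.05), cvar(risky, 0.05))   # the safe policy has higher CVaR
```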
+
+
+
+
+ + ♻ ☆ Multi Time Scale World Models NeurIPS 2023 + + +
+ Intelligent agents use internal world models to reason and make predictions +about different courses of their actions at many scales. Devising learning +paradigms and architectures that allow machines to learn world models that +operate at multiple levels of temporal abstractions while dealing with complex +uncertainty predictions is a major technical hurdle. In this work, we propose a +probabilistic formalism to learn multi-time scale world models which we call +the Multi Time Scale State Space (MTS3) model. Our model uses a computationally +efficient inference scheme on multiple time scales for highly accurate +long-horizon predictions and uncertainty estimates over several seconds into +the future. Our experiments, which focus on action conditional long horizon +future predictions, show that MTS3 outperforms recent methods on several system +identification benchmarks including complex simulated and real-world dynamical +systems. Code is available at this repository: https://github.com/ALRhub/MTS3. + +
+
+ comment: Accepted as spotlight at NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ PASTA: Pretrained Action-State Transformer Agents + + +
+ Self-supervised learning has brought about a revolutionary paradigm shift in
+various computing domains, including NLP, vision, and biology. Recent
+approaches involve pre-training transformer models on vast amounts of unlabeled
+data, serving as a starting point for efficiently solving downstream tasks. In
+reinforcement learning, researchers have recently adapted these approaches,
+developing models pre-trained on expert trajectories. This advancement enables
+the models to tackle a broad spectrum of tasks, ranging from robotics to
+recommendation systems. However, existing methods mostly rely on intricate
+pre-training objectives tailored to specific downstream applications. This
+paper conducts a comprehensive investigation of models, referred to as
+pre-trained action-state transformer agents (PASTA). Our study adopts a unified
+methodology and covers an extensive set of general downstream tasks including
+behavioral cloning, offline RL, sensor failure robustness, and dynamics change
+adaptation. Our objective is to systematically compare various design choices
+and offer valuable insights that will aid practitioners in developing robust
+models. Key highlights of our study include tokenization at the component level
+for actions and states, the use of fundamental pre-training objectives such as
+next token prediction or masked language modeling, simultaneous training of
+models across multiple domains, and the application of various fine-tuning
+strategies. In this study, the developed models contain fewer than 7 million
+parameters, allowing a broad community to use these models and reproduce our
+experiments. We hope that this study will encourage further research into the
+use of transformers with first-principles design choices to represent RL
+trajectories and contribute to robust policy learning.
+
+
+
+
+
+
+ + ♻ ☆ Dealing with Drift of Adaptation Spaces in Learning-based Self-Adaptive + Systems using Lifelong Self-Adaptation + + +
+ Recently, machine learning (ML) has become a popular approach to support +self-adaptation. ML has been used to deal with several problems in +self-adaptation, such as maintaining an up-to-date runtime model under +uncertainty and scalable decision-making. Yet, exploiting ML comes with +inherent challenges. In this paper, we focus on a particularly important +challenge for learning-based self-adaptive systems: drift in adaptation spaces. +With adaptation space we refer to the set of adaptation options a self-adaptive +system can select from at a given time to adapt based on the estimated quality +properties of the adaptation options. Drift of adaptation spaces originates +from uncertainties, affecting the quality properties of the adaptation options. +Such drift may imply that eventually no adaptation option can satisfy the +initial set of the adaptation goals, deteriorating the quality of the system, +or adaptation options may emerge that allow enhancing the adaptation goals. In +ML, such shift corresponds to novel class appearance, a type of concept drift +in target data that common ML techniques have problems dealing with. To tackle +this problem, we present a novel approach to self-adaptation that enhances +learning-based self-adaptive systems with a lifelong ML layer. We refer to this +approach as lifelong self-adaptation. The lifelong ML layer tracks the system +and its environment, associates this knowledge with the current tasks, +identifies new tasks based on differences, and updates the learning models of +the self-adaptive system accordingly. A human stakeholder may be involved to +support the learning process and adjust the learning and goal models. We +present a general architecture for lifelong self-adaptation and apply it to the +case of drift of adaptation spaces that affects the decision-making in +self-adaptation. We validate the approach for a series of scenarios using the +DeltaIoT exemplar. + +
+
+
+
+
+ + ♻ ☆ Minimal Random Code Learning with Mean-KL Parameterization ICML + + +
+ This paper studies the qualitative behavior and robustness of two variants of +Minimal Random Code Learning (MIRACLE) used to compress variational Bayesian +neural networks. MIRACLE implements a powerful, conditionally Gaussian +variational approximation for the weight posterior $Q_{\mathbf{w}}$ and uses +relative entropy coding to compress a weight sample from the posterior using a +Gaussian coding distribution $P_{\mathbf{w}}$. To achieve the desired +compression rate, $D_{\mathrm{KL}}[Q_{\mathbf{w}} \Vert P_{\mathbf{w}}]$ must +be constrained, which requires a computationally expensive annealing procedure +under the conventional mean-variance (Mean-Var) parameterization for +$Q_{\mathbf{w}}$. Instead, we parameterize $Q_{\mathbf{w}}$ by its mean and KL +divergence from $P_{\mathbf{w}}$ to constrain the compression cost to the +desired value by construction. We demonstrate that variational training with +Mean-KL parameterization converges twice as fast and maintains predictive +performance after compression. Furthermore, we show that Mean-KL leads to more +meaningful variational distributions with heavier tails and compressed weight +samples which are more robust to pruning. + +
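A minimal sketch of the parameterization idea for a single scalar weight, assuming Gaussian posterior and prior: given the posterior mean and a per-weight KL budget, the posterior standard deviation is recovered by root finding so that the KL constraint holds by construction. Function names and the choice of the narrower root are assumptions, not the MIRACLE implementation.

```python
# Mean-KL sketch for one weight: parameterize q = N(mu_q, s^2) by its mean and
# its KL divergence from the coding prior p = N(mu_p, s_p^2).
import numpy as np
from scipy.optimize import brentq

def kl_gauss(s_q, mu_q, mu_p, s_p):
    return np.log(s_p / s_q) + (s_q**2 + (mu_q - mu_p)**2) / (2 * s_p**2) - 0.5

def std_from_kl(kl_target, mu_q, mu_p=0.0, s_p=1.0):
    kl_min = (mu_q - mu_p)**2 / (2 * s_p**2)       # attained at s_q = s_p
    assert kl_target > kl_min, "budget must exceed the minimum achievable KL"
    f = lambda s: kl_gauss(s, mu_q, mu_p, s_p) - kl_target
    return brentq(f, 1e-12, s_p)                   # narrower of the two roots

s = std_from_kl(kl_target=0.5, mu_q=0.3)
print(s, kl_gauss(s, 0.3, 0.0, 1.0))               # recovers the 0.5 budget
```

Because the KL is fixed by construction, no annealing of the compression constraint is needed, which is the point the abstract makes about the Mean-KL parameterization.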
+
+ comment: ICML Neural Compression Workshop 2023 +
+
+
+
+
+ + ♻ ☆ RL4CO: a Unified Reinforcement Learning for Combinatorial Optimization + Library NeurIPS 2023 + + +
+ Deep reinforcement learning offers notable benefits in addressing +combinatorial problems over traditional solvers, reducing the reliance on +domain-specific knowledge and expert solutions, and improving computational +efficiency. Despite the recent surge in interest in neural combinatorial +optimization, practitioners often do not have access to a standardized code +base. Moreover, different algorithms are frequently based on fragmentized +implementations that hinder reproducibility and fair comparison. To address +these challenges, we introduce RL4CO, a unified Reinforcement Learning (RL) for +Combinatorial Optimization (CO) library. We employ state-of-the-art software +and best practices in implementation, such as modularity and configuration +management, to be flexible, easily modifiable, and extensible by researchers. +Thanks to our unified codebase, we benchmark baseline RL solvers with different +evaluation schemes on zero-shot performance, generalization, and adaptability +on diverse tasks. Notably, we find that some recent methods may fall behind +their predecessors depending on the evaluation settings. We hope RL4CO will +encourage the exploration of novel solutions to complex real-world tasks, +allowing the community to compare with existing methods through a unified +framework that decouples the science from software engineering. We open-source +our library at https://github.com/ai4co/rl4co. + +
+
+ comment: Accepted as a workshop paper at the NeurIPS 2023 GLFrontiers Workshop + (Oral) +
+
+
+
+
+ + ♻ ☆ Omnipotent Adversarial Training in the Wild + + +
+ Adversarial training is an important topic in robust deep learning, but the +community lacks attention to its practical usage. In this paper, we aim to +resolve a real-world challenge, i.e., training a model on an imbalanced and +noisy dataset to achieve high clean accuracy and adversarial robustness, with +our proposed Omnipotent Adversarial Training (OAT) strategy. OAT consists of +two innovative methodologies to address the imperfection in the training set. +We first introduce an oracle into the adversarial training process to help the +model learn a correct data-label conditional distribution. This +carefully-designed oracle can provide correct label annotations for adversarial +training. We further propose logits adjustment adversarial training to overcome +the data imbalance issue, which can help the model learn a Bayes-optimal +distribution. Our comprehensive evaluation results show that OAT outperforms +other baselines by more than 20% clean accuracy improvement and 10% robust +accuracy improvement under complex combinations of data imbalance and label +noise scenarios. The code can be found in https://github.com/GuanlinLee/OAT. + +
+
+
+
+
+ + ♻ ☆ Alleviating the Effect of Data Imbalance on Adversarial Training + + +
+ In this paper, we study adversarial training on datasets that obey the +long-tailed distribution, which is practical but rarely explored in previous +works. Compared with conventional adversarial training on balanced datasets, +this process falls into the dilemma of generating uneven adversarial examples +(AEs) and an unbalanced feature embedding space, causing the resulting model to +exhibit low robustness and accuracy on tail data. To combat that, we +theoretically analyze the lower bound of the robust risk to train a model on a +long-tailed dataset to obtain the key challenges in addressing the +aforementioned dilemmas. Based on it, we propose a new adversarial training +framework -- Re-balancing Adversarial Training (REAT). This framework consists +of two components: (1) a new training strategy inspired by the effective number +to guide the model to generate more balanced and informative AEs; (2) a +carefully constructed penalty function to force a satisfactory feature space. +Evaluation results on different datasets and model structures prove that REAT +can effectively enhance the model's robustness and preserve the model's clean +accuracy. The code can be found in https://github.com/GuanlinLee/REAT. + +
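For context on the "effective number" mentioned above, the snippet below shows the standard class-balanced weighting that notion comes from; this is background only, and REAT's actual training strategy for generating and weighting adversarial examples is more involved.

```python
# Background sketch: effective number of samples E_n = (1 - beta^n) / (1 - beta)
# and the resulting class-balanced weights; rare (tail) classes get larger
# weights. beta is a hyperparameter close to 1.
import numpy as np

def class_balanced_weights(samples_per_class, beta=0.999):
    n = np.asarray(samples_per_class, dtype=float)
    effective_num = (1.0 - np.power(beta, n)) / (1.0 - beta)
    w = 1.0 / effective_num
    return w * len(n) / w.sum()          # normalize to mean weight 1

print(class_balanced_weights([5000, 500, 50]))   # tail class weighted most
```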
+
+
+
+
+ + ♻ ☆ Learning Structure-from-Motion with Graph Attention Networks + + +
+ In this paper we tackle the problem of learning Structure-from-Motion (SfM)
+through the use of graph attention networks. SfM is a classic computer vision
+problem that is solved through iterative minimization of reprojection errors,
+referred to as Bundle Adjustment (BA), starting from a good initialization. In
+order to obtain a good enough initialization for BA, conventional methods rely
+on a sequence of sub-problems (such as pairwise pose estimation, pose averaging
+or triangulation) which provide an initial solution that can then be refined
+using BA. In this work we replace these sub-problems by learning a model that
+takes as input the 2D keypoints detected across multiple views, and outputs the
+corresponding camera poses and 3D keypoint coordinates. Our model takes
+advantage of graph neural networks to learn SfM-specific primitives, and we
+show that it can be used for fast inference of the reconstruction for new and
+unseen sequences. The experimental results show that the proposed model
+outperforms competing learning-based methods, and challenges COLMAP while
+having lower runtime.
+
+
+
+ comment: Added additional metrics +
+
+
+
+
+ + ♻ ☆ De Novo Drug Design with Joint Transformers NeurIPS 2023 + + +
+ De novo drug design requires simultaneously generating novel molecules +outside of training data and predicting their target properties, making it a +hard task for generative models. To address this, we propose Joint Transformer +that combines a Transformer decoder, Transformer encoder, and a predictor in a +joint generative model with shared weights. We formulate a probabilistic +black-box optimization algorithm that employs Joint Transformer to generate +novel molecules with improved target properties and outperforms other +SMILES-based optimization methods in de novo drug design. + +
+
+ comment: Accepted to NeurIPS 2023 Generative AI and Biology Workshop +
+
+
+
+
+ + ♻ ☆ Language Agents with Reinforcement Learning for Strategic Play in the + Werewolf Game + + +
+ Agents built with large language models (LLMs) have recently achieved great +advancements. However, most of the efforts focus on single-agent or cooperative +settings, leaving more general multi-agent environments underexplored. We +propose a new framework powered by reinforcement learning (RL) to develop +strategic language agents, i.e., LLM-based agents with strategic thinking +ability, for a popular language game, Werewolf. Werewolf is a social deduction +game with hidden roles that involves both cooperation and competition and +emphasizes deceptive communication and diverse gameplay. Our agent tackles this +game by first using LLMs to reason about potential deceptions and generate a +set of strategically diverse actions. Then an RL policy, which selects an +action from the candidates, is learned by population-based training to enhance +the agents' decision-making ability. By combining LLMs with the RL policy, our +agent produces a variety of emergent strategies, achieves the highest win rate +against other LLM-based agents, and stays robust against adversarial human +players in the Werewolf game. + +
+
+
+
+
+ + ♻ ☆ When Do Program-of-Thoughts Work for Reasoning? + + +
+ In the realm of embodied artificial intelligence, the reasoning capabilities
+of Large Language Models (LLMs) play a pivotal role. Although there are
+effective methods, such as program-of-thought prompting, that let LLMs use a
+programming language to tackle complex reasoning tasks, the specific impact of
+code data on the improvement of reasoning capabilities remains under-explored.
+To address this gap, we propose the complexity-impacted reasoning score (CIRS),
+which combines structural and logical attributes, to measure the correlation
+between code and reasoning abilities. Specifically, we use the abstract syntax
+tree to encode the structural information and calculate logical complexity by
+considering the difficulty and the cyclomatic complexity. Through an empirical
+analysis, we find that not all code data, whatever its complexity, can be
+learned or understood by LLMs; an optimal level of complexity is critical to
+the improvement of reasoning abilities by program-aided prompting. We then
+design an auto-synthesizing and stratifying algorithm, and apply it to
+instruction generation for mathematical reasoning and code data filtering for
+code generation tasks. Extensive results demonstrate the effectiveness of our
+proposed approach. Code will be integrated into the EasyInstruct framework at
+https://github.com/zjunlp/EasyInstruct.
+
+
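To illustrate the logical-complexity ingredient described above, here is a rough sketch that parses Python code into an abstract syntax tree and counts branch points as a cyclomatic-complexity proxy; the node set and the final score are simplifying assumptions, not the CIRS formula.

```python
# Rough proxy for cyclomatic complexity: decision points in the AST plus one.
import ast

BRANCH_NODES = (ast.If, ast.For, ast.While, ast.ExceptHandler,
                ast.BoolOp, ast.IfExp, ast.comprehension)

def cyclomatic_complexity(source: str) -> int:
    tree = ast.parse(source)
    return 1 + sum(isinstance(node, BRANCH_NODES) for node in ast.walk(tree))

snippet = """
def classify(xs):
    out = []
    for x in xs:
        if x > 0 and x % 2 == 0:
            out.append("even+")
        else:
            out.append("other")
    return out
"""
print(cyclomatic_complexity(snippet))   # for + if + bool-op -> 4
```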
+
+ comment: Work in progress +
+
+
+
+
+ + ♻ ☆ Convergence Analysis of Fractional Gradient Descent + + +
+ Fractional derivatives are a well-studied generalization of integer order +derivatives. Naturally, for optimization, it is of interest to understand the +convergence properties of gradient descent using fractional derivatives. +Convergence analysis of fractional gradient descent is currently limited both +in the methods analyzed and the settings analyzed. This paper aims to fill in +these gaps by analyzing variations of fractional gradient descent in smooth and +convex, smooth and strongly convex, and smooth and non-convex settings. First, +novel bounds will be established bridging fractional and integer derivatives. +Then, these bounds will be applied to the aforementioned settings to prove +$O(1/T)$ convergence for smooth and convex functions and linear convergence for +smooth and strongly convex functions. Additionally, we prove $O(1/T)$ +convergence for smooth and non-convex functions using an extended notion of +smoothness that is more natural for fractional derivatives. Finally, empirical +results will be presented on the potential speed up of fractional gradient +descent over standard gradient descent as well as the challenges of predicting +which will be faster in general. + +
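For reference, a common choice of fractional derivative in analyses of this kind is the Caputo derivative; whether the paper adopts exactly this definition is an assumption here. For order $0 < \alpha < 1$ it reads:

```latex
% Caputo fractional derivative of order alpha in (0, 1) (illustrative; the
% paper's exact definition may differ):
{}^{C}\!D^{\alpha}_{a} f(t)
  \;=\; \frac{1}{\Gamma(1-\alpha)} \int_{a}^{t} \frac{f'(\tau)}{(t-\tau)^{\alpha}} \, d\tau .
```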
+
+ comment: 21 pages, 4 figures. Submitted to TMLR. Updated to TMLR format. Minor + corrections in Figure 1 description, statements of Theorem 14, 18 and + Corollary 17. Minor clarification in statement of Theorem 10, 21. Moved most + proofs to appendix and added sketches, moved remarks within proofs into main + body +
+
+
+
+
+ + ♻ ☆ Function-constrained Program Synthesis NeurIPS + + +
+ This work introduces (1) a technique that allows large language models (LLMs) +to leverage user-provided code when solving programming tasks and (2) a method +to iteratively generate modular sub-functions that can aid future code +generation attempts when the initial code generated by the LLM is inadequate. +Generating computer programs in general-purpose programming languages like +Python poses a challenge for LLMs when instructed to use code provided in the +prompt. Code-specific LLMs (e.g., GitHub Copilot, CodeLlama2) can generate code +completions in real-time by drawing on all code available in a development +environment. However, restricting code-specific LLMs to use only in-context +code is not straightforward, as the model is not explicitly instructed to use +the user-provided code and users cannot highlight precisely which snippets of +code the model should incorporate into its context. Moreover, current systems +lack effective recovery methods, forcing users to iteratively re-prompt the +model with modified prompts until a sufficient solution is reached. Our method +differs from traditional LLM-powered code-generation by constraining +code-generation to an explicit function set and enabling recovery from failed +attempts through automatically generated sub-functions. When the LLM cannot +produce working code, we generate modular sub-functions to aid subsequent +attempts at generating functional code. A by-product of our method is a library +of reusable sub-functions that can solve related tasks, imitating a software +team where efficiency scales with experience. We also introduce a new +"half-shot" evaluation paradigm that provides tighter estimates of LLMs' coding +abilities compared to traditional zero-shot evaluation. Our proposed evaluation +method encourages models to output solutions in a structured format, decreasing +syntax errors that can be mistaken for poor coding ability. + +
+
+ comment: 17 pages, 6 figures, 2023 NeurIPS R0-Fomo Workshop; corrected typo on + fig 1 caption +
+
+
+
+
+ + ♻ ☆ Invariance is Key to Generalization: Examining the Role of + Representation in Sim-to-Real Transfer for Visual Navigation SP + + +
+ The data-driven approach to robot control has been gathering pace rapidly, +yet generalization to unseen task domains remains a critical challenge. We +argue that the key to generalization is representations that are (i) rich +enough to capture all task-relevant information and (ii) invariant to +superfluous variability between the training and the test domains. We +experimentally study such a representation -- containing both depth and +semantic information -- for visual navigation and show that it enables a +control policy trained entirely in simulated indoor scenes to generalize to +diverse real-world environments, both indoors and outdoors. Further, we show +that our representation reduces the A-distance between the training and test +domains, improving the generalization error bound as a result. Our proposed +approach is scalable: the learned policy improves continuously, as the +foundation models that it exploits absorb more diverse data during +pre-training. + +
+
+ comment: 11 pages, accepted by the 18th International Symposium on + Experimental Robotics (ISER 2023) and published within the Springer + Proceedings in Advanced Robotics (SPAR) +
+
+
+
+
+ + ♻ ☆ Graph Generation with $K^2$-trees + + +
+ Generating graphs from a target distribution is a significant challenge
+across many domains, including drug discovery and social network analysis. In
+this work, we introduce a novel graph generation method leveraging the
+$K^2$-tree representation, originally designed for lossless graph compression.
+The $K^2$-tree representation encompasses the inherent hierarchy of a graph
+while enabling compact graph generation. In addition, we make contributions by
+(1) presenting a sequential $K^2$-tree representation that incorporates
+pruning, flattening, and tokenization processes and (2) introducing a
+Transformer-based architecture designed to generate the sequence by
+incorporating a specialized tree positional encoding scheme. Finally, we
+extensively evaluate our algorithm on four general and two molecular graph
+datasets to confirm its superiority for graph generation.
+
+
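As background on the underlying representation, the sketch below builds the level-order bit string of a $K^2$-tree from an adjacency matrix; the padding scheme and output format are assumptions, and the paper's sequential representation adds pruning, flattening, and tokenization on top of this.

```python
# K^2-tree sketch: recursively split the (padded) adjacency matrix into K x K
# blocks and emit one bit per block indicating whether it contains any edge,
# recursing only into non-empty blocks, in level (BFS) order.
import numpy as np
from collections import deque

def k2_tree_bits(adj, k=2):
    n = adj.shape[0]
    size = 1
    while size < n:
        size *= k                         # pad up to a power of k
    padded = np.zeros((size, size), dtype=bool)
    padded[:n, :n] = np.asarray(adj, dtype=bool)

    bits, queue = [], deque([padded])
    while queue:
        block = queue.popleft()
        if block.shape[0] == 1:           # single cell, nothing to subdivide
            continue
        step = block.shape[0] // k
        for i in range(k):
            for j in range(k):
                sub = block[i * step:(i + 1) * step, j * step:(j + 1) * step]
                nonempty = bool(sub.any())
                bits.append(int(nonempty))
                if nonempty and step > 1:
                    queue.append(sub)
    return bits

adj = np.array([[0, 1, 0, 0],
                [1, 0, 1, 0],
                [0, 1, 0, 0],
                [0, 0, 0, 0]])
print(k2_tree_bits(adj))   # [1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0]
```

The empty bottom-right block contributes a single 0 bit and no children, which is where the compactness for sparse graphs comes from.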
+
+ comment: 24 pages +
+
+
+
+
+ + ♻ ☆ Simple Transferability Estimation for Regression Tasks UAI + + +
+ We consider transferability estimation, the problem of estimating how well +deep learning models transfer from a source to a target task. We focus on +regression tasks, which received little previous attention, and propose two +simple and computationally efficient approaches that estimate transferability +based on the negative regularized mean squared error of a linear regression +model. We prove novel theoretical results connecting our approaches to the +actual transferability of the optimal target models obtained from the transfer +learning process. Despite their simplicity, our approaches significantly +outperform existing state-of-the-art regression transferability estimators in +both accuracy and efficiency. On two large-scale keypoint regression +benchmarks, our approaches yield 12% to 36% better results on average while +being at least 27% faster than previous state-of-the-art methods. + +
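A hedged sketch of the kind of estimator described above: fit a ridge regression from source-model features to the target labels and report the negative regularized mean squared error as the transferability score. The exact regularizer and normalization used in the paper are assumptions here.

```python
# Transferability proxy for regression (sketch, not the paper's estimator):
# higher scores suggest the source features transfer better to the target task.
import numpy as np

def transferability_score(features, targets, lam=1.0):
    X = np.asarray(features, dtype=float)      # (n, d) source-model features
    Y = np.asarray(targets, dtype=float)       # (n, k) target labels/keypoints
    d = X.shape[1]
    W = np.linalg.solve(X.T @ X + lam * np.eye(d), X.T @ Y)   # ridge solution
    mse = np.mean((X @ W - Y) ** 2)
    return -(mse + lam * np.mean(W ** 2))      # negative regularized MSE

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 32))
Y = X[:, :4] @ rng.normal(size=(4, 2)) + 0.1 * rng.normal(size=(200, 2))
print(transferability_score(X, Y))
```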
+
+ comment: Paper published at The 39th Conference on Uncertainty in Artificial + Intelligence (UAI) 2023 +
+
+
+
+
+ + ♻ ☆ Make the U in UDA Matter: Invariant Consistency Learning for + Unsupervised Domain Adaptation NeurIPS 2023 + + +
+ Domain Adaptation (DA) is always challenged by the spurious correlation +between domain-invariant features (e.g., class identity) and domain-specific +features (e.g., environment) that does not generalize to the target domain. +Unfortunately, even enriched with additional unsupervised target domains, +existing Unsupervised DA (UDA) methods still suffer from it. This is because +the source domain supervision only considers the target domain samples as +auxiliary data (e.g., by pseudo-labeling), yet the inherent distribution in the +target domain -- where the valuable de-correlation clues hide -- is +disregarded. We propose to make the U in UDA matter by giving equal status to +the two domains. Specifically, we learn an invariant classifier whose +prediction is simultaneously consistent with the labels in the source domain +and clusters in the target domain, hence the spurious correlation inconsistent +in the target domain is removed. We dub our approach "Invariant CONsistency +learning" (ICON). Extensive experiments show that ICON achieves the +state-of-the-art performance on the classic UDA benchmarks: Office-Home and +VisDA-2017, and outperforms all the conventional methods on the challenging +WILDS 2.0 benchmark. Codes are in https://github.com/yue-zhongqi/ICON. + +
+
+ comment: Accepted by NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ What User Behaviors Make the Differences During the Process of Visual + Analytics? + + +
+ The understanding of the visual analytics process can benefit visualization
+researchers in multiple ways, including improving visual designs and developing
+advanced interaction functions. However, the log files of user behaviors are
+still hard to analyze due to the complexity of sensemaking and our lack of
+knowledge on the related user behaviors. This work presents a study based on a
+comprehensive collection of user behavior data and our analysis approach using
+time-series classification methods. We have chosen a classical visualization
+application, Covid-19 data analysis, with common analysis tasks covering
+geo-spatial, time-series, and multi-attribute data. Our user study collects
+user behaviors on a diverse set of visualization tasks with two comparable
+systems, desktop and immersive visualizations. We summarize the classification
+results with three time-series machine learning algorithms at two scales, and
+explore the influences of behavior features. Our results reveal that user
+behaviors can be distinguished during the process of visual analytics and that
+there is a potentially strong association between the physical behaviors of
+users and the visualization tasks they perform. We also demonstrate the usage
+of our models by interpreting open sessions of visual analytics, which provides
+an automatic way to study sensemaking without tedious manual annotations.
+
+
+
+ comment: This version corrects the issues of previous versions +
+
+
+
+
+ + ♻ ☆ Optimistic Natural Policy Gradient: a Simple Efficient Policy + Optimization Framework for Online RL + + +
+ While policy optimization algorithms have played an important role in the
+recent empirical success of Reinforcement Learning (RL), the existing
+theoretical understanding of policy optimization remains rather limited -- they
+are either restricted to tabular MDPs or suffer from highly suboptimal sample
+complexity, especially in online RL where exploration is necessary. This paper
+proposes a simple, efficient policy optimization framework -- Optimistic NPG --
+for online RL. Optimistic NPG can be viewed as a simple combination of the
+classic natural policy gradient (NPG) algorithm [Kakade, 2001] with optimistic
+policy evaluation subroutines to encourage exploration. For $d$-dimensional
+linear MDPs, Optimistic NPG is computationally efficient, and learns an
+$\varepsilon$-optimal policy within $\tilde{O}(d^2/\varepsilon^3)$ samples,
+which is the first computationally efficient algorithm whose sample complexity
+has the optimal dimension dependence $\tilde{\Theta}(d^2)$. It also improves
+over state-of-the-art results of policy optimization algorithms [Zanette et
+al., 2021] by a factor of $d$. In the realm of general function approximation,
+which subsumes linear MDPs, Optimistic NPG, to our best knowledge, stands as
+the first policy optimization algorithm that achieves polynomial sample
+complexity for learning near-optimal policies.
+
+
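Schematically, the update underlying Optimistic NPG is the usual exponentiated natural-policy-gradient (soft policy iteration) step with the value estimate replaced by an optimistic one; the notation below is illustrative, and the paper's precise construction of the optimistic estimate is not reproduced:

```latex
% Schematic NPG / soft policy iteration step with an optimistic value estimate
% \hat{Q}^{\mathrm{opt}}_t (illustrative only):
\pi_{t+1}(a \mid s) \;\propto\; \pi_{t}(a \mid s)\,
  \exp\!\big( \eta \, \hat{Q}^{\mathrm{opt}}_{t}(s, a) \big).
```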
+
+
+
+
+ + ♻ ☆ Universality and approximation bounds for echo state networks with + random weights + + +
+ We study the uniform approximation of echo state networks with randomly
+generated internal weights. These models, in which only the readout weights are
+optimized during training, have achieved empirical success in learning
+dynamical systems. Recent results showed that echo state networks with ReLU
+activation are universal. In this paper, we give an alternative construction
+and prove that the universality holds for general activation functions.
+Specifically, our main result shows that, under certain conditions on the
+activation function, there exists a sampling procedure for the internal weights
+so that the echo state network can approximate any continuous causal
+time-invariant operator with high probability. In particular, for ReLU
+activation, we give an explicit construction for these sampling procedures. We
+also quantify the approximation error of the constructed ReLU echo state
+networks for sufficiently regular operators.
+
+
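An illustrative sketch of the model class studied above: an echo state network whose input and recurrent weights are drawn at random and kept fixed, with only the linear readout trained (here by ridge regression). All hyperparameters and the memory task are assumptions made for the example.

```python
# Echo state network sketch: random fixed reservoir, trained linear readout.
import numpy as np

def run_reservoir(inputs, n_res=200, spectral_radius=0.9, seed=0):
    rng = np.random.default_rng(seed)
    W_in = rng.uniform(-0.5, 0.5, size=(n_res, inputs.shape[1]))
    W = rng.normal(size=(n_res, n_res))
    W *= spectral_radius / np.max(np.abs(np.linalg.eigvals(W)))  # stabilize
    h, states = np.zeros(n_res), []
    for u in inputs:                       # causal: state depends on past inputs
        h = np.tanh(W @ h + W_in @ u)
        states.append(h.copy())
    return np.array(states)

# Fit the readout on a simple 3-step memory task.
T = 500
u = np.random.default_rng(1).normal(size=(T, 1))
y = np.roll(u[:, 0], 3)
S = run_reservoir(u)
lam = 1e-6
w_out = np.linalg.solve(S.T @ S + lam * np.eye(S.shape[1]), S.T @ y)
print(np.mean((S @ w_out - y) ** 2))       # readout-only training error
```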
+
+
+
+
+ + ♻ ☆ Deep Unlearning: Fast and Efficient Training-free Approach to Controlled + Forgetting + + +
+ Machine unlearning has emerged as a prominent and challenging area of +interest, driven in large part by the rising regulatory demands for industries +to delete user data upon request and the heightened awareness of privacy. +Existing approaches either retrain models from scratch or use several +finetuning steps for every deletion request, often constrained by computational +resource limitations and restricted access to the original training data. In +this work, we introduce a novel class unlearning algorithm designed to +strategically eliminate an entire class or a group of classes from the learned +model. To that end, our algorithm first estimates the Retain Space and the +Forget Space, representing the feature or activation spaces for samples from +classes to be retained and unlearned, respectively. To obtain these spaces, we +propose a novel singular value decomposition-based technique that requires +layer wise collection of network activations from a few forward passes through +the network. We then compute the shared information between these spaces and +remove it from the forget space to isolate class-discriminatory feature space +for unlearning. Finally, we project the model weights in the orthogonal +direction of the class-discriminatory space to obtain the unlearned model. We +demonstrate our algorithm's efficacy on ImageNet using a Vision Transformer +with only $\sim$1.5% drop in retain accuracy compared to the original model +while maintaining under 1% accuracy on the unlearned class samples. Further, +our algorithm consistently performs well when subject to Membership Inference +Attacks showing 7.8% improvement on average across a variety of image +classification datasets and network architectures, as compared to other +baselines while being $\sim$6x more computationally efficient. + +
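A heavily hedged sketch of the flavour of this approach for one linear layer: estimate retain and forget activation subspaces via SVD, remove the shared component from the forget space, and project the weights away from the remaining class-discriminatory directions. The energy threshold, rank cutoff, and projection side are assumptions, not the authors' algorithm.

```python
# SVD-based unlearning sketch (illustrative only).
import numpy as np

def top_subspace(acts, energy=0.95):
    # acts: (n_samples, d) activations; returns a (d, k) orthonormal basis.
    U, s, _ = np.linalg.svd(acts.T, full_matrices=False)
    k = np.searchsorted(np.cumsum(s**2) / np.sum(s**2), energy) + 1
    return U[:, :k]

def unlearn_layer(W, retain_acts, forget_acts):
    # W: (out_dim, d) weights of a layer whose inputs are these activations.
    U_r = top_subspace(retain_acts)
    U_f = top_subspace(forget_acts)
    # remove the part of the forget space shared with the retain space
    F_disc = U_f - U_r @ (U_r.T @ U_f)
    U_d, s, _ = np.linalg.svd(F_disc, full_matrices=False)
    U_d = U_d[:, s > 1e-6]                 # class-discriminatory directions
    # project the weights orthogonally to those directions
    return W @ (np.eye(W.shape[1]) - U_d @ U_d.T)
```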
+
+
+
+
+ + ♻ ☆ Coneheads: Hierarchy Aware Attention NeurIPS 2023 + + +
+ Attention networks such as transformers have achieved state-of-the-art +performance in many domains. These networks rely heavily on the dot product +attention operator, which computes the similarity between two points by taking +their inner product. However, the inner product does not explicitly model the +complex structural properties of real world datasets, such as hierarchies +between data points. To remedy this, we introduce cone attention, a drop-in +replacement for dot product attention based on hyperbolic entailment cones. +Cone attention associates two points by the depth of their lowest common +ancestor in a hierarchy defined by hyperbolic cones, which intuitively measures +the divergence of two points and gives a hierarchy aware similarity score. We +test cone attention on a wide variety of models and tasks and show that it +improves task-level performance over dot product attention and other baselines, +and is able to match dot-product attention with significantly fewer parameters. +Our results suggest that cone attention is an effective way to capture +hierarchical relationships when calculating attention. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ On the Foundation of Distributionally Robust Reinforcement Learning + + +
+ Motivated by the need for a robust policy in the face of environment shifts +between training and deployment, we contribute to the theoretical +foundation of distributionally robust reinforcement learning (DRRL). This is +accomplished through a comprehensive modeling framework centered around +distributionally robust Markov decision processes (DRMDPs). This framework +obliges the decision maker to choose an optimal policy under the worst-case +distributional shift orchestrated by an adversary. By unifying and extending +existing formulations, we rigorously construct DRMDPs that embrace various +modeling attributes for both the decision maker and the adversary. These +attributes include adaptability granularity, covering history-dependent, +Markov, and Markov time-homogeneous decision-maker and adversary dynamics. +Additionally, we delve into the flexibility of shifts induced by the adversary, +examining SA- and S-rectangularity. Within this DRMDP framework, we investigate +conditions for the existence or absence of the dynamic programming principle +(DPP). From an algorithmic standpoint, the existence of the DPP holds significant +implications, as the vast majority of existing data-efficient and computationally +efficient RL algorithms rely on the DPP. To study its existence, we +comprehensively examine combinations of controller and adversary attributes, +providing streamlined proofs grounded in a unified methodology. We also offer +counterexamples for settings in which a DPP with full generality is absent. + +
+
+
+
+
+ + ♻ ☆ Efficient and Effective Deep Multi-view Subspace Clustering + + +
+ Recent multi-view subspace clustering achieves impressive results utilizing +deep networks, where the self-expressive correlation is typically modeled by a +fully connected (FC) layer. However, these methods still suffer from two limitations. i) +The parameter scale of the FC layer is quadratic in the number of samples, resulting +in high time and memory costs that significantly degrade their feasibility on +large-scale datasets. ii) Extracting a unified +representation that simultaneously satisfies minimal sufficiency and +discriminability remains under-explored. To this end, we propose a novel deep framework, termed +Efficient and Effective deep Multi-View Subspace Clustering (E$^2$MVSC). +Instead of a parameterized FC layer, we design a Relation-Metric Net that +decouples the network parameter scale from the number of samples for greater computational +efficiency. Most importantly, the proposed method devises a multi-type +auto-encoder to explicitly decouple consistent, complementary, and superfluous +information from every view, supervised by a soft clustering +assignment similarity constraint. Following information bottleneck theory and +the maximal coding rate reduction principle, a sufficient yet minimal unified +representation can be obtained, while pursuing intra-cluster aggregation +and inter-cluster separability within it. Extensive experiments show that +E$^2$MVSC yields comparable results to existing methods and achieves +state-of-the-art performance on various types of multi-view datasets. + +
+
+
+
+
+ + ♻ ☆ ChemSpaceAL: An Efficient Active Learning Methodology Applied to + Protein-Specific Molecular Generation + + +
+ The incredible capabilities of generative artificial intelligence models have +inevitably led to their application in the domain of drug discovery. Within +this domain, the vastness of chemical space motivates the development of more +efficient methods for identifying regions with molecules that exhibit desired +characteristics. In this work, we present a computationally efficient active +learning methodology that requires evaluation of only a subset of the generated +data in the constructed sample space to successfully align a generative model +with respect to a specified objective. We demonstrate the applicability of this +methodology to targeted molecular generation by fine-tuning a GPT-based +molecular generator toward a protein with FDA-approved small-molecule +inhibitors, c-Abl kinase. Remarkably, the model learns to generate molecules +similar to the inhibitors without prior knowledge of their existence, and even +reproduces two of them exactly. We also show that the methodology is effective +for a protein without any commercially available small-molecule inhibitors, the +HNH domain of the CRISPR-associated protein 9 (Cas9) enzyme. We believe that +the inherent generality of this method ensures that it will remain applicable +as the exciting field of in silico molecular generation evolves. To facilitate +implementation and reproducibility, we have made all of our software available +through the open-source ChemSpaceAL Python package. + +
+
+
+
+
+ + ♻ ☆ Quantitative Analysis of Primary Attribution Explainable Artificial + Intelligence Methods for Remote Sensing Image Classification + + +
+ We present a comprehensive analysis of quantitatively evaluating explainable +artificial intelligence (XAI) techniques for remote sensing image +classification. Our approach leverages state-of-the-art machine learning +approaches to perform remote sensing image classification across multiple +modalities. We investigate the results of the models qualitatively through XAI +methods. Additionally, we compare the XAI methods quantitatively through +various categories of desired properties. Through our analysis, we offer +insights and recommendations for selecting the most appropriate XAI method(s) +to gain a deeper understanding of the models' decision-making processes. The +code for this work is publicly available. + +
+
+ comment: 4 pages, 3 figures, Accepted to 2023 IGARSS Community-Contributed + Sessions - Opening the Black Box: Explainable AI/ML in Remote Sensing + Analysis +
+
+
+
+
+
+
+
+ + Multimedia 6 + +
+
+
+ + ☆ SequencePAR: Understanding Pedestrian Attributes via A Sequence + Generation Paradigm + + +
+ Current pedestrian attribute recognition (PAR) algorithms are developed based +on multi-label or multi-task learning frameworks, which aim to discriminate the +attributes using specific classification heads. However, these discriminative +models are easily influenced by imbalanced data or noisy samples. Inspired by +the success of generative models, we rethink the pedestrian attribute +recognition scheme and believe that generative models may perform better at +modeling the dependencies and complexity among human attributes. In this paper, +we propose a novel sequence generation paradigm for pedestrian attribute +recognition, termed SequencePAR. It extracts the pedestrian features using a +pre-trained CLIP model and embeds the attribute set into query tokens under the +guidance of text prompts. Then, a Transformer decoder is proposed to generate +the human attributes by incorporating the visual features and attribute query +tokens. The masked multi-head attention layer is introduced into the decoder +module to prevent the model from remembering the next attribute while making +attribute predictions during training. Extensive experiments on multiple widely +used pedestrian attribute recognition datasets fully validate the +effectiveness of our proposed SequencePAR. The source code and pre-trained +models will be released at https://github.com/Event-AHU/OpenPAR. + +
+
+ comment: In Peer Review +
+
+
+
+
+ + ☆ Learning Efficient Unsupervised Satellite Image-based Building Damage + Detection ICDM 2023 + + +
+ Existing Building Damage Detection (BDD) methods always require +labour-intensive pixel-level annotations of buildings and their conditions, +hence largely limiting their applications. In this paper, we investigate a +challenging yet practical scenario of BDD, Unsupervised Building Damage +Detection (U-BDD), where only unlabelled pre- and post-disaster satellite image +pairs are provided. As a pilot study, we have first proposed an advanced U-BDD +baseline that leverages pre-trained vision-language foundation models (i.e., +Grounding DINO, SAM and CLIP) to address the U-BDD task. However, the apparent +domain gap between satellite and generic images causes low confidence in the +foundation models used to identify buildings and their damages. In response, we +further present a novel self-supervised framework, U-BDD++, which improves upon +the U-BDD baseline by addressing domain-specific issues associated with +satellite imagery. Furthermore, the new Building Proposal Generation (BPG) +module and the CLIP-enabled noisy Building Proposal Selection (CLIP-BPS) module +in U-BDD++ ensure high-quality self-training. Extensive experiments on the +widely used building damage assessment benchmark demonstrate the effectiveness +of the proposed method for unsupervised building damage detection. The +presented annotation-free and foundation model-based paradigm ensures an +efficient learning phase. This study opens a new direction for real-world BDD +and sets a strong baseline for future research. + +
+
+ comment: ICDM 2023 +
+
+
+
+
+ + ☆ A Contrastive Compositional Benchmark for Text-to-Image Synthesis: A + Study with Unified Text-to-Image Fidelity Metrics + + +
+ Text-to-image (T2I) synthesis has recently achieved significant advancements. +However, challenges remain in the model's compositionality, which is the +ability to create new combinations from known components. We introduce +Winoground-T2I, a benchmark designed to evaluate the compositionality of T2I +models. This benchmark includes 11K complex, high-quality contrastive sentence +pairs spanning 20 categories. These contrastive sentence pairs with subtle +differences enable fine-grained evaluations of T2I synthesis models. +Additionally, to address the inconsistency across different metrics, we propose +a strategy that evaluates the reliability of various metrics by using +comparative sentence pairs. We use Winoground-T2I with a dual objective: to +evaluate the performance of T2I models and the metrics used for their +evaluation. Finally, we provide insights into the strengths and weaknesses of +these metrics and the capabilities of current T2I models in tackling challenges +across a range of complex compositional categories. Our benchmark is publicly +available at https://github.com/zhuxiangru/Winoground-T2I . + +
+
+ comment: 17 pages, 14 figures, 11 tables +
+
+
+
+
+ + ☆ X-Adapter: Adding Universal Compatibility of Plugins for Upgraded + Diffusion Model + + +
+ We introduce X-Adapter, a universal upgrader to enable pretrained +plug-and-play modules (e.g., ControlNet, LoRA) to work directly with the +upgraded text-to-image diffusion model (e.g., SDXL) without further retraining. +We achieve this goal by training an additional network to control the frozen +upgraded model with new text-image data pairs. In detail, X-Adapter keeps a +frozen copy of the old model to preserve the connectors of different plugins. +Additionally, X-Adapter adds trainable mapping layers that bridge the decoders +from models of different versions for feature remapping. The remapped features +will be used as guidance for the upgraded model. To enhance the guidance +ability of X-Adapter, we employ a null-text training strategy for the upgraded +model. After training, we also introduce a two-stage denoising strategy to +align the initial latents of X-Adapter and the upgraded model. Thanks to our +strategies, X-Adapter demonstrates universal compatibility with various plugins +and also enables plugins of different versions to work together, thereby +expanding the functionality of the diffusion community. To verify the +effectiveness of the proposed method, we conduct extensive experiments, and the +results show that X-Adapter may facilitate wider application of the upgraded +foundational diffusion model. + +
+
+ comment: Project page: https://showlab.github.io/X-Adapter/ +
+
+
+
+
+ + ♻ ☆ 360TripleView: 360-Degree Video View Management System Driven by + Convergence Value of Viewing Preferences + + +
+ 360-degree video has become increasingly popular in content consumption. +However, finding the viewing direction for important content within each frame +poses a significant challenge. Existing approaches rely on either viewer input +or algorithmic determination to select the viewing direction, but neither mode +consistently outperforms the other in terms of content-importance. In this +paper, we propose 360TripleView, the first view management system for +360-degree video that automatically infers and utilizes the better view mode +for each frame, ultimately providing viewers with higher content-importance +views. Through extensive experiments and a user study, we demonstrate that +360TripleView achieves over 90\% accuracy in inferring the better mode and +significantly enhances content-importance compared to existing methods. + +
+
+
+
+
+ + ♻ ☆ GEmo-CLAP: Gender-Attribute-Enhanced Contrastive Language-Audio + Pretraining for Accurate Speech Emotion Recognition + + +
+ Contrastive cross-modality pretraining has recently exhibited impressive +success in diverse fields, whereas there is limited research on its merits in +speech emotion recognition (SER). In this paper, we propose GEmo-CLAP, a kind +of gender-attribute-enhanced contrastive language-audio pretraining (CLAP) +method for SER. Specifically, we first construct an effective emotion CLAP +(Emo-CLAP) for SER, using pre-trained text and audio encoders. Second, given +the significance of gender information in SER, two novel multi-task learning +based GEmo-CLAP (ML-GEmo-CLAP) and soft label based GEmo-CLAP (SL-GEmo-CLAP) +models are further proposed to incorporate gender information of speech +signals, forming more reasonable objectives. Experiments on IEMOCAP indicate +that our two proposed GEmo-CLAP models consistently outperform Emo-CLAP with +different pre-trained models. Remarkably, the proposed WavLM-based SL-GEmo-CLAP +obtains the best WAR of 83.16\%, outperforming state-of-the-art +SER methods. + +
+
+ comment: 5 pages +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 31 + +
+
+
+ + ☆ Using Large Language Models to Accelerate Communication for Users with + Severe Motor Impairments + + +
+ Finding ways to accelerate text input for individuals with profound motor +impairments has been a long-standing area of research. Closing the speed gap +for augmentative and alternative communication (AAC) devices such as +eye-tracking keyboards is important for improving the quality of life for such +individuals. Recent advances in neural networks for natural language pose new +opportunities for re-thinking strategies and user interfaces for enhanced +text-entry for AAC users. In this paper, we present SpeakFaster, consisting of +large language models (LLMs) and a co-designed user interface for text entry in +a highly-abbreviated form, saving 57% more motor actions than +traditional predictive keyboards in offline simulation. A pilot study with 19 +non-AAC participants typing on a mobile device by hand demonstrated gains in +motor savings in line with the offline simulation, while introducing relatively +small effects on overall typing speed. Lab and field testing on two eye-gaze +typing users with amyotrophic lateral sclerosis (ALS) demonstrated text-entry +rates 29-60% faster than traditional baselines, due to significant saving of +expensive keystrokes achieved through phrase and word predictions from +context-aware LLMs. These findings provide a strong foundation for further +exploration of substantially-accelerated text communication for motor-impaired +users and demonstrate a direction for applying LLMs to text-based user +interfaces. + +
+
+
+
+
+ + ☆ T3D: Towards 3D Medical Image Understanding through Vision-Language + Pre-training + + +
+ Expert annotation of 3D medical images for downstream analysis is +resource-intensive, posing challenges in clinical applications. Visual +self-supervised learning (vSSL), though effective for learning visual +invariance, neglects the incorporation of domain knowledge from medicine. To +incorporate medical knowledge into visual representation learning, +vision-language pre-training (VLP) has shown promising results in 2D images. +However, existing VLP approaches become generally impractical when applied to +high-resolution 3D medical images due to GPU hardware constraints and the +potential loss of critical details caused by downsampling, which is the +intuitive solution to hardware constraints. To address the above limitations, +we introduce T3D, the first VLP framework designed for high-resolution 3D +medical images. T3D incorporates two text-informed pretext tasks: +(i) text-informed contrastive learning; +(ii) text-informed image restoration. These tasks focus on +learning 3D visual representations from high-resolution 3D medical images and +integrating clinical knowledge from radiology reports, without distorting +information through forced alignment of downsampled volumes with detailed +anatomical text. Trained on a newly curated large-scale dataset of 3D medical +images and radiology reports, T3D significantly outperforms current vSSL +methods in tasks like organ and tumor segmentation, as well as disease +classification. This underlines T3D's potential in representation learning for +3D medical image analysis. All data and code will be available upon acceptance. + +
+
+
+
+
+ + ☆ SymNoise: Advancing Language Model Fine-tuning with Symmetric Noise + + +
+ In this paper, we introduce a novel fine-tuning technique for language +models, which involves incorporating symmetric noise into the embedding +process. This method aims to enhance the model's function by more stringently +regulating its local curvature, demonstrating superior performance over the +current method, NEFTune. When fine-tuning the LLaMA-2-7B model using Alpaca, +standard techniques yield a 29.79% score on AlpacaEval. However, our approach, +SymNoise, increases this score significantly to 69.04%, using symmetric noisy +embeddings. This is a 6.7% improvement over the state-of-the-art method, +NEFTune (64.69%). Furthermore, when tested on various models and stronger +baseline instruction datasets, such as Evol-Instruct, ShareGPT, and OpenPlatypus, +SymNoise consistently outperforms NEFTune. The current literature, including +NEFTune, has underscored the importance of more in-depth research into the +application of noise-based strategies in the fine-tuning of language models. +Our approach, SymNoise, is another significant step in this direction, +showing notable improvement over the existing state-of-the-art method. + +
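A hedged PyTorch sketch of noise-injected embedding fine-tuning of the kind compared here. The NEFTune-style scaling alpha / sqrt(seq_len * dim) and the sign-symmetric perturbation below are assumptions for illustration; the precise symmetric noise used by SymNoise is defined in the paper.

```python
import torch

def noisy_embeddings(embeds, alpha=5.0, symmetric=True):
    """Add training-time noise to token embeddings of shape (batch, seq_len, dim).
    symmetric=True uses an assumed sign-symmetric (+/- eps) perturbation;
    symmetric=False reproduces NEFTune's uniform noise in [-eps, eps]."""
    batch, seq_len, dim = embeds.shape
    eps = alpha / (seq_len * dim) ** 0.5
    if symmetric:
        noise = eps * (2 * torch.randint(0, 2, embeds.shape, device=embeds.device) - 1)
    else:
        noise = torch.empty_like(embeds).uniform_(-eps, eps)
    return embeds + noise.to(embeds.dtype)

# During fine-tuning only: replace the embedding layer's output with its noisy
# version before the forward pass through the rest of the model.
```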
+
+
+
+
+ + ☆ Bigger is not Always Better: The Effect of Context Size on Speech + Pre-Training + + +
+ It has been generally assumed in the automatic speech recognition (ASR) +literature that it is better for models to have access to wider context +windows. Yet, many of the potential reasons this might be true in the +supervised setting do not necessarily transfer over to the case of unsupervised +learning. We investigate how much context is necessary to achieve high-quality +pre-trained acoustic models using self-supervised learning. We principally +investigate contrastive predictive coding (CPC), which we adapt to be able to +precisely control the amount of context visible to the model during training +and inference. We find that phone discriminability in the resulting model +representations peaks at around 40 ms of preceding context, and that having too +much context (beyond around 320 ms) substantially degrades the quality of the +representations. Surprisingly, we find that this pattern also transfers to +supervised ASR when the pre-trained representations are used as frozen input +features. Our results point to potential changes in the design of current +upstream architectures to better facilitate a variety of downstream tasks. + +
+
+ comment: Repository at https://github.com/sdrobert/scpc. This work has been + submitted to the IEEE for possible publication. Copyright may be transferred + without notice, after which this version may no longer be accessible +
+
+
+
+
+ + ☆ Tackling Bias in Pre-trained Language Models: Current Trends and + Under-represented Societies + + +
+ The benefits and capabilities of pre-trained language models (LLMs) in +current and future innovations are vital to any society. However, introducing +and using LLMs comes with biases and discrimination, resulting in concerns +about equality, diversity and fairness, and must be addressed. While +understanding and acknowledging bias in LLMs and developing mitigation +strategies are crucial, the generalised assumptions towards societal needs can +result in disadvantages towards under-represented societies and indigenous +populations. Furthermore, the ongoing changes to actual and proposed amendments +to regulations and laws worldwide also impact research capabilities in tackling +the bias problem. This research presents a comprehensive survey synthesising +the current trends and limitations in techniques used for identifying and +mitigating bias in LLMs, where the overview of methods for tackling bias are +grouped into metrics, benchmark datasets, and mitigation strategies. The +importance and novelty of this survey are that it explores the perspective of +under-represented societies. We argue that current practices tackling the bias +problem cannot simply be 'plugged in' to address the needs of under-represented +societies. We use examples from New Zealand to present requirements for +adopting existing techniques to under-represented societies. + +
+
+ comment: 38 pages, 5 figures, 11 tables. arXiv admin note: text overlap with + arXiv:2309.00770 by other authors +
+
+
+
+
+ + ☆ Effectively Fine-tune to Improve Large Multimodal Models for Radiology + Report Generation NeurIPS + 2023 + + +
+ Writing radiology reports from medical images requires a high level of domain +expertise. It is time-consuming even for trained radiologists and can be +error-prone for inexperienced radiologists. It would be appealing to automate +this task by leveraging generative AI, which has shown drastic progress in +vision and language understanding. In particular, Large Language Models (LLM) +have demonstrated impressive capabilities recently and continued to set new +state-of-the-art performance on almost all natural language tasks. While many +have proposed architectures to combine vision models with LLMs for multimodal +tasks, few have explored practical fine-tuning strategies. In this work, we +proposed a simple yet effective two-stage fine-tuning protocol to align visual +features to LLM's text embedding space as soft visual prompts. Our framework +with OpenLLaMA-7B achieved state-of-the-art level performance without +domain-specific pretraining. Moreover, we provide detailed analyses of soft +visual prompts and attention mechanisms, shedding light on future research +directions. + +
+
+ comment: Accepted to Deep Generative Models for Health Workshop at NeurIPS + 2023 +
+
+
+
+
+ + ☆ Unsupervised Approach to Evaluate Sentence-Level Fluency: Do We Really + Need Reference? AACL + + +
+ Fluency is a crucial goal of all Natural Language Generation (NLG) systems. +Widely used automatic evaluation metrics fall short in capturing the fluency of +machine-generated text. Assessing the fluency of NLG systems poses a challenge +since these models are not limited to simply reusing words from the input but +may also generate abstractions. Existing reference-based fluency evaluations, +such as word overlap measures, often exhibit weak correlations with human +judgments. This paper adapts an existing unsupervised technique for measuring +text fluency without the need for any reference. Our approach leverages various +word embeddings and trains language models using Recurrent Neural Network (RNN) +architectures. We also experiment with other available multilingual Language +Models (LMs). To assess the performance of the models, we conduct a comparative +analysis across 10 Indic languages, correlating the obtained fluency scores +with human judgments. Our code and human-annotated benchmark test-set for +fluency is available at +https://github.com/AnanyaCoder/TextFluencyForIndicLanaguges. + +
+
+ comment: Accepted at IJCNLP-AACL SEALP Workshop +
+
+
+
+
+ + ☆ D-Bot: Database Diagnosis System using Large Language Models + + +
+ Database administrators (DBAs) play an important role in managing, +maintaining and optimizing database systems. However, it is hard and tedious +for DBAs to manage a large number of databases and give timely responses +(waiting for hours is intolerable in many online cases). In addition, existing +empirical methods only support limited diagnosis scenarios and require +labor-intensive updates of diagnosis rules for each database version. +Recently, large language models (LLMs) have shown great potential in various +fields. Thus, we propose D-Bot, an LLM-based database diagnosis system that can +automatically acquire knowledge from diagnosis documents, and generate +reasonable and well-founded diagnosis reports (i.e., identifying the root causes +and solutions) within acceptable time (e.g., under 10 minutes compared to hours +by a DBA). The techniques in D-Bot include (i) offline knowledge extraction +from documents, (ii) automatic prompt generation (e.g., knowledge matching, +tool retrieval), (iii) root cause analysis using a tree search algorithm, and +(iv) a collaborative mechanism for complex anomalies with multiple root causes. +We verify D-Bot on real benchmarks (including 539 anomalies of six typical +applications), and the results show that D-Bot can effectively analyze the root +causes of unseen anomalies and significantly outperforms traditional methods +and vanilla models like GPT-4. + +
+
+
+
+
+ + ☆ Transformers are uninterpretable with myopic methods: a case study with + bounded Dyck grammars + + +
+ Interpretability methods aim to understand the algorithm implemented by a +trained model (e.g., a Transformer) by examining various aspects of the model, +such as the weight matrices or the attention patterns. In this work, through a +combination of theoretical results and carefully controlled experiments on +synthetic data, we take a critical view of methods that exclusively focus on +individual parts of the model, rather than consider the network as a whole. We +consider a simple synthetic setup of learning a (bounded) Dyck language. +Theoretically, we show that the set of models that (exactly or approximately) +solve this task satisfy a structural characterization derived from ideas in +formal languages (the pumping lemma). We use this characterization to show that +the set of optima is qualitatively rich; in particular, the attention pattern +of a single layer can be "nearly randomized", while preserving the +functionality of the network. We also show via extensive experiments that these +constructions are not merely a theoretical artifact: even after severely +constraining the architecture of the model, vastly different solutions can be +reached via standard training. Thus, interpretability claims based on +inspecting individual heads or weight matrices in the Transformer can be +misleading. + +
+
+
+
+
+ + ☆ Towards Mitigating Perceived Unfairness in Contracts from a Non-Legal + Stakeholder's Perspective EMNLP 2023 + + +
+ Commercial contracts are known to be a valuable source for deriving +project-specific requirements. However, contract negotiations mainly occur +among the legal counsel of the parties involved. The participation of non-legal +stakeholders, including requirement analysts, engineers, and solution +architects, whose primary responsibility lies in ensuring the seamless +implementation of contractual terms, is often indirect and inadequate. +Consequently, a significant number of sentences in contractual clauses, though +legally accurate, can appear unfair from an implementation perspective to +non-legal stakeholders. This perception poses a problem since requirements +indicated in the clauses are obligatory and can involve punitive measures and +penalties if not implemented as committed in the contract. Therefore, the +identification of potentially unfair clauses in contracts becomes crucial. In +this work, we conduct an empirical study to analyze the perspectives of +different stakeholders regarding contractual fairness. We then investigate the +ability of Pre-trained Language Models (PLMs) to identify unfairness in +contractual sentences by comparing chain of thought prompting and +semi-supervised fine-tuning approaches. Using BERT-based fine-tuning, we +achieved an accuracy of 84% on a dataset consisting of proprietary contracts. +It outperformed chain of thought prompting using Vicuna-13B by a margin of 9%. + +
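For concreteness, a generic sketch of the BERT-based fine-tuning baseline mentioned above: binary fair/unfair sentence classification with Hugging Face Transformers. The checkpoint, label convention, and example sentence are placeholders; the paper's contracts dataset is proprietary and its exact training setup is not reproduced here.

```python
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Placeholder checkpoint; the paper fine-tunes a BERT-based model on proprietary contracts.
tok = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

# Illustrative contractual sentence and label (1 = perceived unfair).
batch = tok(["The vendor may terminate the agreement at any time without notice."],
            return_tensors="pt", padding=True, truncation=True)
labels = torch.tensor([1])

loss = model(**batch, labels=labels).loss
loss.backward()  # an optimizer step would follow in a real training loop
```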
+
+ comment: 9 pages, 2 figures, to be published in Natural Legal Language + Processing Workshop at EMNLP 2023 +
+
+
+
+
+ + ☆ CEScore: Simple and Efficient Confidence Estimation Model for Evaluating + Split and Rephrase + + +
+ The split and rephrase (SR) task aims to divide a long, complex sentence into +a set of shorter, simpler sentences that convey the same meaning. This +challenging problem in NLP has gained increased attention recently because of +its benefits as a pre-processing step in other NLP tasks. Evaluating the quality of +SR is challenging, as there is no automatic metric fit to evaluate this task. In +this work, we introduce CEScore, a novel statistical model to automatically +evaluate the SR task. By mimicking the way humans evaluate SR, CEScore provides four +metrics (Sscore, Gscore, Mscore, and CEscore) to assess simplicity, +grammaticality, meaning preservation, and overall quality, respectively. In +experiments with 26 models, CEScore correlates strongly with human evaluations, +achieving a Spearman correlation of 0.98 at the model level. This underscores the +potential of CEScore as a simple and effective metric for assessing the overall +quality of SR models. + +
+
+
+
+
+ + ☆ AI-Powered Arabic Crossword Puzzle Generation for Educational + Applications EMNLP 2023 + + +
+ This paper presents the first Arabic crossword puzzle generator driven by +advanced AI technology. Leveraging cutting-edge large language models including +GPT4, GPT3-Davinci, GPT3-Curie, GPT3-Babbage, GPT3-Ada, and BERT, the system +generates distinctive and challenging clues. Based on a dataset comprising over +50,000 clue-answer pairs, the generator employs fine-tuning, few/zero-shot +learning strategies, and rigorous quality-checking protocols to enforce the +generation of high-quality clue-answer pairs. Importantly, educational +crosswords contribute to enhancing memory, expanding vocabulary, and promoting +problem-solving skills, thereby augmenting the learning experience through a +fun and engaging approach, reshaping the landscape of traditional learning +methods. The overall system can be exploited as a powerful educational tool +that amalgamates AI and innovative learning techniques, heralding a +transformative era for Arabic crossword puzzles and the intersection of +technology and education. + +
+
+ comment: Accepted Paper for ArabicNLP 2023 - The First Arabic Natural Language + Processing Conference - Co-located with EMNLP 2023 in Singapore +
+
+
+
+
+ + ☆ NLEBench+NorGLM: A Comprehensive Empirical Analysis and Benchmark + Dataset for Generative Language Models in Norwegian + + +
+ Recent advancements in Generative Language Models (GLMs) have transformed +Natural Language Processing (NLP) by showcasing the effectiveness of the +"pre-train, prompt, and predict" paradigm in utilizing pre-trained GLM +knowledge for diverse applications. Despite their potential, these capabilities +lack adequate quantitative characterization due to the absence of comprehensive +benchmarks, particularly for low-resource languages. Existing low-resource +benchmarks focus on discriminative language models like BERT, neglecting the +evaluation of generative language models. Moreover, current benchmarks often +overlook measuring generalization performance across multiple tasks, a crucial +metric for GLMs. + To bridge these gaps, we introduce NLEBench, a comprehensive benchmark +tailored for evaluating natural language generation capabilities in Norwegian, +a low-resource language. We use Norwegian as a case study to explore whether +current GLMs and benchmarks in mainstream languages like English can reveal the +unique characteristics of underrepresented languages. NLEBench encompasses a +suite of real-world NLP tasks ranging from news storytelling, summarization, +open-domain conversation, natural language understanding, instruction +fine-tuning, toxicity and bias evaluation, to self-curated Chain-of-Thought +investigation. It features two high-quality, human-annotated datasets: an +instruction dataset covering traditional Norwegian cultures, idioms, slang, and +special expressions, and a document-grounded multi-label dataset for topic +classification, question answering, and summarization. This paper also +introduces foundational Norwegian Generative Language Models (NorGLMs) +developed with diverse parameter scales and Transformer-based architectures. +Systematic evaluations on the proposed benchmark suite provide insights into +the capabilities and scalability of NorGLMs across various downstream tasks. + +
+
+
+
+
+ + ☆ Bridging Background Knowledge Gaps in Translation with Automatic + Explicitation EMNLP2023 + + +
+ Translations help people understand content written in another language. +However, even correct literal translations do not fulfill that goal when people +lack the necessary background to understand them. Professional translators +incorporate explicitations to explain the missing context by considering +cultural differences between source and target audiences. Despite its potential +to help users, NLP research on explicitation is limited because of the dearth +of adequate evaluation methods. This work introduces techniques for +automatically generating explicitations, motivated by WikiExpl: a dataset that +we collect from Wikipedia and annotate with human translators. The resulting +explicitations are useful as they help answer questions more accurately in a +multilingual question answering framework. + +
+
+ comment: EMNLP2023 +
+
+
+
+
+ + ☆ On Significance of Subword tokenization for Low Resource and Efficient + Named Entity Recognition: A case study in Marathi + + +
+ Named Entity Recognition (NER) systems play a vital role in NLP applications +such as machine translation, summarization, and question-answering. These +systems identify named entities, which encompass real-world concepts like +locations, persons, and organizations. Despite extensive research on NER +systems for the English language, they have not received adequate attention in +the context of low-resource languages. In this work, we focus on NER for +low-resource languages and present our case study in the context of the Indian +language Marathi. The advancement of NLP research revolves around the +utilization of pre-trained transformer models such as BERT for the development +of NER models. However, we focus on improving the performance of shallow models +based on CNN and LSTM by combining the best of both worlds. In the era of +transformers, these traditional deep learning models are still relevant because +of their high computational efficiency. We propose a hybrid approach for +efficient NER by integrating a BERT-based subword tokenizer into vanilla +CNN/LSTM models. We show that this simple approach of replacing a traditional +word-based tokenizer with a BERT tokenizer brings the accuracy of vanilla +single-layer models closer to that of deep pre-trained models like BERT. We +show the importance of using sub-word tokenization for NER and present our +study toward building efficient NLP systems. The evaluation is performed on +the L3Cube-MahaNER dataset using tokenizers from MahaBERT, MahaGPT, IndicBERT, and +mBERT. + +
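A minimal sketch of the hybrid idea described above: feed subword ids from a pre-trained BERT tokenizer into a small BiLSTM tagger. The mBERT checkpoint is one of the tokenizers named in the abstract; the tagger architecture, sizes, and tag count are illustrative, not the authors' exact model.

```python
import torch
import torch.nn as nn
from transformers import AutoTokenizer

# mBERT tokenizer (one of the tokenizers mentioned); others would be swapped in by hub id.
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

class SubwordLSTMTagger(nn.Module):
    """Shallow BiLSTM over BERT subword ids producing per-subword tag logits."""
    def __init__(self, vocab_size, n_tags, emb_dim=128, hidden=256):
        super().__init__()
        self.emb = nn.Embedding(vocab_size, emb_dim, padding_idx=0)
        self.lstm = nn.LSTM(emb_dim, hidden, batch_first=True, bidirectional=True)
        self.out = nn.Linear(2 * hidden, n_tags)

    def forward(self, input_ids):
        h, _ = self.lstm(self.emb(input_ids))
        return self.out(h)

enc = tokenizer(["Example sentence for NER"], return_tensors="pt")
model = SubwordLSTMTagger(tokenizer.vocab_size, n_tags=5)
logits = model(enc["input_ids"])    # shape: (1, num_subwords, 5)
```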
+
+ comment: Accepted at ICDAM 2023 +
+
+
+
+
+ + ☆ TextGenSHAP: Scalable Post-hoc Explanations in Text Generation with Long + Documents + + +
+ Large language models (LLMs) have attracted huge interest in practical +applications given their increasingly accurate responses and coherent reasoning +abilities. Given their nature as black-boxes using complex reasoning processes +on their inputs, it is inevitable that the demand for scalable and faithful +explanations for LLMs' generated content will continue to grow. There have been +major developments in the explainability of neural network models over the past +decade. Among them, post-hoc explainability methods, especially Shapley values, +have proven effective for interpreting deep learning models. However, there are +major challenges in scaling up Shapley values for LLMs, particularly when +dealing with long input contexts containing thousands of tokens and +autoregressively generated output sequences. Furthermore, it is often unclear +how to effectively utilize generated explanations to improve the performance of +LLMs. In this paper, we introduce TextGenSHAP, an efficient post-hoc +explanation method incorporating LM-specific techniques. We demonstrate that +this leads to significant increases in speed compared to conventional Shapley +value computations, reducing processing times from hours to minutes for +token-level explanations, and to just seconds for document-level explanations. +In addition, we demonstrate how real-time Shapley values can be utilized in two +important scenarios, providing better understanding of long-document question +answering by localizing important words and sentences; and improving existing +document retrieval systems through enhancing the accuracy of selected passages +and ultimately the final responses. + +
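For background, a plain Monte Carlo Shapley estimator over input tokens, i.e. the kind of attribution computation the paper accelerates. The scoring function, sample count, and toy example are placeholders; none of the LM-specific speed-ups described in the abstract are shown.

```python
import random

def shapley_attributions(tokens, score_fn, n_samples=200, seed=0):
    """Monte Carlo Shapley estimates for token-level attributions.
    score_fn(subset_of_tokens) -> float, e.g. the log-probability of a fixed
    generated answer given only that subset of the input."""
    rng = random.Random(seed)
    n = len(tokens)
    phi = [0.0] * n
    for _ in range(n_samples):
        perm = list(range(n))
        rng.shuffle(perm)
        included = []
        prev = score_fn([tokens[i] for i in included])
        for i in perm:
            included.append(i)
            cur = score_fn([tokens[j] for j in sorted(included)])
            phi[i] += (cur - prev) / n_samples   # marginal contribution of token i
            prev = cur
    return phi

# Toy scoring function that just counts "important" tokens.
toy_score = lambda toks: sum(t in {"paris", "capital"} for t in toks)
print(shapley_attributions(["the", "capital", "of", "france", "is", "paris"], toy_score))
```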
+
+
+
+
+ + ☆ Running cognitive evaluations on large language models: The do's and the + don'ts + + +
+ In this paper, I describe methodological considerations for studies that aim +to evaluate the cognitive capacities of large language models (LLMs) using +language-based behavioral assessments. Drawing on three case studies from the +literature (a commonsense knowledge benchmark, a theory of mind evaluation, and +a test of syntactic agreement), I describe common pitfalls that might arise +when applying a cognitive test to an LLM. I then list 10 do's and don'ts that +should help design high-quality cognitive evaluations for AI systems. I +conclude by discussing four areas where the do's and don'ts are currently under +active discussion -- prompt sensitivity, cultural and linguistic diversity, +using LLMs as research assistants, and running evaluations on open vs. closed +LLMs. Overall, the goal of the paper is to contribute to the broader discussion +of best practices in the rapidly growing field of AI Psychology. + +
+
+
+
+
+ + ☆ Behind the Magic, MERLIM: Multi-modal Evaluation Benchmark for Large + Image-Language Models + + +
+ Large Vision and Language Models have enabled significant advances in fully +supervised and zero-shot vision tasks. These large pre-trained architectures +serve as the baseline for what is currently known as Instruction Tuning Large +Vision and Language models (IT-LVLMs). IT-LVLMs are general-purpose multi-modal +assistants whose responses are modulated by natural language instructions and +arbitrary visual data. Despite this versatility, IT-LVLM effectiveness in +fundamental computer vision problems remains unclear, primarily due to the +absence of a standardized evaluation benchmark. This paper introduces a +Multi-modal Evaluation Benchmark named MERLIM, a scalable test-bed to assess +the performance of IT-LVLMs on fundamental computer vision tasks. MERLIM +contains over 279K image-question pairs, and has a strong focus on detecting +cross-modal "hallucination" events in IT-LVLMs, where the language output +refers to visual concepts that lack any effective grounding in the image. Our +results show that state-of-the-art IT-LVLMs are still limited at identifying +fine-grained visual concepts, object hallucinations are common across tasks, +and their results are strongly biased by small variations in the input query, +even if the queries have the very same semantics. Our findings also suggest +that these models have weak visual groundings, but they can still make adequate +guesses from global visual patterns or textual biases contained in the LLM +component. + +
+
+ comment: 11 pages, 7 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ Why Does ChatGPT Fall Short in Providing Truthful Answers? + + +
+ Recent advancements in large language models, such as ChatGPT, have +demonstrated significant potential to impact various aspects of human life. +However, ChatGPT still faces challenges in providing reliable and accurate +answers to user questions. To better understand the model's particular +weaknesses in providing truthful answers, we embark on an in-depth exploration of +open-domain question answering. Specifically, we undertake a detailed +examination of ChatGPT's failures, categorized into comprehension, factuality, +specificity, and inference. We further pinpoint factuality as the most +contributing failure and identify two critical abilities associated with +factuality: knowledge memorization and knowledge recall. Through experiments +focusing on factuality, we propose several potential enhancement strategies. +Our findings suggest that augmenting the model with granular external knowledge +and cues for knowledge recall can enhance the model's factuality in answering +questions. + +
+
+
+
+
+ + ♻ ☆ Tree of Thoughts: Deliberate Problem Solving with Large Language Models NeurIPS 2023 + + +
+ Language models are increasingly being deployed for general problem solving +across a wide range of tasks, but are still confined to token-level, +left-to-right decision-making processes during inference. This means they can +fall short in tasks that require exploration, strategic lookahead, or where +initial decisions play a pivotal role. To surmount these challenges, we +introduce a new framework for language model inference, Tree of Thoughts (ToT), +which generalizes over the popular Chain of Thought approach to prompting +language models, and enables exploration over coherent units of text (thoughts) +that serve as intermediate steps toward problem solving. ToT allows LMs to +perform deliberate decision making by considering multiple different reasoning +paths and self-evaluating choices to decide the next course of action, as well +as looking ahead or backtracking when necessary to make global choices. Our +experiments show that ToT significantly enhances language models' +problem-solving abilities on three novel tasks requiring non-trivial planning +or search: Game of 24, Creative Writing, and Mini Crosswords. For instance, in +Game of 24, while GPT-4 with chain-of-thought prompting only solved 4% of +tasks, our method achieved a success rate of 74%. Code repo with all prompts: +https://github.com/princeton-nlp/tree-of-thought-llm. + +
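A schematic breadth-first version of the loop described above: propose candidate intermediate "thoughts", score them via self-evaluation, keep the most promising states, and expand again. The `propose` and `evaluate` callables stand in for LM calls and are assumptions of this sketch; the official prompts and code are in the linked repository.

```python
def tree_of_thoughts(problem, propose, evaluate, breadth=5, keep=2, depth=3):
    """Breadth-first sketch of a ToT-style search.
    propose(problem, state, n) -> list of candidate next thoughts (strings).
    evaluate(problem, state)   -> numeric promise score for a partial solution."""
    frontier = [""]                              # partial solutions (concatenated thoughts)
    for _ in range(depth):
        candidates = []
        for state in frontier:
            for thought in propose(problem, state, n=breadth):
                candidates.append(state + thought)
        # Self-evaluation: keep only the `keep` most promising states.
        candidates.sort(key=lambda s: evaluate(problem, s), reverse=True)
        frontier = candidates[:keep]
    return frontier[0]                           # best partial/complete solution found
```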
+
+ comment: NeurIPS 2023 camera ready version. Code repo with all prompts: + https://github.com/princeton-nlp/tree-of-thought-llm +
+
+
+
+
+ + ♻ ☆ HistAlign: Improving Context Dependency in Language Generation by + Aligning with History EMNLP 2023 + + +
+ Language models (LMs) can generate hallucinations and incoherent outputs, +which highlights their weak context dependency. Cache-LMs, which augment LMs +with a memory of recent history, can increase context dependency and have shown +remarkable performance in diverse language generation tasks. However, we find +that even with training, the performance gain stemming from the cache component +of current cache-LMs is suboptimal due to the misalignment between the current +hidden states and those stored in the memory. In this work, we present +HistAlign, a new training approach to ensure good cache alignment such that the +model receives useful signals from the history. We first prove our concept on a +simple and synthetic task where the memory is essential for correct +predictions, and we show that the cache component of HistAlign is better +aligned and improves overall performance. Next, we evaluate HistAlign on +diverse downstream language generation tasks, including prompt continuation, +abstractive summarization, and data-to-text. We demonstrate that HistAlign +improves text coherence and faithfulness in open-ended and conditional +generation settings respectively. HistAlign is also generalizable across +different model families, showcasing its strength in improving context +dependency of LMs in diverse scenarios. Our code is publicly available at +https://github.com/meetdavidwan/histalign + +
+
+ comment: EMNLP 2023 (20 pages) +
+
+
+
+
+ + ♻ ☆ HuggingGPT: Solving AI Tasks with ChatGPT and its Friends in Hugging + Face + + +
+ Solving complicated AI tasks with different domains and modalities is a key +step toward artificial general intelligence. While there are numerous AI models +available for various domains and modalities, they cannot handle complicated AI +tasks autonomously. Considering large language models (LLMs) have exhibited +exceptional abilities in language understanding, generation, interaction, and +reasoning, we advocate that LLMs could act as a controller to manage existing +AI models to solve complicated AI tasks, with language serving as a generic +interface to empower this. Based on this philosophy, we present HuggingGPT, an +LLM-powered agent that leverages LLMs (e.g., ChatGPT) to connect various AI +models in machine learning communities (e.g., Hugging Face) to solve AI tasks. +Specifically, we use ChatGPT to conduct task planning when receiving a user +request, select models according to their function descriptions available in +Hugging Face, execute each subtask with the selected AI model, and summarize +the response according to the execution results. By leveraging the strong +language capability of ChatGPT and abundant AI models in Hugging Face, +HuggingGPT can tackle a wide range of sophisticated AI tasks spanning different +modalities and domains and achieve impressive results in language, vision, +speech, and other challenging tasks, which paves a new way towards the +realization of artificial general intelligence. + +
+
+
+
+
+ + ♻ ☆ Social Contract AI: Aligning AI Assistants with Implicit Group Norms NeurIPS 2023 + + +
+ We explore the idea of aligning an AI assistant by inverting a model of +users' (unknown) preferences from observed interactions. To validate our +proposal, we run proof-of-concept simulations in the economic ultimatum game, +formalizing user preferences as policies that guide the actions of simulated +players. We find that the AI assistant accurately aligns its behavior to match +standard policies from the economic literature (e.g., selfish, altruistic). +However, the assistant's learned policies lack robustness and exhibit limited +generalization in an out-of-distribution setting when confronted with a +currency (e.g., grams of medicine) that was not included in the assistant's +training distribution. Additionally, we find that when there is inconsistency +in the relationship between language use and an unknown policy (e.g., an +altruistic policy combined with rude language), the assistant's learning of the +policy is slowed. Overall, our preliminary results suggest that developing +simulation frameworks in which AI assistants need to infer preferences from +diverse users can provide a valuable approach for studying practical alignment +questions. + +
+
+ comment: SoLaR NeurIPS 2023 Workshop (https://solar-neurips.github.io/) +
+
+
+
+
+ + ♻ ☆ The Goldilocks of Pragmatic Understanding: Fine-Tuning Strategy Matters + for Implicature Resolution by LLMs NeurIPS 2023 + + +
+ Despite widespread use of LLMs as conversational agents, evaluations of +performance fail to capture a crucial aspect of communication: interpreting +language in context -- incorporating its pragmatics. Humans interpret language +using beliefs and prior knowledge about the world. For example, we intuitively +understand the response "I wore gloves" to the question "Did you leave +fingerprints?" as meaning "No". To investigate whether LLMs have the ability to +make this type of inference, known as an implicature, we design a simple task +and evaluate four categories of widely used state-of-the-art models. We find +that, despite only evaluating on utterances that require a binary inference +(yes or no), models in three of these categories perform close to random. +However, LLMs instruction-tuned at the example-level perform significantly +better. These results suggest that certain fine-tuning strategies are far +better at inducing pragmatic understanding in models. We present our findings +as the starting point for further research into evaluating how LLMs interpret +language in context and to drive the development of more pragmatic and useful +models of human discourse. + +
+
+ comment: Accepted as Spotlight at NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Multi-modal Molecule Structure-text Model for Text-based Retrieval and + Editing + + +
+ There is increasing adoption of artificial intelligence in drug discovery. +However, existing studies use machine learning to mainly utilize the chemical +structures of molecules but ignore the vast textual knowledge available in +chemistry. Incorporating textual knowledge enables us to realize new drug +design objectives, adapt to text-based instructions and predict complex +biological activities. Here we present a multi-modal molecule structure-text +model, MoleculeSTM, by jointly learning molecules' chemical structures and +textual descriptions via a contrastive learning strategy. To train MoleculeSTM, +we construct a large multi-modal dataset, namely, PubChemSTM, with over 280,000 +chemical structure-text pairs. To demonstrate the effectiveness and utility of +MoleculeSTM, we design two challenging zero-shot tasks based on text +instructions, including structure-text retrieval and molecule editing. +MoleculeSTM has two main properties: open vocabulary and compositionality via +natural language. In experiments, MoleculeSTM obtains the state-of-the-art +generalization ability to novel biochemical concepts across various benchmarks. + +
+
+
+
+
+ + ♻ ☆ RS5M: A Large Scale Vision-Language Dataset for Remote Sensing + Vision-Language Foundation Model + + +
+ Pre-trained Vision-Language Models (VLMs) utilizing extensive image-text +paired data have demonstrated unprecedented image-text association +capabilities, achieving remarkable results across various downstream tasks. A +critical challenge is how to make use of existing large-scale pre-trained VLMs, +which are trained on common objects, to perform the domain-specific transfer +for accomplishing domain-related downstream tasks. In this paper, we propose a new framework that +includes the Domain pre-trained Vision-Language Model (DVLM), bridging the gap +between the General Vision-Language Model (GVLM) and domain-specific downstream +tasks. Moreover, we present an image-text paired dataset in the field of remote +sensing (RS), RS5M, which has 5 million RS images with English descriptions. +The dataset is obtained from filtering publicly available image-text paired +datasets and captioning label-only RS datasets with pre-trained VLM. These +constitute the first large-scale RS image-text paired dataset. Additionally, we +fine-tuned the CLIP model and tried several Parameter-Efficient Fine-Tuning +methods on RS5M to implement the DVLM. Experimental results show that our +proposed dataset is highly effective for various tasks, and our model GeoRSCLIP +improves upon the baseline or previous state-of-the-art model by $3\%\sim20\%$ +in Zero-shot Classification (ZSC), $3\%\sim6\%$ in Remote Sensing Cross-Modal +Text-Image Retrieval (RSCTIR) and $4\%\sim5\%$ in Semantic Localization (SeLo) +tasks. Dataset and models have been released in: +\url{https://github.com/om-ai-lab/RS5M}. + +
+
+ comment: RS5M dataset v5 +
+
+
+
+
+ + ♻ ☆ Calibrated Language Models Must Hallucinate + + +
+ Recent language models generate false but plausible-sounding text with +surprising frequency. Such "hallucinations" are an obstacle to the usability of +language-based AI systems and can harm people who rely upon their outputs. This +work shows that there is an inherent statistical lower bound on the rate +at which pretrained language models hallucinate certain types of facts, having +nothing to do with the transformer LM architecture or data quality. For +"arbitrary" facts whose veracity cannot be determined from the training data, +we show that hallucinations must occur at a certain rate for language models +that satisfy a statistical calibration condition appropriate for generative +language models. Specifically, if the maximum probability of any fact is +bounded, we show that the probability of generating a hallucination is close to +the fraction of facts that occur exactly once in the training data (a +"Good-Turing" estimate), even assuming ideal training data without errors. + One conclusion is that models pretrained to be sufficiently good predictors +(i.e., calibrated) may require post-training to mitigate hallucinations on the +type of arbitrary facts that tend to appear once in the training set. However, +our analysis also suggests that there is no statistical reason that pretraining +will lead to hallucination on facts that tend to appear more than once in the +training data (like references to publications such as articles and books, +whose hallucinations have been particularly notable and problematic) or on +systematic facts (like arithmetic calculations). Therefore, different +architectures and learning algorithms may mitigate these latter types of +hallucinations. + +
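A small sketch to make the Good-Turing-style quantity referenced above concrete: the fraction of training-fact occurrences that are singletons. Whether the paper's bound is stated over occurrences or over distinct facts is not settled here; the toy data and function name are purely illustrative.

```python
from collections import Counter

def singleton_fraction(facts):
    """Good-Turing-style estimate: share of fact occurrences whose fact appears
    exactly once in the training data (roughly the hallucination rate the paper
    argues a calibrated LM must incur on arbitrary facts)."""
    counts = Counter(facts)
    n_total = sum(counts.values())
    n_singletons = sum(1 for c in counts.values() if c == 1)
    return n_singletons / n_total

facts = ["a cites b", "c born 1970", "c born 1970", "d wrote e", "f lives in g"]
print(singleton_fraction(facts))  # 3 of 5 occurrences are singleton facts -> 0.6
```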
+
+
+
+
+ + ♻ ☆ AlignBench: Benchmarking Chinese Alignment of Large Language Models + + +
+ Alignment has become a critical step for instruction-tuned Large Language +Models (LLMs) to become helpful assistants. However, effective evaluation of +alignment for emerging Chinese LLMs is still significantly lacking, calling for +real-scenario grounded, open-ended, challenging and automatic evaluations +tailored for alignment. To fill in this gap, we introduce AlignBench, a +comprehensive multi-dimensional benchmark for evaluating LLMs' alignment in +Chinese. Equipped with a human-in-the-loop data curation pipeline, our +benchmark employs a rule-calibrated multi-dimensional LLM-as-Judge with +Chain-of-Thought to generate explanations and final ratings as evaluations, +ensuring high reliability and interpretability. Furthermore, we report +AlignBench evaluated by CritiqueLLM, a dedicated Chinese evaluator LLM that +recovers 95% of GPT-4's evaluation ability. We will provide public APIs for +evaluating AlignBench with CritiqueLLM to facilitate the evaluation of LLMs' +Chinese alignment. All evaluation codes, data, and LLM generations are +available at \url{https://github.com/THUDM/AlignBench}. + +
+
+
+
+
+ + ♻ ☆ A Systematic Review of Aspect-based Sentiment Analysis (ABSA): Domains, + Methods, and Trends + + +
+ Aspect-based Sentiment Analysis (ABSA) is a type of fine-grained sentiment +analysis (SA) that identifies aspects and the associated opinions from a given +text. In the digital era, ABSA gained increasing popularity and applications in +mining opinionated text data to obtain insights and support decisions. ABSA +research employs linguistic, statistical, and machine-learning approaches and +utilises resources such as labelled datasets, aspect and sentiment lexicons and +ontology. By its nature, ABSA is domain-dependent and can be sensitive to the +impact of misalignment between the resource and application domains. However, +to our knowledge, this topic has not been explored by the existing ABSA +literature reviews. In this paper, we present a Systematic Literature Review +(SLR) of ABSA studies with a focus on the research application domain, dataset +domain, and the research methods to examine their relationships and identify +trends over time. Our results suggest a number of potential systemic issues in +the ABSA research literature, including the predominance of the +``product/service review'' dataset domain among the majority of studies that +did not have a specific research application domain, coupled with the +prevalence of dataset-reliant methods such as supervised machine learning. This +review makes a number of unique contributions to the ABSA research field: 1) To +our knowledge, it is the first SLR that links the research domain, dataset +domain, and research method through a systematic perspective; 2) it is one of +the largest scoped SLR on ABSA, with 519 eligible studies filtered from 4191 +search results without time constraint; and 3) our review methodology adopted +an innovative automatic filtering process based on PDF-mining, which enhanced +screening quality and reliability. Suggestions and our review limitations are +also discussed. + +
+
+
+
+
+ + ♻ ☆ Token-Scaled Logit Distillation for Ternary Weight Generative Language + Models NeurIPS 2023 + + +
+ Generative Language Models (GLMs) have shown impressive performance in tasks +such as text generation, understanding, and reasoning. However, the large model +size poses challenges for practical deployment. To solve this problem, +Quantization-Aware Training (QAT) has become increasingly popular. However, +current QAT methods for generative models have resulted in a noticeable loss of +accuracy. To counteract this issue, we propose a novel knowledge distillation +method specifically designed for GLMs. Our method, called token-scaled logit +distillation, prevents overfitting and provides superior learning from the +teacher model and ground truth. This research marks the first evaluation of +ternary weight quantization-aware training of large-scale GLMs with less than +1.0 degradation in perplexity and achieves enhanced accuracy in tasks like +common-sense QA and arithmetic reasoning as well as natural language +understanding. Our code is available at https://github.com/aiha-lab/TSLD. + +
+
+ comment: NeurIPS 2023 Camera Ready +
+
+
+
+
+ + ♻ ☆ ChipNeMo: Domain-Adapted LLMs for Chip Design + + +
+ ChipNeMo aims to explore the applications of large language models (LLMs) for industrial chip design. Instead of directly deploying off-the-shelf commercial or open-source LLMs, we adopt the following domain adaptation techniques: custom tokenizers, domain-adaptive continued pretraining, supervised fine-tuning (SFT) with domain-specific instructions, and domain-adapted retrieval models. We evaluate these methods on three selected LLM applications for chip design: an engineering assistant chatbot, EDA script generation, and bug summarization and analysis. Our results show that these domain adaptation techniques enable significant LLM performance improvements over general-purpose base models across the three evaluated applications, enabling up to 5x model size reduction with similar or better performance on a range of design tasks. Our findings also indicate that there is still a gap between our current results and ideal outcomes. We believe that further investigation of domain-adapted LLM approaches will help close this gap in the future.
+
+
+
+
+
+
+
+ + Machine Learning 16 + +
+
+
+ + ☆ Revisiting Non-separable Binary Classification and its Applications in + Anomaly Detection + + +
+ The inability to linearly classify XOR has motivated much of deep learning. +We revisit this age-old problem and show that linear classification of XOR is +indeed possible. Instead of separating data between halfspaces, we propose a +slightly different paradigm, equality separation, that adapts the SVM objective +to distinguish data within or outside the margin. Our classifier can then be +integrated into neural network pipelines with a smooth approximation. From its +properties, we intuit that equality separation is suitable for anomaly +detection. To formalize this notion, we introduce closing numbers, a +quantitative measure on the capacity for classifiers to form closed decision +regions for anomaly detection. Springboarding from this theoretical connection +between binary classification and anomaly detection, we test our hypothesis on +supervised anomaly detection experiments, showing that equality separation can +detect both seen and unseen anomalies. + +
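+ The following minimal numpy sketch (an illustration of the idea only, not the authors' code) shows how a single linear functional can label XOR when points are classified by whether they fall within a margin of the hyperplane rather than by which side of it they lie on.
+ import numpy as np
+
+ # XOR data: label 1 for (0,1) and (1,0), label 0 for (0,0) and (1,1)
+ X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
+ y = np.array([0, 1, 1, 0])
+
+ # Equality separation: classify by |w.x + b| < margin instead of sign(w.x + b)
+ w, b, margin = np.array([1.0, 1.0]), -1.0, 0.5
+ pred = (np.abs(X @ w + b) < margin).astype(int)
+ print(np.array_equal(pred, y))  # True -> XOR is classified by one linear functional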
+
+ comment: Code: https://github.com/mattlaued/XOR-is-Linearly-Classifiable +
+
+
+
+
+ + ☆ Recurrent Distance-Encoding Neural Networks for Graph Representation + Learning + + +
+ Graph neural networks based on iterative one-hop message passing have been +shown to struggle in harnessing information from distant nodes effectively. +Conversely, graph transformers allow each node to attend to all other nodes +directly, but suffer from high computational complexity and have to rely on +ad-hoc positional encoding to bake in the graph inductive bias. In this paper, +we propose a new architecture to reconcile these challenges. Our approach stems +from the recent breakthroughs in long-range modeling provided by deep +state-space models on sequential data: for a given target node, our model +aggregates other nodes by their shortest distances to the target and uses a +parallelizable linear recurrent network over the chain of distances to provide +a natural encoding of its neighborhood structure. With no need for positional +encoding, we empirically show that the performance of our model is highly +competitive compared with that of state-of-the-art graph transformers on +various benchmarks, at a drastically reduced computational complexity. In +addition, we show that our model is theoretically more expressive than one-hop +message passing neural networks. + +
+
+
+
+
+ + ☆ Unlocking the Potential of Federated Learning: The Symphony of Dataset + Distillation via Deep Generative Latents + + +
+ Data heterogeneity presents significant challenges for federated learning (FL). Recently, dataset distillation techniques, performed at the client level, have been introduced to mitigate some of these challenges. In this paper, we propose a highly efficient FL dataset distillation framework on the server side, significantly reducing both the computational and communication demands on local devices while enhancing the clients' privacy. Unlike previous strategies that perform dataset distillation on local devices and upload synthetic data to the server, our technique enables the server to leverage prior knowledge from pre-trained deep generative models to synthesize essential data representations from a heterogeneous model architecture. This process allows local devices to train smaller surrogate models while enabling the training of a larger global model on the server, effectively minimizing resource utilization. We substantiate our claim with a theoretical analysis, demonstrating the asymptotic resemblance of the process to the hypothetical ideal of completely centralized training on a heterogeneous dataset. Empirical evidence from our comprehensive experiments indicates our method's superiority, delivering an accuracy enhancement of up to 40% over non-dataset-distillation techniques in highly heterogeneous FL contexts, and surpassing existing dataset-distillation methods by 18%. In addition to the high accuracy, our framework converges faster than the baselines because the server trains on a single multi-modal distribution rather than on several sets of heterogeneous data distributions. Our code is available at https://github.com/FedDG23/FedDG-main.git
+
+
+
+
+ + ☆ Evaluation of Active Feature Acquisition Methods for Time-varying + Feature Settings + + +
+ Machine learning methods often assume input features are available at no +cost. However, in domains like healthcare, where acquiring features could be +expensive or harmful, it is necessary to balance a feature's acquisition cost +against its predictive value. The task of training an AI agent to decide which +features to acquire is called active feature acquisition (AFA). By deploying an +AFA agent, we effectively alter the acquisition strategy and trigger a +distribution shift. To safely deploy AFA agents under this distribution shift, +we present the problem of active feature acquisition performance evaluation +(AFAPE). We examine AFAPE under i) a no direct effect (NDE) assumption, stating +that acquisitions don't affect the underlying feature values; and ii) a no +unobserved confounding (NUC) assumption, stating that retrospective feature +acquisition decisions were only based on observed features. We show that one +can apply offline reinforcement learning under the NUC assumption and missing +data methods under the NDE assumption. When NUC and NDE hold, we propose a +novel semi-offline reinforcement learning framework, which requires a weaker +positivity assumption and yields more data-efficient estimators. We introduce +three novel estimators: a direct method (DM), an inverse probability weighting +(IPW), and a double reinforcement learning (DRL) estimator. + +
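+ As a rough, generic illustration of the inverse probability weighting idea mentioned above (the paper's semi-offline estimators are considerably more involved, and the variable names here are invented), an IPW estimate reweights retrospectively observed outcomes by the probability that the logged acquisition pattern agrees with the new AFA policy:
+ import numpy as np
+
+ def ipw_estimate(outcomes, matches, propensities):
+     # matches[i] = 1 if the logged acquisition pattern agrees with the AFA agent,
+     # propensities[i] = probability of that pattern under the retrospective policy
+     return np.mean((matches / propensities) * outcomes)
+
+ rng = np.random.default_rng(0)
+ propensities = rng.uniform(0.2, 0.9, size=1000)
+ matches = rng.binomial(1, propensities)
+ outcomes = rng.normal(1.0, 0.5, size=1000)
+ print(ipw_estimate(outcomes, matches, propensities))  # close to the true mean of 1.0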
+
+ comment: 43 pages, 3 tables, 8 Figures +
+
+
+
+
+ + ☆ T3D: Towards 3D Medical Image Understanding through Vision-Language + Pre-training + + +
+ Expert annotation of 3D medical image for downstream analysis is +resource-intensive, posing challenges in clinical applications. Visual +self-supervised learning (vSSL), though effective for learning visual +invariance, neglects the incorporation of domain knowledge from medicine. To +incorporate medical knowledge into visual representation learning, +vision-language pre-training (VLP) has shown promising results in 2D image. +However, existing VLP approaches become generally impractical when applied to +high-resolution 3D medical images due to GPU hardware constraints and the +potential loss of critical details caused by downsampling, which is the +intuitive solution to hardware constraints. To address the above limitations, +we introduce T3D, the first VLP framework designed for high-resolution 3D +medical images. T3D incorporates two text-informed pretext tasks: +(\lowerromannumeral{1}) text-informed contrastive learning; +(\lowerromannumeral{2}) text-informed image restoration. These tasks focus on +learning 3D visual representations from high-resolution 3D medical images and +integrating clinical knowledge from radiology reports, without distorting +information through forced alignment of downsampled volumes with detailed +anatomical text. Trained on a newly curated large-scale dataset of 3D medical +images and radiology reports, T3D significantly outperforms current vSSL +methods in tasks like organ and tumor segmentation, as well as disease +classification. This underlines T3D's potential in representation learning for +3D medical image analysis. All data and code will be available upon acceptance. + +
+
+
+
+
+ + ☆ SymNoise: Advancing Language Model Fine-tuning with Symmetric Noise + + +
+ In this paper, we introduce a novel fine-tuning technique for language +models, which involves incorporating symmetric noise into the embedding +process. This method aims to enhance the model's function by more stringently +regulating its local curvature, demonstrating superior performance over the +current method, NEFTune. When fine-tuning the LLaMA-2-7B model using Alpaca, +standard techniques yield a 29.79% score on AlpacaEval. However, our approach, +SymNoise, increases this score significantly to 69.04%, using symmetric noisy +embeddings. This is a 6.7% improvement over the state-of-the-art method, +NEFTune~(64.69%). Furthermore, when tested on various models and stronger +baseline instruction datasets, such as Evol-Instruct, ShareGPT, OpenPlatypus, +SymNoise consistently outperforms NEFTune. The current literature, including +NEFTune, has underscored the importance of more in-depth research into the +application of noise-based strategies in the fine-tuning of language models. +Our approach, SymNoise, is another significant step towards this direction, +showing notable improvement over the existing state-of-the-art method. + +
+
+
+
+
+ + ☆ G2D: From Global to Dense Radiography Representation Learning via + Vision-Language Pre-training + + +
+ Recently, medical vision-language pre-training (VLP) has reached substantial +progress to learn global visual representation from medical images and their +paired radiology reports. However, medical imaging tasks in real world usually +require finer granularity in visual features. These tasks include visual +localization tasks (e.g., semantic segmentation, object detection) and visual +grounding task. Yet, current medical VLP methods face challenges in learning +these fine-grained features, as they primarily focus on brute-force alignment +between image patches and individual text tokens for local visual feature +learning, which is suboptimal for downstream dense prediction tasks. In this +work, we propose a new VLP framework, named \textbf{G}lobal to \textbf{D}ense +level representation learning (G2D) that achieves significantly improved +granularity and more accurate grounding for the learned features, compared to +existing medical VLP approaches. In particular, G2D learns dense and +semantically-grounded image representations via a pseudo segmentation task +parallel with the global vision-language alignment. Notably, generating pseudo +segmentation targets does not incur extra trainable parameters: they are +obtained on the fly during VLP with a parameter-free processor. G2D achieves +superior performance across 6 medical imaging tasks and 25 diseases, +particularly in semantic segmentation, which necessitates fine-grained, +semantically-grounded image features. In this task, G2D surpasses peer models +even when fine-tuned with just 1\% of the training data, compared to the 100\% +used by these models. The code will be released upon acceptance. + +
+
+
+
+
+ + ☆ Learn2Extend: Extending sequences by retaining their statistical + properties with mixture models + + +
+ This paper addresses the challenge of extending general finite sequences of +real numbers within a subinterval of the real line, maintaining their inherent +statistical properties by employing machine learning. Our focus lies on +preserving the gap distribution and pair correlation function of these point +sets. Leveraging advancements in deep learning applied to point processes, this +paper explores the use of an auto-regressive \textit{Sequence Extension Mixture +Model} (SEMM) for extending finite sequences, by estimating directly the +conditional density, instead of the intensity function. We perform comparative +experiments on multiple types of point processes, including Poisson, locally +attractive, and locally repelling sequences, and we perform a case study on the +prediction of Riemann $\zeta$ function zeroes. The results indicate that the +proposed mixture model outperforms traditional neural network architectures in +sequence extension with the retention of statistical properties. Given this +motivation, we showcase the capabilities of a mixture model to extend +sequences, maintaining specific statistical properties, i.e. the gap +distribution, and pair correlation indicators. + +
+
+ comment: 17 pages +
+
+
+
+
+ + ☆ Effectively Fine-tune to Improve Large Multimodal Models for Radiology + Report Generation NeurIPS + 2023 + + +
+ Writing radiology reports from medical images requires a high level of domain +expertise. It is time-consuming even for trained radiologists and can be +error-prone for inexperienced radiologists. It would be appealing to automate +this task by leveraging generative AI, which has shown drastic progress in +vision and language understanding. In particular, Large Language Models (LLM) +have demonstrated impressive capabilities recently and continued to set new +state-of-the-art performance on almost all natural language tasks. While many +have proposed architectures to combine vision models with LLMs for multimodal +tasks, few have explored practical fine-tuning strategies. In this work, we +proposed a simple yet effective two-stage fine-tuning protocol to align visual +features to LLM's text embedding space as soft visual prompts. Our framework +with OpenLLaMA-7B achieved state-of-the-art level performance without +domain-specific pretraining. Moreover, we provide detailed analyses of soft +visual prompts and attention mechanisms, shedding light on future research +directions. + +
+
+ comment: Accepted to Deep Generative Models for Health Workshop at NeurIPS + 2023 +
+
+
+
+
+ + ☆ Normed Spaces for Graph Embedding + + +
+ Theoretical results from discrete geometry suggest that normed spaces can abstractly embed finite metric spaces with surprisingly low theoretical bounds on distortion in low dimensions. In this paper, inspired by this theoretical insight, we highlight normed spaces as a more flexible and computationally efficient alternative to several popular Riemannian manifolds for learning graph embeddings. Normed space embeddings significantly outperform several popular manifolds on a large range of synthetic and real-world graph reconstruction benchmark datasets while requiring significantly fewer computational resources. We also empirically verify the superiority of normed space embeddings on growing families of graphs associated with negative, zero, and positive curvature, further reinforcing the flexibility of normed spaces in capturing diverse graph structures as graph sizes increase. Lastly, we demonstrate the utility of normed space embeddings on two applied graph embedding tasks, namely, link prediction and recommender systems. Our work highlights the potential of normed spaces for geometric graph representation learning, raises new research questions, and offers a valuable tool for experimental mathematics in the field of finite metric space embeddings. We make our code and data publicly available.
+
+ comment: 23 pages,7 figures,9 tables | The first two authors contributed + equally +
+
+
+
+
+ + ♻ ☆ An Accurate and Fully-Automated Ensemble Model for Weekly Time Series + Forecasting + + +
+ Many businesses and industries require accurate forecasts for weekly time +series nowadays. However, the forecasting literature does not currently provide +easy-to-use, automatic, reproducible and accurate approaches dedicated to this +task. We propose a forecasting method in this domain to fill this gap, +leveraging state-of-the-art forecasting techniques, such as forecast +combination, meta-learning, and global modelling. We consider different +meta-learning architectures, algorithms, and base model pools. Based on all +considered model variants, we propose to use a stacking approach with lasso +regression which optimally combines the forecasts of four base models: a global +Recurrent Neural Network model (RNN), Theta, Trigonometric Box-Cox ARMA Trend +Seasonal (TBATS) and Dynamic Harmonic Regression ARIMA (DHR-ARIMA), as it shows +the overall best performance across seven experimental weekly datasets on four +evaluation metrics. Our proposed method also consistently outperforms a set of +benchmarks and state-of-the-art weekly forecasting models by a considerable +margin with statistical significance. Our method can produce the most accurate +forecasts, in terms of mean sMAPE, for the M4 weekly dataset among all +benchmarks and all original competition participants. + +
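+ A minimal sketch of the stacking step described above, assuming out-of-sample forecasts from the four base models are already available (the data below are synthetic and the setup is illustrative, not the authors' pipeline):
+ import numpy as np
+ from sklearn.linear_model import LassoCV
+
+ rng = np.random.default_rng(0)
+ actual = rng.normal(100, 10, size=200)               # validation-window actuals
+ base_forecasts = np.column_stack([                   # columns: RNN, Theta, TBATS, DHR-ARIMA
+     actual + rng.normal(0, s, size=200) for s in (3, 5, 7, 9)
+ ])
+
+ stacker = LassoCV(cv=5).fit(base_forecasts, actual)  # lasso meta-learner combining base forecasts
+ print("combination weights:", stacker.coef_)
+ combined_forecast = stacker.predict(base_forecasts[-1:])  # combine a new vector of base forecasts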
+
+ comment: 1 figure, 9 tables +
+
+
+
+
+ + ♻ ☆ Achieving the Minimax Optimal Sample Complexity of Offline Reinforcement + Learning: A DRO-Based Approach + + +
+ Offline reinforcement learning aims to learn from pre-collected datasets +without active exploration. This problem faces significant challenges, +including limited data availability and distributional shifts. Existing +approaches adopt a pessimistic stance towards uncertainty by penalizing rewards +of under-explored state-action pairs to estimate value functions +conservatively. In this paper, we show that the distributionally robust +optimization (DRO) based approach can also address these challenges and is +minimax optimal. Specifically, we directly model the uncertainty in the +transition kernel and construct an uncertainty set of statistically plausible +transition kernels. We then find the policy that optimizes the worst-case +performance over this uncertainty set. We first design a metric-based +Hoeffding-style uncertainty set such that with high probability the true +transition kernel is in this set. We prove that to achieve a sub-optimality gap +of $\epsilon$, the sample complexity is +$\mathcal{O}(S^2C^{\pi^*}\epsilon^{-2}(1-\gamma)^{-4})$, where $\gamma$ is the +discount factor, $S$ is the number of states, and $C^{\pi^*}$ is the +single-policy clipped concentrability coefficient which quantifies the +distribution shift. To achieve the optimal sample complexity, we further +propose a less conservative Bernstein-style uncertainty set, which, however, +does not necessarily include the true transition kernel. We show that an +improved sample complexity of +$\mathcal{O}(SC^{\pi^*}\epsilon^{-2}(1-\gamma)^{-3})$ can be obtained, which +matches with the minimax lower bound for offline reinforcement learning, and +thus is minimax optimal. + +
+
+
+
+
+ + ♻ ☆ Tree of Thoughts: Deliberate Problem Solving with Large Language Models NeurIPS 2023 + + +
+ Language models are increasingly being deployed for general problem solving +across a wide range of tasks, but are still confined to token-level, +left-to-right decision-making processes during inference. This means they can +fall short in tasks that require exploration, strategic lookahead, or where +initial decisions play a pivotal role. To surmount these challenges, we +introduce a new framework for language model inference, Tree of Thoughts (ToT), +which generalizes over the popular Chain of Thought approach to prompting +language models, and enables exploration over coherent units of text (thoughts) +that serve as intermediate steps toward problem solving. ToT allows LMs to +perform deliberate decision making by considering multiple different reasoning +paths and self-evaluating choices to decide the next course of action, as well +as looking ahead or backtracking when necessary to make global choices. Our +experiments show that ToT significantly enhances language models' +problem-solving abilities on three novel tasks requiring non-trivial planning +or search: Game of 24, Creative Writing, and Mini Crosswords. For instance, in +Game of 24, while GPT-4 with chain-of-thought prompting only solved 4% of +tasks, our method achieved a success rate of 74%. Code repo with all prompts: +https://github.com/princeton-nlp/tree-of-thought-llm. + +
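+ A minimal breadth-first sketch of the search loop described above (illustrative only; the official implementation in the linked repository differs, and the proposer and value functions below are placeholders for language-model calls):
+ def tree_of_thoughts_bfs(problem, propose, score, depth=3, beam=5):
+     # propose(state): candidate next thoughts from a language model
+     # score(state):   LM self-evaluation of a partial solution
+     frontier = [problem]
+     for _ in range(depth):
+         candidates = [s + "\n" + t for s in frontier for t in propose(s)]
+         frontier = sorted(candidates, key=score, reverse=True)[:beam]
+     return max(frontier, key=score)
+
+ propose = lambda state: [f"try option {i}" for i in range(3)]  # stand-in for an LM proposer
+ score = lambda state: -len(state)                              # stand-in value heuristic
+ print(tree_of_thoughts_bfs("use 4 9 10 13 to reach 24", propose, score))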
+
+ comment: NeurIPS 2023 camera ready version. Code repo with all prompts: + https://github.com/princeton-nlp/tree-of-thought-llm +
+
+
+
+
+ + ♻ ☆ What does a platypus look like? Generating customized prompts for + zero-shot image classification ICCV 2023 + + +
+ Open-vocabulary models are a promising new paradigm for image classification. +Unlike traditional classification models, open-vocabulary models classify among +any arbitrary set of categories specified with natural language during +inference. This natural language, called "prompts", typically consists of a set +of hand-written templates (e.g., "a photo of a {}") which are completed with +each of the category names. This work introduces a simple method to generate +higher accuracy prompts, without relying on any explicit knowledge of the task +domain and with far fewer hand-constructed sentences. To achieve this, we +combine open-vocabulary models with large language models (LLMs) to create +Customized Prompts via Language models (CuPL, pronounced "couple"). In +particular, we leverage the knowledge contained in LLMs in order to generate +many descriptive sentences that contain important discriminating +characteristics of the image categories. This allows the model to place a +greater importance on these regions in the image when making predictions. We +find that this straightforward and general approach improves accuracy on a +range of zero-shot image classification benchmarks, including over one +percentage point gain on ImageNet. Finally, this simple baseline requires no +additional training and remains completely zero-shot. Code available at +https://github.com/sarahpratt/CuPL. + +
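+ A rough sketch of the core step, assuming the descriptive sentences have already been generated by an LLM (the prompts below are invented examples and the mean-pooling choice is an assumption, not the paper's exact recipe):
+ import torch
+ from transformers import CLIPModel, CLIPProcessor
+
+ model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ # hypothetical LLM-generated descriptions for one category ("platypus")
+ prompts = [
+     "a photo of a platypus, a duck-billed, beaver-tailed mammal",
+     "a small brown aquatic mammal with a flat bill swimming in a river",
+ ]
+ with torch.no_grad():
+     inputs = processor(text=prompts, return_tensors="pt", padding=True)
+     feats = model.get_text_features(**inputs)
+     feats = feats / feats.norm(dim=-1, keepdim=True)
+     class_embedding = feats.mean(dim=0)  # averaged prompt embedding used to score images for this class
+ print(class_embedding.shape)  # torch.Size([512])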
+
+ comment: ICCV 2023 +
+
+
+
+
+ + ♻ ☆ Prefix-Tree Decoding for Predicting Mass Spectra from Molecules + + +
+ Computational predictions of mass spectra from molecules have enabled the +discovery of clinically relevant metabolites. However, such predictive tools +are still limited as they occupy one of two extremes, either operating (a) by +fragmenting molecules combinatorially with overly rigid constraints on +potential rearrangements and poor time complexity or (b) by decoding lossy and +nonphysical discretized spectra vectors. In this work, we use a new +intermediate strategy for predicting mass spectra from molecules by treating +mass spectra as sets of molecular formulae, which are themselves multisets of +atoms. After first encoding an input molecular graph, we decode a set of +molecular subformulae, each of which specify a predicted peak in the mass +spectrum, the intensities of which are predicted by a second model. Our key +insight is to overcome the combinatorial possibilities for molecular +subformulae by decoding the formula set using a prefix tree structure, +atom-type by atom-type, representing a general method for ordered multiset +decoding. We show promising empirical results on mass spectra prediction tasks. + +
+
+
+
+
+ + ♻ ☆ Distributional Model Equivalence for Risk-Sensitive Reinforcement + Learning + + +
+ We consider the problem of learning models for risk-sensitive reinforcement +learning. We theoretically demonstrate that proper value equivalence, a method +of learning models which can be used to plan optimally in the risk-neutral +setting, is not sufficient to plan optimally in the risk-sensitive setting. We +leverage distributional reinforcement learning to introduce two new notions of +model equivalence, one which is general and can be used to plan for any risk +measure, but is intractable; and a practical variation which allows one to +choose which risk measures they may plan optimally for. We demonstrate how our +framework can be used to augment any model-free risk-sensitive algorithm, and +provide both tabular and large-scale experiments to demonstrate its ability. + +
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ☆ Two-stage dynamic creative optimization under sparse ambiguous samples + for e-commerce advertising + + +
+ Ad creatives are one of the main media for e-commerce advertising. We decouple dynamic creative optimization into two stages, a cascaded structure that can trade off effectiveness against efficiency. In the first stage, we train an automatic creative optimization architecture based on autoco to simulate complex interactions between creative elements. Although this yields a ranking of different creatives under a SKU, bucketing and merging historical data by period obscures the CTR diversity of the same ad creative across different days and weakens the ability to separate ambiguous samples. We therefore propose a transformer-based rerank model. With the help of the rank model, we propose a distillation method that learns the relative order of creatives and extracts this ranking knowledge to guide the rerank learning. Soft labels for the creative order under each SKU are generated by the rank model, alleviating the dilemma that a large number of under-represented creatives cannot obtain real labels. Through the knowledge diffusion of the rerank model, ambiguous samples are associated with the positive and negative samples. The rerank model and autoco are cascaded to output the estimated value of the synthetic ad image. In the second stage, we design a bandit model that selects one of the first stage's output ads for timely delivery. Experimental results show that our method outperforms competing baselines in terms of sCTR, and online A/B testing shows that it improves CTR by 10% over the baseline.
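+ As a hedged illustration of the second-stage bandit (the abstract does not specify the exact algorithm; the epsilon-greedy rule and the statistics below are assumptions made purely for illustration):
+ import random
+
+ def select_creative(stats, epsilon=0.1):
+     # stats: creative_id -> (clicks, impressions); explore with probability epsilon,
+     # otherwise deliver the creative with the best observed CTR.
+     if random.random() < epsilon:
+         return random.choice(list(stats))
+     return max(stats, key=lambda c: stats[c][0] / max(stats[c][1], 1))
+
+ stats = {"creative_a": (12, 400), "creative_b": (30, 500), "creative_c": (3, 50)}
+ print(select_creative(stats))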
+
+
+
+
+ + ♻ ☆ ImagenHub: Standardizing the evaluation of conditional image generation + models + + +
+ Recently, a myriad of conditional image generation and editing models have been developed to serve different downstream tasks, including text-to-image generation, text-guided image editing, subject-driven image generation, control-guided image generation, etc. However, we observe huge inconsistencies in experimental conditions (datasets, inference, and evaluation metrics) that render fair comparisons difficult. This paper proposes ImagenHub, a one-stop library to standardize the inference and evaluation of all the conditional image generation models. Firstly, we define seven prominent tasks and curate high-quality evaluation datasets for them. Secondly, we build a unified inference pipeline to ensure fair comparison. Thirdly, we design two human evaluation scores, i.e., Semantic Consistency and Perceptual Quality, along with comprehensive guidelines to evaluate generated images. We train expert raters to evaluate the model outputs based on the proposed metrics. Our human evaluation achieves a high inter-worker agreement, with a Krippendorff's alpha above 0.4 for 76% of models. We comprehensively evaluated a total of around 30 models and observed three key takeaways: (1) the existing models' performance is generally unsatisfying except for Text-guided Image Generation and Subject-driven Image Generation, with 74% of models achieving an overall score lower than 0.5. (2) we examined the claims from published papers and found 83% of them hold, with a few exceptions. (3) None of the existing automatic metrics has a Spearman's correlation higher than 0.2, except for subject-driven image generation. Moving forward, we will continue our efforts to evaluate newly published models and update our leaderboard to keep track of the progress in conditional image generation.
+
+
+
+
+ + ♻ ☆ DiffMesh: A Motion-aware Diffusion-like Framework for Human Mesh + Recovery from Videos + + +
+ Human mesh recovery (HMR) provides rich human body information for various +real-world applications. While image-based HMR methods have achieved impressive +results, they often struggle to recover humans in dynamic scenarios, leading to +temporal inconsistencies and non-smooth 3D motion predictions due to the +absence of human motion. In contrast, video-based approaches leverage temporal +information to mitigate this issue. In this paper, we present DiffMesh, an +innovative motion-aware Diffusion-like framework for video-based HMR. DiffMesh +establishes a bridge between diffusion models and human motion, efficiently +generating accurate and smooth output mesh sequences by incorporating human +motion within the forward process and reverse process in the diffusion model. +Extensive experiments are conducted on the widely used datasets (Human3.6M +\cite{h36m_pami} and 3DPW \cite{pw3d2018}), which demonstrate the effectiveness +and efficiency of our DiffMesh. Visual comparisons in real-world scenarios +further highlight DiffMesh's suitability for practical applications. + +
+
+
+
+
+ + ♻ ☆ RS5M: A Large Scale Vision-Language Dataset for Remote Sensing + Vision-Language Foundation Model + + +
+ Pre-trained Vision-Language Models (VLMs) utilizing extensive image-text paired data have demonstrated unprecedented image-text association capabilities, achieving remarkable results across various downstream tasks. A critical challenge is how to make use of existing large-scale pre-trained VLMs, which are trained on common objects, to perform the domain-specific transfer for accomplishing domain-related downstream tasks. In this paper, we propose a new framework that includes the Domain pre-trained Vision-Language Model (DVLM), bridging the gap between the General Vision-Language Model (GVLM) and domain-specific downstream tasks. Moreover, we present an image-text paired dataset in the field of remote sensing (RS), RS5M, which has 5 million RS images with English descriptions. The dataset is obtained from filtering publicly available image-text paired datasets and captioning label-only RS datasets with a pre-trained VLM. The result is the first large-scale RS image-text paired dataset. Additionally, we fine-tuned the CLIP model and tried several Parameter-Efficient Fine-Tuning methods on RS5M to implement the DVLM. Experimental results show that our proposed dataset is highly effective for various tasks, and our model GeoRSCLIP improves upon the baseline or previous state-of-the-art model by $3\%\sim20\%$ in Zero-shot Classification (ZSC), $3\%\sim6\%$ in Remote Sensing Cross-Modal Text-Image Retrieval (RSCTIR) and $4\%\sim5\%$ in Semantic Localization (SeLo) tasks. Dataset and models have been released at: \url{https://github.com/om-ai-lab/RS5M}.
+
+ comment: RS5M dataset v5 +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 40 + +
+
+
+ + ☆ Challenges and Applications of Automated Extraction of Socio-political + Events from Text (CASE 2023): Workshop and Shared Task Report + + +
+ We provide a summary of the sixth edition of the CASE workshop, held in the scope of RANLP 2023. The workshop consists of regular papers, three keynotes, working papers of shared task participants, and shared task overview papers. This workshop series has been bringing together all aspects of event information collection across technical and social science fields. In addition to contributing to the progress in text-based event extraction, the workshop provides a space for the organization of a multimodal event information collection task.
+
+ comment: https://aclanthology.org/2023.case-1.22 +
+
+
+
+
+ + ☆ UCE-FID: Using Large Unlabeled, Medium Crowdsourced-Labeled, and Small + Expert-Labeled Tweets for Foodborne Illness Detection + + +
+ Foodborne illnesses significantly impact public health. Deep learning +surveillance applications using social media data aim to detect early warning +signals. However, labeling foodborne illness-related tweets for model training +requires extensive human resources, making it challenging to collect a +sufficient number of high-quality labels for tweets within a limited budget. +The severe class imbalance resulting from the scarcity of foodborne +illness-related tweets among the vast volume of social media further +exacerbates the problem. Classifiers trained on a class-imbalanced dataset are +biased towards the majority class, making accurate detection difficult. To +overcome these challenges, we propose EGAL, a deep learning framework for +foodborne illness detection that uses small expert-labeled tweets augmented by +crowdsourced-labeled and massive unlabeled data. Specifically, by leveraging +tweets labeled by experts as a reward set, EGAL learns to assign a weight of +zero to incorrectly labeled tweets to mitigate their negative influence. Other +tweets receive proportionate weights to counter-balance the unbalanced class +distribution. Extensive experiments on real-world \textit{TWEET-FID} data show +that EGAL outperforms strong baseline models across different settings, +including varying expert-labeled set sizes and class imbalance ratios. A case +study on a multistate outbreak of Salmonella Typhimurium infection linked to +packaged salad greens demonstrates how the trained model captures relevant +tweets offering valuable outbreak insights. EGAL, funded by the U.S. Department +of Agriculture (USDA), has the potential to be deployed for real-time analysis +of tweet streaming, contributing to foodborne illness outbreak surveillance +efforts. + +
+
+ comment: 2023 IEEE International Conference on Big Data (BigData) +
+
+
+
+
+ + ☆ Enabling Quantum Natural Language Processing for Hindi Language + + +
+ Quantum Natural Language Processing (QNLP) is taking huge leaps in solving +the shortcomings of classical Natural Language Processing (NLP) techniques and +moving towards a more "Explainable" NLP system. The current literature around +QNLP focuses primarily on implementing QNLP techniques in sentences in the +English language. In this paper, we propose to enable the QNLP approach to +HINDI, which is the third most spoken language in South Asia. We present the +process of building the parameterized quantum circuits required to undertake +QNLP on Hindi sentences. We use the pregroup representation of Hindi and the +DisCoCat framework to draw sentence diagrams. Later, we translate these +diagrams to Parameterised Quantum Circuits based on Instantaneous Quantum +Polynomial (IQP) style ansatz. Using these parameterized quantum circuits +allows one to train grammar and topic-aware sentence classifiers for the Hindi +Language. + +
+
+ comment: 7 Pages +
+
+
+
+
+ + ☆ Understanding Opinions Towards Climate Change on Social Media + + +
+ Social media platforms such as Twitter (now known as X) have revolutionized how the public engage with important societal and political topics. Recently, climate change discussions on social media became a catalyst for political polarization and the spreading of misinformation. In this work, we aim to understand how real-world events influence the opinions of individuals towards climate change-related topics on social media. To this end, we extracted and analyzed a dataset of 13.6 million tweets sent by 3.6 million users from 2006 to 2019. Then, we construct a temporal graph from the user-user mentions network and utilize the Louvain community detection algorithm to analyze the changes in community structure around Conference of the Parties on Climate Change (COP) events. Next, we apply tools from the Natural Language Processing literature to perform sentiment analysis and topic modeling on the tweets. Our work acts as a first step towards understanding the evolution of pro-climate change communities around COP events. Answering these questions helps us understand how to raise people's awareness of climate change, hopefully encouraging more individuals to join the collaborative effort to slow down climate change.
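+ A minimal sketch of the community-detection step on a toy mention network (user names and edge weights are invented; the study's graph is temporal and far larger):
+ import networkx as nx
+
+ G = nx.Graph()
+ G.add_weighted_edges_from([           # user-user mention counts
+     ("alice", "bob", 5), ("bob", "carol", 3), ("alice", "carol", 2),
+     ("dave", "erin", 4), ("erin", "frank", 6), ("dave", "frank", 1),
+     ("carol", "dave", 1),              # weak bridge between two groups
+ ])
+ communities = nx.community.louvain_communities(G, weight="weight", seed=0)
+ print(communities)  # e.g. [{'alice', 'bob', 'carol'}, {'dave', 'erin', 'frank'}]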
+
+
+
+
+ + ☆ From Voices to Validity: Leveraging Large Language Models (LLMs) for + Textual Analysis of Policy Stakeholder Interviews + + +
+ Obtaining stakeholders' diverse experiences and opinions about current policy +in a timely manner is crucial for policymakers to identify strengths and gaps +in resource allocation, thereby supporting effective policy design and +implementation. However, manually coding even moderately sized interview texts +or open-ended survey responses from stakeholders can often be labor-intensive +and time-consuming. This study explores the integration of Large Language +Models (LLMs)--like GPT-4--with human expertise to enhance text analysis of +stakeholder interviews regarding K-12 education policy within one U.S. state. +Employing a mixed-methods approach, human experts developed a codebook and +coding processes as informed by domain knowledge and unsupervised topic +modeling results. They then designed prompts to guide GPT-4 analysis and +iteratively evaluate different prompts' performances. This combined +human-computer method enabled nuanced thematic and sentiment analysis. Results +reveal that while GPT-4 thematic coding aligned with human coding by 77.89% at +specific themes, expanding to broader themes increased congruence to 96.02%, +surpassing traditional Natural Language Processing (NLP) methods by over 25%. +Additionally, GPT-4 is more closely matched to expert sentiment analysis than +lexicon-based methods. Findings from quantitative measures and qualitative +reviews underscore the complementary roles of human domain expertise and +automated analysis as LLMs offer new perspectives and coding consistency. The +human-computer interactive approach enhances efficiency, validity, and +interpretability of educational policy research. + +
+
+
+
+
+ + ☆ A ripple in time: a discontinuity in American history + + +
+ In this note we use the State of the Union Address dataset from Kaggle to +make some surprising (and some not so surprising) observations pertaining to +the general timeline of American history, and the character and nature of the +addresses themselves. Our main approach is using vector embeddings, such as +BERT (DistilBERT) and GPT-2. While it is widely believed that BERT (and its +variations) is most suitable for NLP classification tasks, we find out that +GPT-2 in conjunction with nonlinear dimension reduction methods such as UMAP +provide better separation and stronger clustering. This makes GPT-2 + UMAP an +interesting alternative. In our case, no model fine-tuning is required, and the +pre-trained out-of-the-box GPT-2 model is enough. We also used a fine-tuned +DistilBERT model for classification (detecting which president delivered which +address), with very good results (accuracy 93% - 95% depending on the run). All +computations can be replicated by using the accompanying code on GitHub. + +
+
+ comment: 7 pages, 8 figures; GitHub repository + https://github.com/sashakolpakov/ripple_in_time +
+
+
+
+
+ + ☆ Here Is Not There: Measuring Entailment-Based Trajectory Similarity for + Location-Privacy Protection and Beyond + + +
+ While the paths humans take play out in social as well as physical space, +measures to describe and compare their trajectories are carried out in +abstract, typically Euclidean, space. When these measures are applied to +trajectories of actual individuals in an application area, alterations that are +inconsequential in abstract space may suddenly become problematic once overlaid +with geographic reality. In this work, we present a different view on +trajectory similarity by introducing a measure that utilizes logical +entailment. This is an inferential perspective that considers facts as triple +statements deduced from the social and environmental context in which the +travel takes place, and their practical implications. We suggest a +formalization of entailment-based trajectory similarity, measured as the +overlapping proportion of facts, which are spatial relation statements in our +case study. With the proposed measure, we evaluate LSTM-TrajGAN, a +privacy-preserving trajectory-generation model. The entailment-based model +evaluation reveals potential consequences of disregarding the rich structure of +geographic space (e.g., miscalculated insurance risk due to regional shifts in +our toy example). Our work highlights the advantage of applying logical +entailment to trajectory-similarity reasoning for location-privacy protection +and beyond. + +
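+ A toy formalisation of the "overlapping proportion of facts" measure, with invented spatial-relation triples (the paper's fact extraction and overlap definition are richer than this sketch):
+ def entailment_similarity(facts_a, facts_b):
+     # proportion of entailed facts (triple statements) shared by two trajectories
+     a, b = set(facts_a), set(facts_b)
+     return len(a & b) / len(a | b) if a | b else 1.0
+
+ original = {("traj", "passes_through", "hospital_district"),
+             ("traj", "starts_in", "residential_zone"),
+             ("traj", "crosses", "river")}
+ synthetic = {("traj", "passes_through", "hospital_district"),
+              ("traj", "starts_in", "commercial_zone"),
+              ("traj", "crosses", "river")}
+ print(entailment_similarity(original, synthetic))  # 0.5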
+
+
+
+
+ + ☆ Towards leveraging LLMs for Conditional QA + + +
+ This study delves into the capabilities and limitations of Large Language +Models (LLMs) in the challenging domain of conditional question-answering. +Utilizing the Conditional Question Answering (CQA) dataset and focusing on +generative models like T5 and UL2, we assess the performance of LLMs across +diverse question types. Our findings reveal that fine-tuned LLMs can surpass +the state-of-the-art (SOTA) performance in some cases, even without fully +encoding all input context, with an increase of 7-8 points in Exact Match (EM) +and F1 scores for Yes/No questions. However, these models encounter challenges +in extractive question answering, where they lag behind the SOTA by over 10 +points, and in mitigating the risk of injecting false information. A study with +oracle-retrievers emphasizes the critical role of effective evidence retrieval, +underscoring the necessity for advanced solutions in this area. Furthermore, we +highlight the significant influence of evaluation metrics on performance +assessments and advocate for a more comprehensive evaluation framework. The +complexity of the task, the observed performance discrepancies, and the need +for effective evidence retrieval underline the ongoing challenges in this field +and underscore the need for future work focusing on refining training tasks and +exploring prompt-based techniques to enhance LLM performance in conditional +question-answering tasks. + +
+
+
+
+
+ + ☆ TURead: An eye movement dataset of Turkish reading + + +
+ In this study, we present TURead, an eye movement dataset of silent and oral +sentence reading in Turkish, an agglutinative language with a shallow +orthography understudied in reading research. TURead provides empirical data to +investigate the relationship between morphology and oculomotor control. We +employ a target-word approach in which target words are manipulated by word +length and by the addition of two commonly used suffixes in Turkish. The +dataset contains well-established eye movement variables; prelexical +characteristics such as vowel harmony and bigram-trigram frequencies and word +features, such as word length, predictability, frequency, eye voice span +measures, Cloze test scores of the root word and suffix predictabilities, as +well as the scores obtained from two working memory tests. Our findings on +fixation parameters and word characteristics are in line with the patterns +reported in the relevant literature. + +
+
+ comment: 24 pages, 3 figures, 23 tables, Behavior Research Methods (2023) +
+
+
+
+
+ + ☆ Self Generated Wargame AI: Double Layer Agent Task Planning Based on + Large Language Model + + +
+ Large language models, represented by ChatGPT, have had a disruptive impact on the field of artificial intelligence, but they have mainly been applied to natural language processing, speech recognition, machine learning, and natural-language understanding. This paper applies the large language model to the field of intelligent decision-making: it places the large language model at the decision-making center and constructs an agent architecture with the large language model as its core. On this basis, it further proposes a two-layer agent task planning scheme that issues and executes decision commands through natural-language interaction, and verifies it in a wargame simulation environment. Game-confrontation simulation experiments show that the intelligent decision-making ability of the large language model is significantly stronger than that of commonly used reinforcement learning AI and rule-based AI, with better intelligence, understandability, and generalization. The experiments also show that the intelligence of the large language model is closely related to the prompt. This work extends large language models from human-computer interaction to the field of intelligent decision-making, which has important reference value and significance for the development of intelligent decision-making.
+
+
+
+
+ + ☆ Prompted Zero-Shot Multi-label Classification of Factual Incorrectness + in Machine-Generated Summaries + + +
+ This study addresses the critical issue of factual inaccuracies in +machine-generated text summaries, an increasingly prevalent issue in +information dissemination. Recognizing the potential of such errors to +compromise information reliability, we investigate the nature of factual +inconsistencies across machine-summarized content. We introduce a prompt-based +classification system that categorizes errors into four distinct types: +misrepresentation, inaccurate quantities or measurements, false attribution, +and fabrication. The participants are tasked with evaluating a corpus of +machine-generated summaries against their original articles. Our methodology +employs qualitative judgements to identify the occurrence of factual +distortions. The results show that our prompt-based approaches are able to +detect the type of errors in the summaries to some extent, although there is +scope for improvement in our classification systems. + +
+
+
+
+
+ + ☆ On the Effects of Randomness on Stability of Learning with Limited + Labelled Data: A Systematic Literature Review + + +
+ Learning with limited labelled data, such as few-shot learning, meta-learning or transfer learning, aims to effectively train a model using only a small amount of labelled samples. However, these approaches were observed to be excessively sensitive to the effects of uncontrolled randomness caused by non-determinism in the training process. The randomness negatively affects the stability of the models, leading to large variance in results across training runs. When such instability is disregarded, it can unintentionally, but unfortunately also intentionally, create an illusory impression of research progress. Recently, this area has started to attract research attention, and the number of relevant studies is growing continuously. In this survey, we provide a comprehensive overview of 134 papers addressing the effects of randomness on the stability of learning with limited labelled data. We distinguish between four main tasks addressed in the papers (investigate/evaluate; determine; mitigate; benchmark/compare/report randomness effects), providing findings for each one. Furthermore, we identify and discuss seven challenges and open problems together with possible directions to facilitate further research. The ultimate goal of this survey is to emphasise the importance of this growing research area, which so far has not received an appropriate level of attention.
+
+
+
+
+ + ☆ RLHF and IIA: Perverse Incentives + + +
+ Existing algorithms for reinforcement learning from human feedback (RLHF) can +incentivize responses at odds with preferences because they are based on models +that assume independence of irrelevant alternatives (IIA). The perverse +incentives induced by IIA give rise to egregious behavior when innovating on +query formats or learning algorithms. + +
+
+
+
+
+ + ☆ Exploring and Improving the Spatial Reasoning Abilities of Large + Language Models NeurIPS 2023 + + +
+ Large Language Models (LLMs) represent formidable tools for sequence +modeling, boasting an innate capacity for general pattern recognition. +Nevertheless, their broader spatial reasoning capabilities, especially applied +to numerical trajectory data, remain insufficiently explored. In this paper, we +investigate the out-of-the-box performance of ChatGPT-3.5, ChatGPT-4 and Llama +2 7B models when confronted with 3D robotic trajectory data from the CALVIN +baseline and associated tasks, including 2D directional and shape labeling. +Additionally, we introduce a novel prefix-based prompting mechanism, which +yields a 33% improvement on the 3D trajectory data and an increase of up to 10% +on SpartQA tasks over zero-shot prompting (with gains for other prompting types +as well). The experimentation with 3D trajectory data offers an intriguing +glimpse into the manner in which LLMs engage with numerical and spatial +information, thus laying a solid foundation for the identification of target +areas for future enhancements. + +
+
+ comment: Published in NeurIPS 2023 Workshop on Instruction Tuning and + Instruction Following +
+
+
+
+
+ + ☆ End-to-End Speech-to-Text Translation: A Survey + + +
+ Speech-to-text translation pertains to the task of converting speech signals +in a language to text in another language. It finds its application in various +domains, such as hands-free communication, dictation, video lecture +transcription, and translation, to name a few. Automatic Speech Recognition +(ASR), as well as Machine Translation(MT) models, play crucial roles in +traditional ST translation, enabling the conversion of spoken language in its +original form to written text and facilitating seamless cross-lingual +communication. ASR recognizes spoken words, while MT translates the transcribed +text into the target language. Such disintegrated models suffer from cascaded +error propagation and high resource and training costs. As a result, +researchers have been exploring end-to-end (E2E) models for ST translation. +However, to our knowledge, there is no comprehensive review of existing works +on E2E ST. The present survey, therefore, discusses the work in this direction. +Our attempt has been to provide a comprehensive review of models employed, +metrics, and datasets used for ST tasks, providing challenges and future +research direction with new insights. We believe this review will be helpful to +researchers working on various applications of ST models. + +
+
+ comment: 45 pages +
+
+
+
+
+ + ☆ Structured, Complex and Time-complete Temporal Event Forecasting + + +
+ Temporal event forecasting aims to predict what will happen next given the +observed events in history. Previous formulations of temporal event are +unstructured, atomic, or lacking full temporal information, thus largely +restricting the representation quality and forecasting ability of temporal +events. To address these limitations, we introduce a novel formulation for +Structured, Complex, and Time-complete Temporal Event (SCTc-TE). Based on this +new formulation, we develop a simple and fully automated pipeline for +constructing such SCTc-TEs from a large amount of news articles. Furthermore, +we propose a novel model that leverages both Local and Global contexts for +SCTc-TE forecasting, named LoGo. To evaluate our model, we construct two +large-scale datasets named MidEast-TE and GDELT-TE. Extensive evaluations +demonstrate the advantages of our datasets in multiple aspects, while +experimental results justify the effectiveness of our forecasting model LoGo. +We release the code and dataset via +https://github.com/yecchen/GDELT-ComplexEvent. + +
+
+
+
+
+ + ☆ Detection and Analysis of Stress-Related Posts in Reddit Academic Communities
+ Nowadays, the significance of monitoring stress levels and recognizing early signs of mental illness cannot be overstated. Automatic stress detection in text can proactively help manage stress and protect mental well-being. In today's digital era, social media platforms reflect the psychological well-being and stress levels within various communities. This study focuses on detecting and analyzing stress-related posts in Reddit academic communities. Due to online education and remote work, these communities have become central for academic discussions and support. We classify text as stressed or not using natural language processing and machine learning classifiers, with Dreaddit as our training dataset, which contains labeled data from Reddit. Next, we collect and analyze posts from various academic subreddits. We identify that the most effective individual feature for stress detection is the Bag of Words, paired with the Logistic Regression classifier, achieving a 77.78% accuracy rate and an F1 score of 0.79 on the Dreaddit dataset. This combination also performs best in stress detection on human-annotated datasets, with a 72% accuracy rate. Our key findings reveal that posts and comments in professors' Reddit communities show the highest levels of stress, compared to other academic levels, including bachelor's, graduate, and Ph.D. students. This research contributes to our understanding of the stress levels within academic communities. It can help academic institutions and online communities develop measures and interventions to address this issue effectively.
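+ A minimal sketch of the best-performing feature/classifier combination reported above (Bag of Words with Logistic Regression); the example posts and labels are invented, and Dreaddit would be used for training in practice:
+ from sklearn.pipeline import make_pipeline
+ from sklearn.feature_extraction.text import CountVectorizer
+ from sklearn.linear_model import LogisticRegression
+
+ posts = ["I am so overwhelmed by deadlines and grading",
+          "Had a relaxing walk and a great seminar today",
+          "Cannot sleep, the upcoming review is crushing me",
+          "Enjoyed mentoring my students this afternoon"]
+ labels = [1, 0, 1, 0]  # 1 = stressed, 0 = not stressed
+
+ model = make_pipeline(CountVectorizer(), LogisticRegression(max_iter=1000))
+ model.fit(posts, labels)
+ print(model.predict(["exam stress is unbearable this week"]))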
+
+ comment: This work has been submitted to the IEEE for possible publication. + Copyright may be transferred without notice, after which this version may no + longer be accessible +
+
+
+
+
+ + ☆ Large Language Models Are Zero-Shot Text Classifiers + + +
+ Pretrained large language models (LLMs) have become extensively used across various sub-disciplines of natural language processing (NLP). In NLP, text classification problems have garnered considerable focus, but they still face limitations related to expensive computational cost, time consumption, and robust performance on unseen classes. With the proposal of chain-of-thought prompting (CoT), LLMs can be applied in a zero-shot learning (ZSL) setting with step-by-step reasoning prompts instead of conventional question-and-answer formats. Zero-shot LLMs can alleviate these limitations in text classification problems by directly utilizing pretrained models to predict both seen and unseen classes. Our research primarily validates the capability of GPT models in text classification. We focus on effectively utilizing prompt strategies in various text classification scenarios. In addition, we compare the performance of zero-shot LLMs with other state-of-the-art text classification methods, including traditional machine learning methods, deep learning methods, and ZSL methods. Experimental results demonstrate that LLMs are effective zero-shot text classifiers on three of the four datasets analyzed. This proficiency is especially advantageous for small businesses or teams that may not have extensive expertise in text classification.
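+ A hedged sketch of a step-by-step zero-shot classification prompt of the kind discussed above (the wording is illustrative, not the paper's exact prompt strategy):
+ def zero_shot_cot_prompt(text, labels):
+     # Build an illustrative chain-of-thought style zero-shot classification prompt.
+     label_list = ", ".join(labels)
+     return (f"Classify the following text into one of: {label_list}.\n"
+             f"Text: {text}\n"
+             "Let's think step by step about which label fits best, "
+             "then answer with only the label on the final line.")
+
+ print(zero_shot_cot_prompt(
+     "The battery drains within two hours of normal use.",
+     ["positive", "negative", "neutral"],
+ ))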
+
+ comment: 9 pages, 3 figures, 6 tables +
+
+
+
+
+ + ☆ From Beginner to Expert: Modeling Medical Knowledge into General LLMs + + +
+ Recently, large language model (LLM) based artificial intelligence (AI) systems have demonstrated remarkable capabilities in natural language understanding and generation. However, these models face a significant challenge when it comes to sensitive applications, such as reasoning over medical knowledge and answering medical questions in a physician-like manner. Prior studies attempted to overcome this challenge by increasing the model size (>100B) to learn more general medical knowledge, while there is still room for improvement in LLMs with smaller-scale model sizes (<100B). In this work, we start from a pre-trained general LLM (AntGLM-10B) and fine-tune it from a medical beginner towards a medical expert (called AntGLM-Med-10B), using a 3-stage optimization procedure, i.e., general medical knowledge injection, medical domain instruction tuning, and specific medical task adaptation. Our contributions are threefold: (1) We specifically investigate how to adapt a pre-trained general LLM to the medical domain, especially for a specific medical task. (2) We collect and construct large-scale medical datasets for each stage of the optimization process. These datasets encompass various data types and tasks, such as question-answering, medical reasoning, multi-choice questions, and medical conversations. (3) Specifically for multi-choice questions in the medical domain, we propose a novel Verification-of-Choice approach for prompt engineering, which significantly enhances the reasoning ability of LLMs. Remarkably, by combining the above approaches, our AntGLM-Med-10B model can outperform most LLMs on PubMedQA, including both general and medical LLMs, even when these LLMs have larger model sizes.
+
+ comment: Developed by Ant Group for PubMedQA leaderboard +
+
+
+
+
+ + ☆ Eliciting Latent Knowledge from Quirky Language Models + + +
+ Eliciting Latent Knowledge (ELK) aims to find patterns in a neural network's +activations which robustly track the true state of the world, even when the +network's overt output is false or misleading. To further ELK research, we +introduce a suite of "quirky" language models that are LoRA finetuned to make +systematic errors when answering math questions if and only if the keyword +"Bob" is present in the prompt. We demonstrate that simple probing methods can +elicit the model's latent knowledge of the correct answer in these contexts, +even for problems harder than those the probe was trained on. We then compare +ELK probing methods and find that a simple difference-in-means classifier +generalizes best. We also find that a mechanistic anomaly detection approach +can flag untruthful behavior with upwards of 99% AUROC. Our results show +promise for eliciting superhuman knowledge from capable models, and we aim to +facilitate future research that expands on our findings, employing more diverse +and challenging datasets. + +
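+ As an illustration of the probing family mentioned above, here is a difference-in-means classifier over hidden activations; the synthetic activations and zero threshold are placeholders, not the authors' setup.
+
+import numpy as np
+
+def fit_diff_in_means(acts: np.ndarray, labels: np.ndarray) -> np.ndarray:
+    # Probe direction = mean activation of the positive class minus that of the negative class.
+    return acts[labels == 1].mean(axis=0) - acts[labels == 0].mean(axis=0)
+
+def predict(acts: np.ndarray, direction: np.ndarray, threshold: float = 0.0) -> np.ndarray:
+    # Project activations onto the direction and threshold the resulting score.
+    return (acts @ direction > threshold).astype(int)
+
+if __name__ == "__main__":
+    rng = np.random.default_rng(0)
+    acts = np.vstack([rng.normal(0.5, 1.0, (100, 16)), rng.normal(-0.5, 1.0, (100, 16))])
+    labels = np.array([1] * 100 + [0] * 100)
+    w = fit_diff_in_means(acts, labels)
+    print("train accuracy:", (predict(acts, w) == labels).mean())
+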
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Harnessing the Power of Prompt-based Techniques for Generating + School-Level Questions using Large Language Models + + +
+ Designing high-quality educational questions is a challenging and
+time-consuming task. In this work, we propose a novel approach that utilizes
+prompt-based techniques to generate descriptive and reasoning-based questions.
+However, current question-answering (QA) datasets are inadequate for conducting
+our experiments on prompt-based question generation (QG) in an educational
+setting. Therefore, we curate a new QG dataset called EduProbe for school-level
+subjects, by leveraging the rich content of NCERT textbooks. We carefully
+annotate this dataset as quadruples of 1) Context: a segment upon which the
+question is formed; 2) Long Prompt: a long textual cue for the question (i.e.,
+a longer sequence of words or phrases, covering the main theme of the context);
+3) Short Prompt: a short textual cue for the question (i.e., a condensed
+representation of the key information or focus of the context); 4) Question: a
+deep question that aligns with the context and is coherent with the prompts. We
+investigate several prompt-based QG methods by fine-tuning pre-trained
+transformer-based large language models (LLMs), namely PEGASUS, T5, MBART, and
+BART. Moreover, we explore the performance of two general-purpose pre-trained
+LLMs, namely Text-Davinci-003 and GPT-3.5-Turbo, without any further training.
+By performing automatic evaluation, we show that T5 (with long prompt)
+outperforms all other models, but still falls short of the human baseline.
+Under human evaluation criteria, Text-Davinci-003 usually shows better results
+than other models under various prompt settings. Even under human
+evaluation criteria, QG models mostly fall short of the human baseline. Our
+code and dataset are available at: https://github.com/my625/PromptQG
+
+
+
+
+
+ + ☆ Dual-Teacher De-biasing Distillation Framework for Multi-domain Fake + News Detection ICDE 2024 + + +
+ Multi-domain fake news detection aims to identify whether various news from
+different domains is real or fake, a task that has become urgent and important. However,
+existing methods are dedicated to improving the overall performance of fake
+news detection, ignoring the fact that unbalanced data leads to disparate
+treatment for different domains, i.e., the domain bias problem. To solve this
+problem, we propose the Dual-Teacher De-biasing Distillation framework (DTDBD)
+to mitigate bias across different domains. Following the knowledge distillation
+methods, DTDBD adopts a teacher-student structure, where pre-trained large
+teachers instruct a student model. In particular, the DTDBD consists of an
+unbiased teacher and a clean teacher that jointly guide the student model in
+mitigating domain bias and maintaining performance. For the unbiased teacher,
+we introduce an adversarial de-biasing distillation loss to instruct the
+student model in learning unbiased domain knowledge. For the clean teacher, we
+design a domain knowledge distillation loss, which effectively incentivizes the
+student model to focus on representing domain features while maintaining
+performance. Moreover, we present a momentum-based dynamic adjustment algorithm
+to trade off the effects of the two teachers. Extensive experiments on Chinese and
+English datasets show that the proposed method substantially outperforms the
+state-of-the-art baseline methods in terms of bias metrics while guaranteeing
+competitive performance.
+
+
+ comment: ICDE 2024 +
+
+
+
+
+ + ☆ Axiomatic Preference Modeling for Longform Question Answering EMNLP 2023 + + +
+ The remarkable abilities of large language models (LLMs) like GPT-4 partially
+stem from post-training processes like Reinforcement Learning from Human
+Feedback (RLHF) involving human preferences encoded in a reward model. However,
+these reward models (RMs) often lack direct knowledge of why, or under what
+principles, the preference annotations were made. In this study, we identify
+principles that guide RMs to better align with human preferences, and then
+develop an axiomatic framework to generate a rich variety of preference signals
+to uphold them. We use these axiomatic signals to train a model for scoring
+answers to longform questions. Our approach yields a Preference Model with only
+about 220M parameters that agrees with gold human-annotated preference labels
+more often than GPT-4. The contributions of this work include: training a
+standalone preference model that can score human- and LLM-generated answers on
+the same scale; developing an axiomatic framework for generating training data
+pairs tailored to certain principles; and showing that a small amount of
+axiomatic signals can help small models outperform GPT-4 in preference scoring.
+We release our model on Hugging Face:
+https://huggingface.co/corbyrosset/axiomatic_preference_model
+
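+ A toy sketch of training a scalar answer scorer on preference pairs with a Bradley-Terry style loss, which is the general recipe such preference models follow; the feature extractor, data, and model size are placeholders, and the axiomatic signal generation itself is not shown.
+
+import torch
+import torch.nn as nn
+
+class AnswerScorer(nn.Module):
+    def __init__(self, dim: int = 32):
+        super().__init__()
+        self.head = nn.Sequential(nn.Linear(dim, 64), nn.ReLU(), nn.Linear(64, 1))
+
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        return self.head(features).squeeze(-1)  # one scalar score per answer
+
+def preference_loss(score_pos: torch.Tensor, score_neg: torch.Tensor) -> torch.Tensor:
+    # -log sigmoid(score_pos - score_neg): pushes preferred answers above rejected ones.
+    return -torch.nn.functional.logsigmoid(score_pos - score_neg).mean()
+
+if __name__ == "__main__":
+    model = AnswerScorer()
+    opt = torch.optim.Adam(model.parameters(), lr=1e-3)
+    pos, neg = torch.randn(8, 32), torch.randn(8, 32)  # stand-in answer features
+    for _ in range(100):
+        loss = preference_loss(model(pos), model(neg))
+        opt.zero_grad(); loss.backward(); opt.step()
+    print("final loss:", loss.item())
+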
+
+ comment: Accepted to EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ Principle-Driven Self-Alignment of Language Models from Scratch with + Minimal Human Supervision NeurIPS 2023 + + +
+ Recent AI-assistant agents, such as ChatGPT, predominantly rely on supervised
+fine-tuning (SFT) with human annotations and reinforcement learning from human
+feedback (RLHF) to align the output of large language models (LLMs) with human
+intentions, ensuring they are helpful, ethical, and reliable. However, this
+dependence can significantly constrain the true potential of AI-assistant
+agents due to the high cost of obtaining human supervision and the related
+issues of quality, reliability, diversity, self-consistency, and undesirable
+biases. To address these challenges, we propose a novel approach called
+SELF-ALIGN, which combines principle-driven reasoning and the generative power
+of LLMs for the self-alignment of AI agents with minimal human supervision. Our
+approach encompasses four stages: first, we use an LLM to generate synthetic
+prompts, and a topic-guided method to augment the prompt diversity; second, we
+use a small set of human-written principles for AI models to follow, and guide
+the LLM through in-context learning from demonstrations (of principles
+application) to produce helpful, ethical, and reliable responses to users'
+queries; third, we fine-tune the original LLM with the high-quality
+self-aligned responses so that the resulting model can generate desirable
+responses for each query directly without the principle set and the
+demonstrations anymore; and finally, we offer a refinement step to address the
+issues of overly-brief or indirect responses. Applying SELF-ALIGN to the
+LLaMA-65b base language model, we develop an AI assistant named Dromedary. With
+fewer than 300 lines of human annotations (including < 200 seed prompts, 16
+generic principles, and 5 exemplars for in-context learning), Dromedary
+significantly surpasses the performance of several state-of-the-art AI systems,
+including Text-Davinci-003 and Alpaca, on benchmark datasets with various
+settings.
+
+
+ comment: Accepted at NeurIPS 2023 (Spotlight). Project page: + https://github.com/IBM/Dromedary +
+
+
+
+
+ + ♻ ☆ Evaluating the Efficacy of Hybrid Deep Learning Models in Distinguishing + AI-Generated Text + + +
+ My research investigates the use of cutting-edge hybrid deep learning models
+to accurately differentiate between AI-generated text and human writing. I
+applied a robust methodology, utilising a carefully selected dataset comprising
+AI and human texts from various sources, each tagged with instructions.
+Advanced natural language processing techniques facilitated the analysis of
+textual features. By combining sophisticated neural networks, the custom model
+was able to detect nuanced differences between AI-generated and human-written content.
+
+
+
+
+
+ + ♻ ☆ Baize: An Open-Source Chat Model with Parameter-Efficient Tuning on + Self-Chat Data EMNLP 2023 + + +
+ Chat models, such as ChatGPT, have shown impressive capabilities and have +been rapidly adopted across numerous domains. However, these models are only +accessible through a restricted API, creating barriers for new research and +progress in the field. We propose a pipeline that can automatically generate a +high-quality multi-turn chat corpus by leveraging ChatGPT to engage in a +conversation with itself. Subsequently, we employ parameter-efficient tuning to +enhance LLaMA, an open-source large language model. The resulting model, named +Baize, demonstrates good performance in multi-turn dialogues with guardrails +that minimize potential risks. Furthermore, we propose a new technique called +Self-Distill with Feedback, to further improve the performance of the Baize +models with feedback from ChatGPT. The Baize models and data are released for +research purposes only at https://github.com/project-baize/baize-chatbot. An +online demo is also available at +https://huggingface.co/spaces/project-baize/chat-with-baize. + +
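+ A rough sketch of the self-chat idea: one model plays both the user and the assistant to build a multi-turn dialogue from a seed topic. The generate callable is a placeholder for a real chat model; the actual pipeline uses ChatGPT plus seeding and filtering steps not shown here.
+
+def self_chat(seed_topic: str, generate, num_turns: int = 4) -> list:
+    transcript = [("user", f"Tell me about {seed_topic}.")]
+    for _ in range(num_turns):
+        history = "\n".join(f"{role}: {text}" for role, text in transcript)
+        reply = generate(history + "\nassistant:")          # model answers as the assistant
+        transcript.append(("assistant", reply))
+        follow_up = generate(history + f"\nassistant: {reply}\nuser:")  # then asks as the user
+        transcript.append(("user", follow_up))
+    return transcript
+
+if __name__ == "__main__":
+    canned = iter(f"reply {i}" for i in range(100))
+    fake_llm = lambda prompt: next(canned)  # stand-in for a real chat model
+    for role, text in self_chat("how transformers work", fake_llm, num_turns=2):
+        print(f"{role}: {text}")
+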
+
+ comment: Baize v2; EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ Custom Data Augmentation for low resource ASR using Bark and + Retrieval-Based Voice Conversion + + +
+ This paper proposes two innovative methodologies to construct customized +Common Voice datasets for low-resource languages like Hindi. The first +methodology leverages Bark, a transformer-based text-to-audio model developed +by Suno, and incorporates Meta's enCodec and a pre-trained HuBert model to +enhance Bark's performance. The second methodology employs Retrieval-Based +Voice Conversion (RVC) and uses the Ozen toolkit for data preparation. Both +methodologies contribute to the advancement of ASR technology and offer +valuable insights into addressing the challenges of constructing customized +Common Voice datasets for under-resourced languages. Furthermore, they provide +a pathway to achieving high-quality, personalized voice generation for a range +of applications. + +
+
+
+
+
+ + ♻ ☆ Synthetic Text Generation using Hypergraph Representations + + +
+ Generating synthetic variants of a document is often posed as text-to-text +transformation. We propose an alternate LLM based method that first decomposes +a document into semantic frames and then generates text using this interim +sparse format. The frames are modeled using a hypergraph, which allows +perturbing the frame contents in a principled manner. Specifically, new +hyperedges are mined through topological analysis and complex polyadic +relationships including hierarchy and temporal dynamics are accommodated. We +show that our solution generates documents that are diverse, coherent and vary +in style, sentiment, format, composition and facts. + +
+
+
+
+
+ + ♻ ☆ Evaluating the Factual Consistency of Large Language Models Through News + Summarization + + +
+ While large language models (LLMs) have proven to be effective on a large
+variety of tasks, they are also known to hallucinate information. To measure
+whether an LLM prefers factually consistent continuations of its input, we
+propose a new benchmark called FIB (Factual Inconsistency Benchmark) that
+focuses on the task of summarization. Specifically, our benchmark involves
+comparing the scores an LLM assigns to a factually consistent versus a
+factually inconsistent summary for an input news article. For factually
+consistent summaries, we use human-written reference summaries that we manually
+verify as factually consistent. To generate summaries that are factually
+inconsistent, we generate summaries from a suite of summarization models that
+we have manually annotated as factually inconsistent. A model's factual
+consistency is then measured according to its accuracy, i.e., the proportion of
+documents where it assigns a higher score to the factually consistent summary.
+To validate the usefulness of FIB, we evaluate 23 large language models ranging
+from 1B to 176B parameters from six different model families including BLOOM
+and OPT. We find that existing LLMs generally assign a higher score to
+factually consistent summaries than to factually inconsistent summaries.
+However, if the factually inconsistent summaries occur verbatim in the
+document, then LLMs assign a higher score to these factually inconsistent
+summaries than to factually consistent summaries. We validate design choices in
+our benchmark including the scoring method and source of distractor summaries.
+Our code and benchmark data can be found at https://github.com/r-three/fib.
+
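+ The accuracy metric described above reduces to a pairwise comparison per article; a minimal sketch follows, where score would in practice be something like the model's length-normalized log-likelihood of a summary given the article (a toy scorer is used here).
+
+def fib_accuracy(examples, score) -> float:
+    # Fraction of articles where the consistent summary outscores the inconsistent one.
+    wins = 0
+    for article, consistent, inconsistent in examples:
+        if score(article, consistent) > score(article, inconsistent):
+            wins += 1
+    return wins / len(examples)
+
+if __name__ == "__main__":
+    data = [("article text", "faithful summary", "hallucinated longer summary")] * 3
+    toy_score = lambda doc, summ: -abs(len(summ) - len(doc)) / 10.0  # stand-in scorer
+    print(fib_accuracy(data, toy_score))
+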
+
+
+
+
+ + ♻ ☆ Enabling On-Device Large Language Model Personalization with + Self-Supervised Data Selection and Synthesis + + +
+ After a large language model (LLM) is deployed on edge devices, it is +desirable for these devices to learn from user-generated conversation data to +generate user-specific and personalized responses in real-time. However, +user-generated data usually contains sensitive and private information, and +uploading such data to the cloud for annotation is not preferred if not +prohibited. While it is possible to obtain annotation locally by directly +asking users to provide preferred responses, such annotations have to be sparse +to not affect user experience. In addition, the storage of edge devices is +usually too limited to enable large-scale fine-tuning with full user-generated +data. It remains an open question how to enable on-device LLM personalization, +considering sparse annotation and limited on-device storage. In this paper, we +propose a novel framework to select and store the most representative data +online in a self-supervised way. Such data has a small memory footprint and +allows infrequent requests of user annotations for further fine-tuning. To +enhance fine-tuning quality, multiple semantically similar pairs of question +texts and expected responses are generated using the LLM. Our experiments show +that the proposed framework achieves the best user-specific content-generating +capability (accuracy) and fine-tuning speed (performance) compared with vanilla +baselines. To the best of our knowledge, this is the very first on-device LLM +personalization framework. + +
+
+ comment: 6 pages, 3 figures, 3 tables +
+
+
+
+
+ + ♻ ☆ From Classification to Generation: Insights into Crosslingual Retrieval + Augmented ICL NeurIPS 2023 + + +
+ The remarkable ability of Large Language Models (LLMs) to understand and +follow instructions has sometimes been limited by their in-context learning +(ICL) performance in low-resource languages. To address this, we introduce a +novel approach that leverages cross-lingual retrieval-augmented in-context +learning (CREA-ICL). By extracting semantically similar prompts from +high-resource languages, we aim to improve the zero-shot performance of +multilingual pre-trained language models (MPLMs) across diverse tasks. Though +our approach yields steady improvements in classification tasks, it faces +challenges in generation tasks. Our evaluation offers insights into the +performance dynamics of retrieval-augmented in-context learning across both +classification and generation domains. + +
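+ A minimal sketch of the retrieval step: embed the low-resource query, retrieve the most similar high-resource-language examples by cosine similarity, and prepend them as in-context demonstrations. The embed function is a stand-in for any multilingual sentence encoder; the details differ from the paper's pipeline.
+
+import numpy as np
+
+def retrieve(query_vec, pool_vecs, k=2):
+    # Cosine similarity between the query and each candidate demonstration.
+    sims = pool_vecs @ query_vec / (
+        np.linalg.norm(pool_vecs, axis=1) * np.linalg.norm(query_vec) + 1e-9)
+    return np.argsort(-sims)[:k]
+
+def build_icl_prompt(query, pool_texts, pool_vecs, embed, k=2):
+    idx = retrieve(embed(query), pool_vecs, k)
+    demos = "\n".join(pool_texts[i] for i in idx)
+    return f"{demos}\n{query}"
+
+if __name__ == "__main__":
+    embed = lambda s: np.array([len(s), s.count(" "), s.count("?")], float)  # toy encoder
+    pool = ["Q: Is the sky blue? A: yes", "Q: Is fire cold? A: no", "Q: 2+2? A: 4"]
+    pool_vecs = np.stack([embed(t) for t in pool])
+    print(build_icl_prompt("Q: Is ice hot? A:", pool, pool_vecs, embed))
+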
+
+ comment: In The Workshop on Instruction Tuning and Instruction Following, held + in conjunction with The Conference on NeurIPS 2023, December 2023 +
+
+
+
+
+ + ♻ ☆ Crosslingual Retrieval Augmented In-context Learning for Bangla EMNLP + + +
+ The promise of Large Language Models (LLMs) in Natural Language Processing
+has often been overshadowed by their limited performance in low-resource
+languages such as Bangla. To address this, our paper presents a pioneering
+approach that utilizes cross-lingual retrieval augmented in-context learning.
+By strategically sourcing semantically similar prompts from high-resource
+languages, we enable multilingual pretrained language models (MPLMs), especially
+the generative model BLOOMZ, to successfully boost performance on Bangla tasks.
+Our extensive evaluation highlights that the cross-lingual retrieval augmented
+prompts bring steady improvements to MPLMs over their zero-shot performance.
+
+
+ comment: In The 1st Bangla Language Processing (BLP) Workshop, held in + conjunction with The Conference on Empirical Methods in Natural Language + Processing (EMNLP), December 2023 +
+
+
+
+
+ + ♻ ☆ DiLoCo: Distributed Low-Communication Training of Language Models + + +
+ Large language models (LLMs) have become a critical component in many
+applications of machine learning. However, standard approaches to training LLMs
+require a large number of tightly interconnected accelerators, with devices
+exchanging gradients and other intermediate states at each optimization step.
+While it is difficult to build and maintain a single computing cluster hosting
+many accelerators, it might be easier to find several computing clusters each
+hosting a smaller number of devices. In this work, we propose a distributed
+optimization algorithm, Distributed Low-Communication (DiLoCo), that enables
+training of language models on islands of devices that are poorly connected.
+The approach is a variant of federated averaging, where the number of inner
+steps is large, the inner optimizer is AdamW, and the outer optimizer is
+Nesterov momentum. On the widely used C4 dataset, we show that DiLoCo on 8
+workers performs as well as fully synchronous optimization while communicating
+500 times less. DiLoCo exhibits great robustness to the data distribution of
+each worker. It is also robust to resources becoming unavailable over time,
+and, conversely, it can seamlessly leverage resources that become available during
+training.
+
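+ A toy sketch of the two-level scheme described above: each worker runs many local AdamW steps, and the averaged parameter delta is then applied as a pseudo-gradient with an outer Nesterov-momentum step. The model, data, and hyperparameters are placeholders, not the paper's configuration.
+
+import copy
+import torch
+
+def diloco_round(global_model, worker_loaders, inner_steps=50, inner_lr=1e-3,
+                 outer_lr=0.7, outer_momentum=0.9, outer_state=None):
+    deltas = []
+    for loader in worker_loaders:
+        # Each worker starts from the current global model and trains locally.
+        local = copy.deepcopy(global_model)
+        opt = torch.optim.AdamW(local.parameters(), lr=inner_lr)
+        for _, (x, y) in zip(range(inner_steps), loader):
+            loss = torch.nn.functional.mse_loss(local(x), y)
+            opt.zero_grad(); loss.backward(); opt.step()
+        deltas.append([gp.data - lp.data for gp, lp in
+                       zip(global_model.parameters(), local.parameters())])
+    # Average the deltas (the "outer gradient") and apply a Nesterov-momentum update.
+    if outer_state is None:
+        outer_state = [torch.zeros_like(p) for p in global_model.parameters()]
+    for i, p in enumerate(global_model.parameters()):
+        avg_delta = torch.stack([d[i] for d in deltas]).mean(dim=0)
+        outer_state[i] = outer_momentum * outer_state[i] + avg_delta
+        p.data -= outer_lr * (avg_delta + outer_momentum * outer_state[i])
+    return outer_state
+
+if __name__ == "__main__":
+    model = torch.nn.Linear(4, 1)
+    make_loader = lambda: [(torch.randn(8, 4), torch.randn(8, 1)) for _ in range(20)]
+    diloco_round(model, [make_loader(), make_loader()], inner_steps=10)
+    print("one DiLoCo round done")
+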
+
+
+
+
+ + ♻ ☆ ARN: A Comprehensive Framework and Benchmark for Analogical Reasoning on + Narratives + + +
+ Analogical reasoning is one of the prime abilities of humans and is linked to +creativity and scientific discoveries. This ability has been studied +extensively in natural language processing (NLP) and in cognitive psychology. +NLP benchmarks often focus on proportional analogies, while the ones in +cognitive psychology investigate longer pieces of text too. Yet, although +studies that focus on analogical reasoning in an involved setting utilize +narratives as their evaluation medium, analogical reasoning on narratives has +not been studied extensively. We create an extensive evaluation framework for +analogical reasoning on narratives that utilizes narrative elements to create +lower-order and higher-order mappings that subsequently lead to the development +of the Analogical Reasoning on Narratives (ARN) benchmark that covers four +categories of far(cross-domain)/near(within-domain) analogies and far/near +disanalogies, allowing us to study analogical reasoning in LLMs in distinct +scenarios. Our results demonstrate that LLMs struggle to recognize higher-order +mappings when they are not accompanied by lower-order mappings (far analogies) +and show better performance when all mappings are formed simultaneously (near +analogies). We observe that in all the scenarios, the analogical reasoning +abilities of LLMs can be easily impaired by lower-order mappings in near +disanalogies. + +
+
+
+
+
+ + ♻ ☆ GPT-Fathom: Benchmarking Large Language Models to Decipher the + Evolutionary Path towards GPT-4 and Beyond + + +
+ With the rapid advancement of large language models (LLMs), there is a +pressing need for a comprehensive evaluation suite to assess their capabilities +and limitations. Existing LLM leaderboards often reference scores reported in +other papers without consistent settings and prompts, which may inadvertently +encourage cherry-picking favored settings and prompts for better results. In +this work, we introduce GPT-Fathom, an open-source and reproducible LLM +evaluation suite built on top of OpenAI Evals. We systematically evaluate 10+ +leading LLMs as well as OpenAI's legacy models on 20+ curated benchmarks across +7 capability categories, all under aligned settings. Our retrospective study on +OpenAI's earlier models offers valuable insights into the evolutionary path +from GPT-3 to GPT-4. Currently, the community is eager to know how GPT-3 +progressively improves to GPT-4, including technical details like whether +adding code data improves LLM's reasoning capability, which aspects of LLM +capability can be improved by SFT and RLHF, how much is the alignment tax, etc. +Our analysis sheds light on many of these questions, aiming to improve the +transparency of advanced LLMs. + +
+
+
+
+
+ + ♻ ☆ Sentiment Analysis in Finance: From Transformers Back to eXplainable + Lexicons (XLex) + + +
+ Lexicon-based sentiment analysis (SA) in finance leverages specialized, +manually annotated lexicons created by human experts to extract sentiment from +financial texts. Although lexicon-based methods are simple to implement and +fast to operate on textual data, they require considerable manual annotation +efforts to create, maintain, and update the lexicons. These methods are also +considered inferior to the deep learning-based approaches, such as transformer +models, which have become dominant in various NLP tasks due to their remarkable +performance. However, transformers require extensive data and computational +resources for both training and testing. Additionally, they involve significant +prediction times, making them unsuitable for real-time production environments +or systems with limited processing capabilities. In this paper, we introduce a +novel methodology named eXplainable Lexicons (XLex) that combines the +advantages of both lexicon-based methods and transformer models. We propose an +approach that utilizes transformers and SHapley Additive exPlanations (SHAP) +for explainability to learn financial lexicons. Our study presents four main +contributions. Firstly, we demonstrate that transformer-aided explainable +lexicons can enhance the vocabulary coverage of the benchmark Loughran-McDonald +(LM) lexicon, reducing the human involvement in annotating, maintaining, and +updating the lexicons. Secondly, we show that the resulting lexicon outperforms +the standard LM lexicon in SA of financial datasets. Thirdly, we illustrate +that the lexicon-based approach is significantly more efficient in terms of +model speed and size compared to transformers. Lastly, the XLex approach is +inherently more interpretable than transformer models as lexicon models rely on +predefined rules, allowing for better insights into the results of SA and +making the XLex approach a viable tool for financial decision-making. + +
+
+ comment: Submitted to IEEE Access +
+
+
+
+
+ + ♻ ☆ WeaverBird: Empowering Financial Decision-Making with Large Language + Model, Knowledge Base, and Search Engine + + +
+ We present WeaverBird, an intelligent dialogue system designed specifically +for the finance domain. Our system harnesses a large language model of GPT +architecture that has been tuned using extensive corpora of finance-related +text. As a result, our system possesses the capability to understand complex +financial queries, such as "How should I manage my investments during +inflation?", and provide informed responses. Furthermore, our system +incorporates a local knowledge base and a search engine to retrieve relevant +information. The final responses are conditioned on the search results and +include proper citations to the sources, thus enjoying an enhanced credibility. +Through a range of finance-related questions, we have demonstrated the superior +performance of our system compared to other models. To experience our system +firsthand, users can interact with our live demo at +https://weaverbird.ttic.edu, as well as watch our 2-min video illustration at +https://www.youtube.com/watch?v=fyV2qQkX6Tc. + +
+
+
+
+
+ + ♻ ☆ Information Extraction in Low-Resource Scenarios: Survey and Perspective + + +
+ Information Extraction (IE) seeks to derive structured information from +unstructured texts, often facing challenges in low-resource scenarios due to +data scarcity and unseen classes. This paper presents a review of neural +approaches to low-resource IE from \emph{traditional} and \emph{LLM-based} +perspectives, systematically categorizing them into a fine-grained taxonomy. +Then we conduct empirical study on LLM-based methods compared with previous +state-of-the-art models, and discover that (1) well-tuned LMs are still +predominant; (2) tuning open-resource LLMs and ICL with GPT family is promising +in general; (3) the optimal LLM-based technical solution for low-resource IE +can be task-dependent. In addition, we discuss low-resource IE with LLMs, +highlight promising applications, and outline potential research directions. +This survey aims to foster understanding of this field, inspire new ideas, and +encourage widespread applications in both academia and industry. + +
+
+ comment: Work in Progress. Paper List:
+ https://github.com/zjunlp/Low-resource-KEPapers; Data and Code:
+ https://github.com/mayubo2333/LLM_project
+
+
+
+
+
+ + ♻ ☆ Does VLN Pretraining Work with Nonsensical or Irrelevant Instructions? CVPR 2023 + + +
+ Data augmentation via back-translation is common when pretraining
+Vision-and-Language Navigation (VLN) models, even though the generated
+instructions are noisy. But: does that noise matter? We find that nonsensical
+or irrelevant language instructions during pretraining can have little effect
+on downstream performance for both HAMT and VLN-BERT on R2R, and that such
+pretraining is still better than using only clean, human data. To underscore these results, we
+concoct an efficient augmentation method, Unigram + Object, which generates
+nonsensical instructions that nonetheless improve downstream performance. Our
+findings suggest that what matters for VLN R2R pretraining is the quantity of
+visual trajectories, not the quality of instructions.
+
+
+ comment: Accepted by O-DRUM @ CVPR 2023 +
+
+
+
+
+ + ♻ ☆ NLP-based detection of systematic anomalies among the narratives of + consumer complaints + + +
+ We develop an NLP-based procedure for detecting systematic nonmeritorious +consumer complaints, simply called systematic anomalies, among complaint +narratives. While classification algorithms are used to detect pronounced +anomalies, in the case of smaller and frequent systematic anomalies, the +algorithms may falter due to a variety of reasons, including technical ones as +well as natural limitations of human analysts. Therefore, as the next step +after classification, we convert the complaint narratives into quantitative +data, which are then analyzed using an algorithm for detecting systematic +anomalies. We illustrate the entire procedure using complaint narratives from +the Consumer Complaint Database of the Consumer Financial Protection Bureau. + +
+
+
+
+
+
+
+
+ + Information Retrieval 7 + +
+
+
+ + ☆ Structured, Complex and Time-complete Temporal Event Forecasting + + +
+ Temporal event forecasting aims to predict what will happen next given the
+observed events in history. Previous formulations of temporal events are
+unstructured, atomic, or lack full temporal information, thus largely
+restricting the representation quality and forecasting ability of temporal
+events. To address these limitations, we introduce a novel formulation for
+Structured, Complex, and Time-complete Temporal Event (SCTc-TE). Based on this
+new formulation, we develop a simple and fully automated pipeline for
+constructing such SCTc-TEs from a large number of news articles. Furthermore,
+we propose a novel model that leverages both Local and Global contexts for
+SCTc-TE forecasting, named LoGo. To evaluate our model, we construct two
+large-scale datasets named MidEast-TE and GDELT-TE. Extensive evaluations
+demonstrate the advantages of our datasets in multiple aspects, while
+experimental results justify the effectiveness of our forecasting model LoGo.
+We release the code and dataset via
+https://github.com/yecchen/GDELT-ComplexEvent.
+
+
+
+
+
+ + ☆ A Hypergraph-Based Approach to Recommend Online Resources in a Library + + +
+ When users in a digital library read or browse online resources, they generate
+an immense amount of data. If the underlying system can recommend items, such
+as books and journals, to the users, it will help them to find the related
+items. This research analyzes a digital library's usage data to recommend items
+to its users, and it uses different clustering algorithms to design the
+recommender system. We have used content-based clustering, including
+hierarchical, expectation maximization (EM), K-means, FarthestFirst, and
+density-based clustering algorithms, and user access pattern-based clustering,
+which uses a hypergraph-based approach to generate the clusters. This research
+shows that the recommender system designed using the hypergraph algorithm
+generates the most accurate recommendation model compared to those designed
+using the content-based clustering approaches.
+
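+ A simplified sketch of the hypergraph view of usage data: each reading session is a hyperedge over the items accessed together, and unseen items are scored by how strongly their hyperedges overlap with the user's items. The clustering machinery evaluated in the paper is not reproduced here.
+
+from collections import Counter
+
+def recommend(sessions, user_items, top_k=3):
+    user_items = set(user_items)
+    scores = Counter()
+    for hyperedge in sessions:              # one hyperedge per reading session
+        edge = set(hyperedge)
+        overlap = len(edge & user_items)
+        if overlap:
+            for item in edge - user_items:  # credit co-occurring unseen items
+                scores[item] += overlap
+    return [item for item, _ in scores.most_common(top_k)]
+
+if __name__ == "__main__":
+    sessions = [{"bookA", "bookB", "journalX"}, {"bookB", "bookC"}, {"bookA", "bookC"}]
+    print(recommend(sessions, {"bookA"}))
+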
+
+ comment: 12 Pages, 2 figures, and 1 table +
+
+
+
+
+ + ♻ ☆ Neural Graph Collaborative Filtering Using Variational Inference PAKDD2024 + + +
+ The customization of recommended content to users holds significant
+importance in enhancing user experiences across a wide spectrum of applications
+such as e-commerce, music, and shopping. Graph-based methods have achieved
+considerable performance by capturing user-item interactions. However, these
+methods tend to utilize randomly constructed embeddings in the dataset used for
+training the recommender, which lack any user preferences. Here, we propose
+the concept of variational embeddings as a means of pre-training the
+recommender system to improve the feature propagation through the layers of
+graph convolutional networks (GCNs). The graph variational embedding
+collaborative filtering (GVECF) is introduced as a novel framework to
+incorporate representations learned through a variational graph auto-encoder
+which are embedded into a GCN-based collaborative filtering. This approach
+effectively transforms latent high-order user-item interactions into more
+trainable vectors, ultimately resulting in better performance in terms of
+recall and normalized discounted cumulative gain (NDCG) metrics. The experiments
+conducted on benchmark datasets demonstrate that our proposed method achieves
+up to 13.78% improvement in recall on the test data.
+
+
+ comment: Submitted to the PAKDD2024 conference, 12 pages
+
+
+
+
+
+ + ♻ ☆ FedDCSR: Federated Cross-domain Sequential Recommendation via + Disentangled Representation Learning + + +
+ Cross-domain Sequential Recommendation (CSR), which leverages user sequence
+data from multiple domains, has received extensive attention in recent years.
+However, the existing CSR methods require sharing original user data across
+domains, which violates the General Data Protection Regulation (GDPR). Thus, it
+is necessary to combine federated learning (FL) and CSR to fully utilize
+knowledge from different domains while preserving data privacy. Nonetheless,
+the sequence feature heterogeneity across different domains significantly
+impacts the overall performance of FL. In this paper, we propose FedDCSR, a
+novel federated cross-domain sequential recommendation framework via
+disentangled representation learning. Specifically, to address the sequence
+feature heterogeneity across domains, we introduce an approach called
+inter-intra domain sequence representation disentanglement (SRD) to disentangle
+the user sequence features into domain-shared and domain-exclusive features. In
+addition, we design an intra domain contrastive infomax (CIM) strategy to learn
+richer domain-exclusive features of users by performing data augmentation on
+user sequences. Extensive experiments on three real-world scenarios demonstrate
+that FedDCSR achieves significant improvements over existing baselines.
+
+
+
+
+
+ + ♻ ☆ Information Extraction in Low-Resource Scenarios: Survey and Perspective + + +
+ Information Extraction (IE) seeks to derive structured information from +unstructured texts, often facing challenges in low-resource scenarios due to +data scarcity and unseen classes. This paper presents a review of neural +approaches to low-resource IE from \emph{traditional} and \emph{LLM-based} +perspectives, systematically categorizing them into a fine-grained taxonomy. +Then we conduct empirical study on LLM-based methods compared with previous +state-of-the-art models, and discover that (1) well-tuned LMs are still +predominant; (2) tuning open-resource LLMs and ICL with GPT family is promising +in general; (3) the optimal LLM-based technical solution for low-resource IE +can be task-dependent. In addition, we discuss low-resource IE with LLMs, +highlight promising applications, and outline potential research directions. +This survey aims to foster understanding of this field, inspire new ideas, and +encourage widespread applications in both academia and industry. + +
+
+ comment: Work in Progress. Paper List:
+ https://github.com/zjunlp/Low-resource-KEPapers; Data and Code:
+ https://github.com/mayubo2333/LLM_project
+
+
+
+
+
+ + ♻ ☆ A Comprehensive Summarization and Evaluation of Feature Refinement + Modules for CTR Prediction + + +
+ Click-through rate (CTR) prediction is widely used in academia and industry.
+Most CTR tasks fall into a feature embedding & feature interaction paradigm,
+where the accuracy of CTR prediction is mainly improved by designing practical
+feature interaction structures. However, recent studies have argued that the
+fixed feature embedding learned only through the embedding layer limits the
+performance of existing CTR models. Some works apply extra modules on top of
+the embedding layer to dynamically refine feature representations in different
+instances, making it effective and easy to integrate with existing CTR methods.
+Despite the promising results, there is a lack of a systematic review and
+summarization of this new promising direction on the CTR task. To fill this
+gap, we comprehensively summarize and define a new module, namely the
+feature refinement (FR) module, which can be applied between feature
+embedding and interaction layers. We extract 14 FR modules from previous works,
+including instances where the FR module was proposed but not clearly defined or
+explained. We fully assess the effectiveness and compatibility of existing FR
+modules through comprehensive and extensive experiments with over 200 augmented
+models and over 4,000 runs for more than 15,000 GPU hours. The results offer
+insightful guidelines for researchers, and all benchmarking code and
+experimental results are open-sourced. In addition, we present a new
+architecture of assigning independent FR modules to separate sub-networks for
+parallel CTR models, as opposed to the conventional method of inserting a
+shared FR module on top of the embedding layer. Our approach is also supported
+by comprehensive experiments demonstrating its effectiveness.
+
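+ For illustration, one possible feature-refinement (FR) module in the sense defined above: a small gating network placed between the embedding layer and the interaction layer that re-weights each field's embedding per instance. The SENet-style gating is an assumed example, not one of the paper's 14 surveyed designs in particular.
+
+import torch
+import torch.nn as nn
+
+class GatingFR(nn.Module):
+    def __init__(self, num_fields: int, embed_dim: int, reduction: int = 2):
+        super().__init__()
+        hidden = max(1, num_fields // reduction)
+        self.gate = nn.Sequential(
+            nn.Linear(num_fields, hidden), nn.ReLU(),
+            nn.Linear(hidden, num_fields), nn.Sigmoid())
+
+    def forward(self, embeddings: torch.Tensor) -> torch.Tensor:
+        # embeddings: (batch, num_fields, embed_dim)
+        summary = embeddings.mean(dim=-1)              # squeeze each field to a scalar
+        weights = self.gate(summary).unsqueeze(-1)     # per-instance field weights
+        return embeddings * weights                    # refined embeddings
+
+if __name__ == "__main__":
+    fr = GatingFR(num_fields=5, embed_dim=8)
+    print(fr(torch.randn(4, 5, 8)).shape)  # torch.Size([4, 5, 8])
+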
+
+
+
+
+ + ♻ ☆ An In-depth Analysis of Passage-Level Label Transfer for Contextual + Document Ranking + + +
+ Pre-trained contextual language models such as BERT, GPT, and XLNet work
+quite well for document retrieval tasks. Such models are fine-tuned based on
+the query-document/query-passage level relevance labels to capture the ranking
+signals. However, the documents are longer than the passages and such document
+ranking models suffer from the token limitation (512) of BERT. Researchers
+proposed ranking strategies that either truncate the documents beyond the token
+limit or chunk the documents into units that can fit into BERT. In the
+latter case, the relevance labels are either directly transferred from the
+original query-document pair or learned through some external model. In this
+paper, we conduct a detailed study of the design decisions about splitting and
+label transfer on retrieval effectiveness and efficiency. We find that direct
+transfer of relevance labels from documents to passages introduces label noise
+that strongly affects retrieval effectiveness for large training datasets. We
+also find that query processing times are adversely affected by fine-grained
+splitting schemes. As a remedy, we propose a careful passage-level labelling
+scheme using weak supervision that delivers improved performance (3-14% in
+terms of nDCG score) over most of the recently proposed models for ad-hoc
+retrieval while maintaining manageable computational complexity on four diverse
+document retrieval datasets.
+
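+ A sketch of the naive label-transfer baseline analyzed above: split a long document into overlapping passages that fit the encoder's token limit and copy the document-level relevance label to every passage, which is exactly where the reported label noise comes from. Word-level tokenization and the window sizes are placeholders.
+
+def split_and_label(doc_tokens, doc_label, passage_len=150, stride=75):
+    passages = []
+    for start in range(0, max(1, len(doc_tokens) - passage_len + 1), stride):
+        passage = doc_tokens[start:start + passage_len]
+        passages.append((passage, doc_label))  # every chunk inherits the document label
+    return passages
+
+if __name__ == "__main__":
+    doc = ("lorem ipsum " * 200).split()
+    print(len(split_and_label(doc, doc_label=1)))  # number of labeled passages
+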
+
+ comment: This paper presents a performance analysis of contextual ranking
+ strategies in ad-hoc document retrieval
+
+
+
+
+
+
+
+
+ + Multimedia 4 + +
+
+
+ + ☆ Painterly Reality: Enhancing Audience Experience with Paintings through + Interactive Art + + +
+ Perceiving paintings entails more than merely engaging the audience's eyes
+and brains; their perceptions and experiences of a painting can be intricately
+connected with body movement. This paper proposes an interactive art approach
+entitled "Painterly Reality" that facilitates the perception of and interaction
+with paintings in a three-dimensional manner. Its objective is to promote
+bodily engagement with the painting (i.e., embedded body embodiment and its
+movement and interaction) to enhance the audience's experience, while
+maintaining its essence. Unlike two-dimensional interactions, this approach
+constructs the Painterly Reality by capturing the audience's body embodiment in
+real-time and embedding it into a three-dimensional painterly world derived from a
+given painting input. Through their body embodiment, the audience can navigate
+the painterly world and play with the magical realism (i.e., interactive
+painterly objects), fostering meaningful experiences via interactions. The
+Painterly Reality is subsequently projected through an Augmented Reality Mirror
+as a live painting and displayed in front of the audience. Hence, the audience
+can gain enhanced experiences through bodily engagement while simultaneously
+viewing and appreciating the live painting. The paper implements the proposed
+approach as an interactive artwork, entitled "Everyday Conjunctive," based on Fong
+Tse Ka's painting, and installs it in a local museum, where it successfully enhances
+the audience's experience through bodily engagement.
+
+
+ comment: 13 pages, 7 figures +
+
+
+
+
+ + ☆ Unveiling the Power of Audio-Visual Early Fusion Transformers with Dense + Interactions through Masked Modeling + + +
+ Humans possess a remarkable ability to integrate auditory and visual +information, enabling a deeper understanding of the surrounding environment. +This early fusion of audio and visual cues, demonstrated through cognitive +psychology and neuroscience research, offers promising potential for developing +multimodal perception models. However, training early fusion architectures +poses significant challenges, as the increased model expressivity requires +robust learning frameworks to harness their enhanced capabilities. In this +paper, we address this challenge by leveraging the masked reconstruction +framework, previously successful in unimodal settings, to train audio-visual +encoders with early fusion. Additionally, we propose an attention-based fusion +module that captures interactions between local audio and visual +representations, enhancing the model's ability to capture fine-grained +interactions. While effective, this procedure can become computationally +intractable, as the number of local representations increases. Thus, to address +the computational complexity, we propose an alternative procedure that +factorizes the local representations before representing audio-visual +interactions. Extensive evaluations on a variety of datasets demonstrate the +superiority of our approach in audio-event classification, visual sound +localization, sound separation, and audio-visual segmentation. These +contributions enable the efficient training of deeply integrated audio-visual +models and significantly advance the usefulness of early fusion architectures. + +
+
+
+
+
+ + ♻ ☆ GraphMFT: A Graph Network based Multimodal Fusion Technique for Emotion + Recognition in Conversation + + +
+ Multimodal machine learning is an emerging area of research, which has
+received a great deal of scholarly attention in recent years. To date, there
+have been few studies on multimodal Emotion Recognition in Conversation (ERC). Since
+Graph Neural Networks (GNNs) possess the powerful capacity of relational
+modeling, they have an inherent advantage in the field of multimodal learning.
+GNNs leverage the graph constructed from multimodal data to perform intra- and
+inter-modal information interaction, which effectively facilitates the
+integration and complementation of multimodal data. In this work, we propose a
+novel Graph network based Multimodal Fusion Technique (GraphMFT) for emotion
+recognition in conversation. Multimodal data can be modeled as a graph, where
+each data object is regarded as a node, and both intra- and inter-modal
+dependencies existing between data objects can be regarded as edges. GraphMFT
+utilizes multiple improved graph attention networks to capture intra-modal
+contextual information and inter-modal complementary information. In addition,
+the proposed GraphMFT attempts to address the challenges of existing
+graph-based multimodal conversational emotion recognition models such as MMGCN.
+Empirical results on two public multimodal datasets reveal that our model
+outperforms state-of-the-art (SOTA) approaches with accuracies of 67.90%
+and 61.30%.
+
+
+ comment: Accepted by Neurocomputing +
+
+
+
+
+ + ♻ ☆ LucidDreamer: Towards High-Fidelity Text-to-3D Generation via Interval + Score Matching + + +
+ The recent advancements in text-to-3D generation mark a significant milestone +in generative models, unlocking new possibilities for creating imaginative 3D +assets across various real-world scenarios. While recent advancements in +text-to-3D generation have shown promise, they often fall short in rendering +detailed and high-quality 3D models. This problem is especially prevalent as +many methods base themselves on Score Distillation Sampling (SDS). This paper +identifies a notable deficiency in SDS, that it brings inconsistent and +low-quality updating direction for the 3D model, causing the over-smoothing +effect. To address this, we propose a novel approach called Interval Score +Matching (ISM). ISM employs deterministic diffusing trajectories and utilizes +interval-based score matching to counteract over-smoothing. Furthermore, we +incorporate 3D Gaussian Splatting into our text-to-3D generation pipeline. +Extensive experiments show that our model largely outperforms the +state-of-the-art in quality and training efficiency. + +
+
+ comment: The first two authors contributed equally to this work. Our code will + be available at: https://github.com/EnVision-Research/LucidDreamer +
+
+
+
+
+
+
+
+
+ +
+
+
+ + Computation and Language 56 + +
+
+
+ + ☆ Making Large Multimodal Models Understand Arbitrary Visual Prompts + + +
+ While existing large vision-language multimodal models focus on whole image +understanding, there is a prominent gap in achieving region-specific +comprehension. Current approaches that use textual coordinates or spatial +encodings often fail to provide a user-friendly interface for visual prompting. +To address this challenge, we introduce a novel multimodal model capable of +decoding arbitrary visual prompts. This allows users to intuitively mark images +and interact with the model using natural cues like a "red bounding box" or +"pointed arrow". Our simple design directly overlays visual markers onto the +RGB image, eliminating the need for complex region encodings, yet achieves +state-of-the-art performance on region-understanding tasks like Visual7W, +PointQA, and Visual Commonsense Reasoning benchmark. Furthermore, we present +ViP-Bench, a comprehensive benchmark to assess the capability of models in +understanding visual prompts across multiple dimensions, enabling future +research in this domain. Code, data, and model are publicly available. + +
+
+ comment: Project page: https://vip-llava.github.io/ +
+
+
+
+
+ + ☆ Context Retrieval via Normalized Contextual Latent Interaction for + Conversational Agent ICDM + + +
+ Conversational agents leveraging AI, particularly deep learning, are emerging +in both academic research and real-world applications. However, these +applications still face challenges, including disrespecting knowledge and +facts, not personalizing to user preferences, and enormous demand for +computational resources during training and inference. Recent research efforts +have been focused on addressing these challenges from various aspects, +including supplementing various types of auxiliary information to the +conversational agents. However, existing methods are still not able to +effectively and efficiently exploit relevant information from these auxiliary +supplements to further unleash the power of the conversational agents and the +language models they use. In this paper, we present a novel method, PK-NCLI, +that is able to accurately and efficiently identify relevant auxiliary +information to improve the quality of conversational responses by learning the +relevance among persona, chat history, and knowledge background through +low-level normalized contextual latent interaction. Our experimental results +indicate that PK-NCLI outperforms the state-of-the-art method, PK-FoCus, by +47.80%/30.61%/24.14% in terms of perplexity, knowledge grounding, and training +efficiency, respectively, and maintained the same level of persona grounding +performance. We also provide a detailed analysis of how different factors, +including language model choices and trade-offs on training weights, would +affect the performance of PK-NCLI. + +
+
+ comment: 2023 IEEE International Conference on Data Mining Workshops (ICDMW) +
+
+
+
+
+ + ☆ Beyond ChatBots: ExploreLLM for Structured Thoughts and Personalized + Model Responses + + +
+ Large language model (LLM) powered chatbots are primarily text-based today, +and impose a large interactional cognitive load, especially for exploratory or +sensemaking tasks such as planning a trip or learning about a new city. Because +the interaction is textual, users have little scaffolding in the way of +structure, informational "scent", or ability to specify high-level preferences +or goals. We introduce ExploreLLM that allows users to structure thoughts, help +explore different options, navigate through the choices and recommendations, +and to more easily steer models to generate more personalized responses. We +conduct a user study and show that users find it helpful to use ExploreLLM for +exploratory or planning tasks, because it provides a useful schema-like +structure to the task, and guides users in planning. The study also suggests +that users can more easily personalize responses with high-level preferences +with ExploreLLM. Together, ExploreLLM points to a future where users interact +with LLMs beyond the form of chatbots, and instead designed to support complex +user tasks with a tighter integration between natural language and graphical +user interfaces. + +
+
+ comment: 19 pages, 11 figures +
+
+
+
+
+ + ☆ Mitigating Over-smoothing in Transformers via Regularized Nonlocal + Functionals + + +
+ Transformers have achieved remarkable success in a wide range of natural +language processing and computer vision applications. However, the +representation capacity of a deep transformer model is degraded due to the +over-smoothing issue in which the token representations become identical when +the model's depth grows. In this work, we show that self-attention layers in +transformers minimize a functional which promotes smoothness, thereby causing +token uniformity. We then propose a novel regularizer that penalizes the norm +of the difference between the smooth output tokens from self-attention and the +input tokens to preserve the fidelity of the tokens. Minimizing the resulting +regularized energy functional, we derive the Neural Transformer with a +Regularized Nonlocal Functional (NeuTRENO), a novel class of transformer models +that can mitigate the over-smoothing issue. We empirically demonstrate the +advantages of NeuTRENO over the baseline transformers and state-of-the-art +methods in reducing the over-smoothing of token representations on various +practical tasks, including object classification, image segmentation, and +language modeling. + +
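+ A sketch of the fidelity penalty described above: alongside the task loss, penalize how far the self-attention output drifts from its input tokens. The actual NeuTRENO layer is derived by minimizing a regularized functional inside attention; the penalty weight and the use of a stock attention module here are assumptions.
+
+import torch
+import torch.nn as nn
+
+attn = nn.MultiheadAttention(embed_dim=32, num_heads=4, batch_first=True)
+x = torch.randn(2, 10, 32, requires_grad=True)          # (batch, tokens, dim)
+out, _ = attn(x, x, x)                                   # self-attention output
+
+task_loss = out.pow(2).mean()                            # placeholder task loss
+fidelity_penalty = (out - x).pow(2).mean()               # keep output tokens close to inputs
+loss = task_loss + 0.1 * fidelity_penalty                # 0.1 is an assumed weight
+loss.backward()
+print(fidelity_penalty.item())
+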
+
+ comment: 24 pages
+
+
+
+
+
+ + ☆ SeaLLMs -- Large Language Models for Southeast Asia + + +
+ Despite the remarkable achievements of large language models (LLMs) in +various tasks, there remains a linguistic bias that favors high-resource +languages, such as English, often at the expense of low-resource and regional +languages. To address this imbalance, we introduce SeaLLMs, an innovative +series of language models that specifically focuses on Southeast Asian (SEA) +languages. SeaLLMs are built upon the Llama-2 model and further advanced +through continued pre-training with an extended vocabulary, specialized +instruction and alignment tuning to better capture the intricacies of regional +languages. This allows them to respect and reflect local cultural norms, +customs, stylistic preferences, and legal considerations. Our comprehensive +evaluation demonstrates that SeaLLM-13b models exhibit superior performance +across a wide spectrum of linguistic tasks and assistant-style +instruction-following capabilities relative to comparable open-source models. +Moreover, they outperform ChatGPT-3.5 in non-Latin languages, such as Thai, +Khmer, Lao, and Burmese, by large margins while remaining lightweight and +cost-effective to operate. + +
+
+ comment: Technical report +
+
+
+
+
+ + ☆ Towards Transparency in Coreference Resolution: A Quantum-Inspired + Approach EMNLP 2023 + + +
+ Guided by grammatical structure, words compose to form sentences, and guided
+by discourse structure, sentences compose to form dialogues and documents. The
+compositional aspect of sentence and discourse units is often overlooked by
+machine learning algorithms. A recent initiative called Quantum Natural
+Language Processing (QNLP) learns word meanings as points in a Hilbert space
+and acts on them via a translation of grammatical structure into Parametrised
+Quantum Circuits (PQCs). Previous work extended the QNLP translation to
+discourse structure using points in a closure of Hilbert spaces. In this paper,
+we evaluate this translation on a Winograd-style pronoun resolution task. We
+train a Variational Quantum Classifier (VQC) for binary classification and
+implement an end-to-end pronoun resolution system. The simulations executed on
+IBMQ software converged with an F1 score of 87.20%. The model outperformed two
+out of three classical coreference resolution systems and approached the
+state-of-the-art SpanBERT. A mixed quantum-classical model further improved these
+results, with an F1 score increase of around 6%.
+
+
+ comment: CRAC 2023, the Sixth Workshop on Computational Models of Reference, + Anaphora and Coreference, EMNLP 2023 +
+
+
+
+
+ + ☆ Contextualized word senses: from attention to compositionality + + +
+ The neural architectures of language models are becoming increasingly +complex, especially that of Transformers, based on the attention mechanism. +Although their application to numerous natural language processing tasks has +proven to be very fruitful, they continue to be models with little or no +interpretability and explainability. One of the tasks for which they are best +suited is the encoding of the contextual sense of words using contextualized +embeddings. In this paper we propose a transparent, interpretable, and +linguistically motivated strategy for encoding the contextual sense of words by +modeling semantic compositionality. Particular attention is given to dependency +relations and semantic notions such as selection preferences and paradigmatic +classes. A partial implementation of the proposed model is carried out and +compared with Transformer-based architectures for a given semantic task, namely +the similarity calculation of word senses in context. The results obtained show +that it is possible to be competitive with linguistically motivated models +instead of using the black boxes underlying complex neural architectures. + +
+
+
+
+
+ + ☆ The Efficiency Spectrum of Large Language Models: An Algorithmic Survey + + +
+ The rapid growth of Large Language Models (LLMs) has been a driving force in
+transforming various domains, reshaping the artificial general intelligence
+landscape. However, the increasing computational and memory demands of these
+models present substantial challenges, hindering both academic research and
+practical applications. To address these issues, a wide array of methods,
+including both algorithmic and hardware solutions, have been developed to
+enhance the efficiency of LLMs. This survey delivers a comprehensive review of
+algorithmic advancements aimed at improving LLM efficiency. Unlike other
+surveys that typically focus on specific areas such as training or model
+compression, this paper examines the multi-faceted dimensions of efficiency
+essential for the end-to-end algorithmic development of LLMs. Specifically, it
+covers various topics related to efficiency, including scaling laws, data
+utilization, architectural innovations, training and tuning strategies, and
+inference techniques. This paper aims to serve as a valuable resource for
+researchers and practitioners, laying the groundwork for future innovations in
+this critical research area. Our repository of relevant references is
+maintained at https://github.com/tding1/Efficient-LLM-Survey.
+
+
+
+
+
+ + ☆ Nonparametric Variational Regularisation of Pretrained Transformers + + +
+ The current paradigm of large-scale pre-training and fine-tuning Transformer
+large language models has led to significant improvements across the board in
+natural language processing. However, such large models are susceptible to
+overfitting to their training data, and as a result the models perform poorly
+when the domain changes. Also, due to the model's scale, the cost of
+fine-tuning the model to the new domain is large. Nonparametric Variational
+Information Bottleneck (NVIB) has been proposed as a regulariser for training
+cross-attention in Transformers, potentially addressing the overfitting
+problem. We extend the NVIB framework to replace all types of attention
+functions in Transformers, and show that existing pretrained Transformers can
+be reinterpreted as Nonparametric Variational (NV) models using a proposed
+identity initialisation. We then show that changing the initialisation
+introduces a novel, information-theoretic post-training regularisation in the
+attention mechanism, which improves out-of-domain generalisation without any
+training. This success supports the hypothesis that pretrained Transformers are
+implicitly NV Bayesian models.
+
+
+
+
+
+ + ☆ The Ethics of Automating Legal Actors + + +
+ The introduction of large public legal datasets has brought about a +renaissance in legal NLP. Many of these datasets are comprised of legal +judgements - the product of judges deciding cases. This fact, together with the +way machine learning works, means that several legal NLP models are models of +judges. While some have argued for the automation of judges, in this position +piece, we argue that automating the role of the judge raises difficult ethical +challenges, in particular for common law legal systems. Our argument follows +from the social role of the judge in actively shaping the law, rather than +merely applying it. Since current NLP models come nowhere close to having the +facilities necessary for this task, they should not be used to automate judges. +Furthermore, even in the case the models could achieve human-level +capabilities, there would still be remaining ethical concerns inherent in the +automation of the legal process. + +
+
+
+
+
+ + ☆ Instruction-tuning Aligns LLMs to the Human Brain + + +
+ Instruction-tuning is a widely adopted method of finetuning that enables +large language models (LLMs) to generate output that more closely resembles +human responses to natural language queries, in many cases leading to +human-level performance on diverse testbeds. However, it remains unclear +whether instruction-tuning truly makes LLMs more similar to how humans process +language. We investigate the effect of instruction-tuning on LLM-human +similarity in two ways: (1) brain alignment, the similarity of LLM internal +representations to neural activity in the human language system, and (2) +behavioral alignment, the similarity of LLM and human behavior on a reading +task. We assess 25 vanilla and instruction-tuned LLMs across three datasets +involving humans reading naturalistic stories and sentences. We discover that +instruction-tuning generally enhances brain alignment by an average of 6%, but +does not have a similar effect on behavioral alignment. To identify the factors +underlying LLM-brain alignment, we compute correlations between the brain +alignment of LLMs and various model properties, such as model size, various +problem-solving abilities, and performance on tasks requiring world knowledge +spanning various domains. Notably, we find a strong positive correlation +between brain alignment and model size (r = 0.95), as well as performance on +tasks requiring world knowledge (r = 0.81). Our results demonstrate that +instruction-tuning LLMs improves both world knowledge representations and brain +alignment, suggesting that mechanisms that encode world knowledge in LLMs also +improve representational alignment to the human brain. + +
+
+
+
+
+ + ☆ Explanatory Argument Extraction of Correct Answers in Resident Medical + Exams + + +
+ Developing the required technology to assist medical experts in their +everyday activities is currently a hot topic in the Artificial Intelligence +research field. Thus, a number of large language models (LLMs) and automated +benchmarks have recently been proposed with the aim of facilitating information +extraction in Evidence-Based Medicine (EBM) using natural language as a tool +for mediating in human-AI interaction. The most representative benchmarks are +limited to either multiple-choice or long-form answers and are available only +in English. In order to address these shortcomings, in this paper we present a +new dataset which, unlike previous work: (i) includes not only explanatory +arguments for the correct answer, but also arguments to reason why the +incorrect answers are not correct; (ii) the explanations are written originally +by medical doctors to answer questions from the Spanish Residency Medical +Exams. Furthermore, this new benchmark allows us to setup a novel extractive +task which consists of identifying the explanation of the correct answer +written by medical doctors. An additional benefit of our setting is that we can +leverage the extractive QA paradigm to automatically evaluate performance of +LLMs without resorting to costly manual evaluation by medical experts. +Comprehensive experimentation with language models for Spanish shows that +sometimes multilingual models fare better than monolingual ones, even +outperforming models which have been adapted to the medical domain. +Furthermore, results across the monolingual models are mixed, with supposedly +smaller and inferior models performing competitively. In any case, the obtained +results show that our novel dataset and approach can be an effective technique +to help medical practitioners in identifying relevant evidence-based +explanations for medical questions. + +
+
+
+
+
+ + ☆ Questioning Biases in Case Judgment Summaries: Legal Datasets or Large + Language Models? + + +
+ The evolution of legal datasets and the advent of large language models
+(LLMs) have significantly transformed the legal field, particularly in the
+generation of case judgment summaries. However, a critical concern arises
+regarding the potential biases embedded within these summaries. This study
+scrutinizes the biases present in case judgment summaries produced by legal
+datasets and large language models. The research aims to analyze the impact of
+biases on legal decision making. By interrogating the accuracy, fairness, and
+implications of biases in these summaries, this study contributes to a better
+understanding of the role of technology in legal contexts and the implications
+for justice systems worldwide. In this study, we investigate biases with
+respect to gender-related keywords, race-related keywords, keywords related to
+crimes against women, country names, and religious keywords. The study shows
+clear evidence of biases in the outputs generated by the large language
+models and pre-trained abstractive summarization models. The reasons behind
+these biases require further study.
+ 

+
+
+
+
+ + ☆ Improving Unsupervised Relation Extraction by Augmenting Diverse + Sentence Pairs EMNLP 2023 + + +
+ Unsupervised relation extraction (URE) aims to extract relations between
+named entities from raw text without requiring manual annotations or
+pre-existing knowledge bases. In recent studies of URE, researchers put a
+notable emphasis on contrastive learning strategies for acquiring relation
+representations. However, these studies often overlook two important aspects:
+the inclusion of diverse positive pairs for contrastive learning and the
+exploration of appropriate loss functions. In this paper, we propose AugURE
+with both within-sentence pairs augmentation and augmentation through
+cross-sentence pairs extraction to increase the diversity of positive pairs and
+strengthen the discriminative power of contrastive learning. We also identify
+the limitation of noise-contrastive estimation (NCE) loss for relation
+representation learning and propose to apply margin loss for sentence pairs.
+Experiments on NYT-FB and TACRED datasets demonstrate that the proposed
+relation representation learning and a simple K-Means clustering achieve
+state-of-the-art performance.
+ 

+
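+ A minimal sketch of the margin-based objective mentioned above, assuming cosine similarities between relation representations of sentence pairs; the margin value and similarity inputs are placeholders, not AugURE's implementation:
+import numpy as np
+
+def margin_loss(sim_pos, sim_negs, margin=0.5):
+    # Hinge-style margin loss: every negative pair should score at least
+    # `margin` below the positive pair; no NCE-style softmax normalisation is used.
+    return float(np.mean(np.maximum(0.0, margin - sim_pos + np.asarray(sim_negs))))
+
+# sim_pos / sim_negs would be similarities between sentence-pair representations.
+print(margin_loss(sim_pos=0.8, sim_negs=[0.1, 0.6, 0.75]))  # only the 0.6 and 0.75 negatives contribute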
+ comment: Accepted by EMNLP 2023 Main Conference +
+
+
+
+
+ + ☆ Trained MT Metrics Learn to Cope with Machine-translated References + + +
+ Neural metrics trained on human evaluations of MT tend to correlate well with +human judgments, but their behavior is not fully understood. In this paper, we +perform a controlled experiment and compare a baseline metric that has not been +trained on human evaluations (Prism) to a trained version of the same metric +(Prism+FT). Surprisingly, we find that Prism+FT becomes more robust to +machine-translated references, which are a notorious problem in MT evaluation. +This suggests that the effects of metric training go beyond the intended effect +of improving overall correlation with human judgments. + +
+
+ comment: WMT 2023 +
+
+
+
+
+ + ☆ SurreyAI 2023 Submission for the Quality Estimation Shared Task + + +
+ Quality Estimation (QE) systems are important in situations where it is +necessary to assess the quality of translations, but there is no reference +available. This paper describes the approach adopted by the SurreyAI team for +addressing the Sentence-Level Direct Assessment shared task in WMT23. The +proposed approach builds upon the TransQuest framework, exploring various +autoencoder pre-trained language models within the MonoTransQuest architecture +using single and ensemble settings. The autoencoder pre-trained language models +employed in the proposed systems are XLMV, InfoXLM-large, and XLMR-large. The +evaluation utilizes Spearman and Pearson correlation coefficients, assessing +the relationship between machine-predicted quality scores and human judgments +for 5 language pairs (English-Gujarati, English-Hindi, English-Marathi, +English-Tamil and English-Telugu). The MonoTQ-InfoXLM-large approach emerges as +a robust strategy, surpassing all other individual models proposed in this +study by significantly improving over the baseline for the majority of the +language pairs. + +
+
+
+
+
+ + ☆ Summarization-based Data Augmentation for Document Classification + + +
+ Despite the prevalence of pretrained language models in natural language
+understanding tasks, understanding lengthy text such as documents is still
+challenging due to the data sparseness problem. Inspired by the fact that humans
+develop their ability to understand lengthy text by first reading shorter texts, we
+propose a simple yet effective summarization-based data augmentation, SUMMaug,
+for document classification. We first obtain easy-to-learn examples for the
+target document classification task by summarizing the input of the original
+training examples, while optionally merging the original labels to conform to
+the summarized input. We then use the generated pseudo examples to perform
+curriculum learning. Experimental results on two datasets confirmed the
+advantage of our method compared to existing baseline methods in terms of
+robustness and accuracy. We release our code and data at
+https://github.com/etsurin/summaug.
+ 

+
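+ A minimal sketch of the summarize-then-curriculum recipe outlined above; `summarize` is a stand-in for any off-the-shelf summarizer, and the two-stage ordering is an illustrative simplification rather than the paper's exact schedule:
+def build_curriculum(train_examples, summarize):
+    # Easy pseudo examples: summarize each input while keeping its label,
+    # then train on the summarized (easy) examples before the originals (hard).
+    easy = [(summarize(text), label) for text, label in train_examples]
+    return easy + list(train_examples)
+
+# Placeholder summarizer: keep only the first sentence.
+first_sentence = lambda text: text.split(". ")[0].rstrip(".") + "."
+
+data = [("The quarterly report shows revenue grew. Costs also rose sharply.", "finance")]
+for text, label in build_curriculum(data, first_sentence):
+    print(label, "|", text)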
+ comment: The 4th New Frontiers in Summarization (with LLMs) Workshop +
+
+
+
+
+ + ☆ Japanese Tort-case Dataset for Rationale-supported Legal Judgment + Prediction + + +
+ This paper presents the first dataset for Japanese Legal Judgment Prediction
+(LJP), the Japanese Tort-case Dataset (JTD), which features two tasks: tort
+prediction and its rationale extraction. The rationale extraction task
+identifies the arguments accepted by the court from among those alleged by
+plaintiffs and defendants, which is a novel task in the field. JTD is constructed
+from 3,477 Japanese Civil Code judgments annotated by 41 legal experts, resulting in
+7,978 instances with 59,697 alleged arguments from the involved
+parties. Our baseline experiments show the feasibility of the proposed two
+tasks, and our error analysis by legal experts identifies sources of errors and
+suggests future directions for LJP research.
+ 

+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ☆ Abstract Syntax Tree for Programming Language Understanding and + Representation: How Far Are We? + + +
+ Programming language understanding and representation (a.k.a code +representation learning) has always been a hot and challenging task in software +engineering. It aims to apply deep learning techniques to produce numerical +representations of the source code features while preserving its semantics. +These representations can be used for facilitating subsequent code-related +tasks. The abstract syntax tree (AST), a fundamental code feature, illustrates +the syntactic information of the source code and has been widely used in code +representation learning. However, there is still a lack of systematic and +quantitative evaluation of how well AST-based code representation facilitates +subsequent code-related tasks. In this paper, we first conduct a comprehensive +empirical study to explore the effectiveness of the AST-based code +representation in facilitating follow-up code-related tasks. To do so, we +compare the performance of models trained with code token sequence (Token for +short) based code representation and AST-based code representation on three +popular types of code-related tasks. Surprisingly, the overall quantitative +statistical results demonstrate that models trained with AST-based code +representation consistently perform worse across all three tasks compared to +models trained with Token-based code representation. Our further quantitative +analysis reveals that models trained with AST-based code representation +outperform models trained with Token-based code representation in certain +subsets of samples across all three tasks. We also conduct comprehensive +experiments to evaluate and reveal the impact of the choice of AST +parsing/preprocessing/encoding methods on AST-based code representation and +subsequent code-related tasks. Our study provides future researchers with +detailed guidance on how to select solutions at each stage to fully exploit +AST. + +
+
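+ To make the comparison concrete, the two code views discussed above can be produced with Python's standard library; this only illustrates the raw inputs (token sequence vs. AST), not the paper's training pipeline:
+import ast
+import io
+import tokenize
+
+source = "def add(a, b):\n    return a + b\n"
+
+# Token-sequence view of the source code
+tokens = [tok.string for tok in tokenize.generate_tokens(io.StringIO(source).readline)
+          if tok.string.strip()]
+print(tokens)  # ['def', 'add', '(', 'a', ',', 'b', ')', ':', 'return', 'a', '+', 'b']
+
+# AST view: node types from a breadth-first walk of the parse tree
+tree = ast.parse(source)
+print([type(node).__name__ for node in ast.walk(tree)])  # e.g. ['Module', 'FunctionDef', 'arguments', ...]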
+ comment: submitted to ACM Transactions on Software Engineering and + Methodology. arXiv admin note: text overlap with arXiv:2103.10668 by other + authors +
+
+
+
+
+ + ☆ CoLLiE: Collaborative Training of Large Language Models in an Efficient + Way EMNLP 2023 + + +
+ Large language models (LLMs) are increasingly pivotal in a wide range of +natural language processing tasks. Access to pre-trained models, courtesy of +the open-source community, has made it possible to adapt these models to +specific applications for enhanced performance. However, the substantial +resources required for training these models necessitate efficient solutions. +This paper introduces CoLLiE, an efficient library that facilitates +collaborative training of large language models using 3D parallelism, +parameter-efficient fine-tuning (PEFT) methods, and optimizers such as Lion, +Adan, Sophia, LOMO and AdaLomo. With its modular design and comprehensive +functionality, CoLLiE offers a balanced blend of efficiency, ease of use, and +customization. CoLLiE has proven superior training efficiency in comparison +with prevalent solutions in pre-training and fine-tuning scenarios. +Furthermore, we provide an empirical evaluation of the correlation between +model size and GPU memory consumption under different optimization methods, as +well as an analysis of the throughput. Lastly, we carry out a comprehensive +comparison of various optimizers and PEFT methods within the instruction-tuning +context. CoLLiE is available at https://github.com/OpenLMLab/collie. + +
+
+ comment: To appear at EMNLP 2023 Demo; Code is available at + https://github.com/OpenLMLab/collie +
+
+
+
+
+ + ☆ Event-driven Real-time Retrieval in Web Search + + +
+ Information retrieval in real-time search presents unique challenges distinct +from those encountered in classical web search. These challenges are +particularly pronounced due to the rapid change of user search intent, which is +influenced by the occurrence and evolution of breaking news events, such as +earthquakes, elections, and wars. Previous dense retrieval methods, which +primarily focused on static semantic representation, lack the capacity to +capture immediate search intent, leading to inferior performance in retrieving +the most recent event-related documents in time-sensitive scenarios. To address +this issue, this paper expands the query with event information that represents +real-time search intent. The Event information is then integrated with the +query through a cross-attention mechanism, resulting in a time-context query +representation. We further enhance the model's capacity for event +representation through multi-task training. Since publicly available datasets +such as MS-MARCO do not contain any event information on the query side and +have few time-sensitive queries, we design an automatic data collection and +annotation pipeline to address this issue, which includes ModelZoo-based Coarse +Annotation and LLM-driven Fine Annotation processes. In addition, we share the +training tricks such as two-stage training and hard negative sampling. Finally, +we conduct a set of offline experiments on a million-scale production dataset +to evaluate our approach and deploy an A/B testing in a real online system to +verify the performance. Extensive experimental results demonstrate that our +proposed approach significantly outperforms existing state-of-the-art baseline +methods. + +
+
+
+
+
+ + ☆ On Exploring the Reasoning Capability of Large Language Models with + Knowledge Graphs SIGIR 2023 + + +
+ This paper examines the capacity of LLMs to reason with knowledge graphs +using their internal knowledge graph, i.e., the knowledge graph they learned +during pre-training. Two research questions are formulated to investigate the +accuracy of LLMs in recalling information from pre-training knowledge graphs +and their ability to infer knowledge graph relations from context. To address +these questions, we employ LLMs to perform four distinct knowledge graph +reasoning tasks. Furthermore, we identify two types of hallucinations that may +occur during knowledge reasoning with LLMs: content and ontology hallucination. +Our experimental results demonstrate that LLMs can successfully tackle both +simple and complex knowledge graph reasoning tasks from their own memory, as +well as infer from input context. + +
+
+ comment: Presented at the Generative-IR Workshop during SIGIR 2023. + https://coda.io/@sigir/gen-ir +
+
+
+
+
+ + ☆ The Case for Scalable, Data-Driven Theory: A Paradigm for Scientific + Progress in NLP EMNLP 2023 + + +
+ I propose a paradigm for scientific progress in NLP centered around +developing scalable, data-driven theories of linguistic structure. The idea is +to collect data in tightly scoped, carefully defined ways which allow for +exhaustive annotation of behavioral phenomena of interest, and then use machine +learning to construct explanatory theories of these phenomena which can form +building blocks for intelligible AI systems. After laying some conceptual +groundwork, I describe several investigations into data-driven theories of +shallow semantic structure using Question-Answer driven Semantic Role Labeling +(QA-SRL), a schema for annotating verbal predicate-argument relations using +highly constrained question-answer pairs. While this only scratches the surface +of the complex language behaviors of interest in AI, I outline principles for +data collection and theoretical modeling which can inform future scientific +progress. This note summarizes and draws heavily on my PhD thesis. + +
+
+ comment: 13 pages, 3 figures, 2 tables. Presented at The Big Picture Workshop + at EMNLP 2023 +
+
+
+
+
+ + ☆ RTQ: Rethinking Video-language Understanding Based on Image-text Model ACM MM 2023 + + +
+ Recent advancements in video-language understanding have been established on +the foundation of image-text models, resulting in promising outcomes due to the +shared knowledge between images and videos. However, video-language +understanding presents unique challenges due to the inclusion of highly complex +semantic details, which result in information redundancy, temporal dependency, +and scene complexity. Current techniques have only partially tackled these +issues, and our quantitative analysis indicates that some of these methods are +complementary. In light of this, we propose a novel framework called RTQ +(Refine, Temporal model, and Query), which addresses these challenges +simultaneously. The approach involves refining redundant information within +frames, modeling temporal relations among frames, and querying task-specific +information from the videos. Remarkably, our model demonstrates outstanding +performance even in the absence of video-language pre-training, and the results +are comparable with or superior to those achieved by state-of-the-art +pre-training methods. + +
+
+ comment: Accepted by ACM MM 2023 as an oral presentation +
+
+
+
+
+ + ☆ Agent-OM: Leveraging Large Language Models for Ontology Matching + + +
+ Ontology matching (OM) enables semantic interoperability between different +ontologies and resolves their conceptual heterogeneity by aligning related +entities. OM systems currently have two prevailing design paradigms: +conventional knowledge-based expert systems and newer machine learning-based +predictive systems. While large language models (LLMs) and LLM-based agents +have become revolutionary in data engineering and have been applied creatively +in various domains, their potential for OM remains underexplored. This study +introduces a novel agent-powered LLM-based design paradigm for OM systems. With +thoughtful consideration of several specific challenges to leverage LLMs for +OM, we propose a generic framework, namely Agent-OM, consisting of two Siamese +agents for retrieval and matching, with a set of simple prompt-based OM tools. +Our framework is implemented in a proof-of-concept system. Evaluations of three +Ontology Alignment Evaluation Initiative (OAEI) tracks over state-of-the-art OM +systems show that our system can achieve very close results to the best +long-standing performance on simple OM tasks and significantly improve the +performance on complex and few-shot OM tasks. + +
+
+ comment: 14 pages, 10 figures, 7 tables +
+
+
+
+
+ + ☆ PsyAttention: Psychological Attention Model for Personality Detection + + +
+ Work on personality detection has tended to incorporate psychological
+features from different personality models, such as BigFive and MBTI. There are
+more than 900 psychological features, each of which is helpful for personality
+detection. However, when used in combination, the application of different
+calculation standards among these features may result in interference between
+features calculated using distinct systems, thereby introducing noise and
+reducing performance. This paper adapts different psychological models in the
+proposed PsyAttention for personality detection, which can effectively encode
+psychological features, reducing their number by 85%. In experiments on the
+BigFive and MBTI models, PsyAttention achieved average accuracies of 65.66% and
+86.30%, respectively, outperforming state-of-the-art methods, indicating that
+it is effective at encoding psychological features.
+ 

+
+
+
+
+ + ☆ SEPSIS: I Can Catch Your Lies -- A New Paradigm for Deception Detection + + +
+ Deception is the intentional practice of twisting information. It is a
+nuanced societal practice deeply intertwined with human societal evolution,
+characterized by a multitude of facets. This research explores the problem of
+deception through the lens of psychology, employing a framework that
+categorizes deception into three forms: lies of omission, lies of commission,
+and lies of influence. The primary focus of this study is specifically on
+investigating only lies of omission. We propose a novel framework for deception
+detection leveraging NLP techniques. We curated an annotated dataset of 876,784
+samples by amalgamating a popular large-scale fake news dataset and scraped
+news headlines from the Twitter handle of Times of India, a well-known Indian
+news media house. Each sample has been labeled with four layers, namely: (i)
+the type of omission (speculation, bias, distortion, sounds factual, and
+opinion), (ii) the colors of lies (black, white, etc.), (iii) the intention of
+such lies (to influence, etc.), and (iv) the topic of lies (political, educational,
+religious, etc.). We present a novel multi-task learning pipeline that leverages
+the dataless merging of fine-tuned language models to address the deception
+detection task mentioned earlier. Our proposed model achieved an F1 score of
+0.87, demonstrating strong performance across all layers including the type,
+color, intent, and topic aspects of deceptive content. Finally, our research
+explores the relationship between lies of omission and propaganda techniques.
+To accomplish this, we conducted an in-depth analysis, uncovering compelling
+findings. For instance, our analysis revealed a significant correlation between
+loaded language and opinion, shedding light on their interconnectedness. To
+encourage further research in this field, we will be making the models and
+dataset available under the MIT License, making it favorable for open-source
+research.
+ 

+
+
+
+
+ + ☆ Text Attribute Control via Closed-Loop Disentanglement ACL 2023 + + +
+ Changing an attribute of a text without changing the content usually requires
+first disentangling the text into irrelevant attributes and content
+representations. After that, in the inference phase, the representation of one
+attribute is tuned to a different value, expecting that the corresponding
+attribute of the text can also be changed accordingly. The usual way of
+disentanglement is to add some constraints on the latent space of an
+encoder-decoder architecture, including adversarial-based constraints and
+mutual-information-based constraints. However, the previous semi-supervised
+processes of attribute change are usually not enough to guarantee the success
+of attribute change and content preservation. In this paper, we propose a novel
+approach to achieve a robust control of attributes while enhancing content
+preservation. In this approach, we use a semi-supervised contrastive learning
+method to encourage the disentanglement of attributes in latent spaces.
+Unlike previous works, we re-disentangle the reconstructed sentence
+and compare the re-disentangled latent space with the original latent space,
+which creates a closed-loop disentanglement process. This also helps content
+preservation. In addition, the contrastive learning method is also able to
+replace the role of minimizing mutual information and adversarial training in
+the disentanglement process, which alleviates the computation cost. We
+conducted experiments on three text datasets, including the Yelp Service review
+dataset, the Amazon Product review dataset, and the GoEmotions dataset. The
+experimental results show the effectiveness of our model.
+ 

+
+ comment: accepted by TACL 2023 +
+
+
+
+
+ + ☆ Mark My Words: Analyzing and Evaluating Language Model Watermarks + + +
+ The capabilities of large language models have grown significantly in recent +years and so too have concerns about their misuse. In this context, the ability +to distinguish machine-generated text from human-authored content becomes +important. Prior works have proposed numerous schemes to watermark text, which +would benefit from a systematic evaluation framework. This work focuses on text +watermarking techniques - as opposed to image watermarks - and proposes a +comprehensive benchmark for them under different tasks as well as practical +attacks. We focus on three main metrics: quality, size (e.g. the number of +tokens needed to detect a watermark), and tamper-resistance. Current +watermarking techniques are good enough to be deployed: Kirchenbauer et al. can +watermark Llama2-7B-chat with no perceivable loss in quality in under 100 +tokens, and with good tamper-resistance to simple attacks, regardless of +temperature. We argue that watermark indistinguishability is too strong a +requirement: schemes that slightly modify logit distributions outperform their +indistinguishable counterparts with no noticeable loss in generation quality. +We publicly release our benchmark. + +
+
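+ For context on the size metric above (the number of tokens needed to detect a watermark), a toy sketch of the green-list z-test used by Kirchenbauer et al.-style schemes; the hash-based vocabulary partition below is a simplification for illustration, not any deployed scheme:
+import hashlib
+import math
+
+def is_green(prev_token, token, gamma=0.5):
+    # Toy stand-in for the seeded vocabulary partition: hash the (previous, current)
+    # token pair so that roughly a fraction gamma of pairs land on the "green" list.
+    digest = hashlib.sha256(f"{prev_token}|{token}".encode()).digest()
+    return digest[0] / 255.0 < gamma
+
+def watermark_z_score(tokens, gamma=0.5):
+    # z-score of the observed green-token count against the no-watermark null hypothesis.
+    hits = sum(is_green(p, t, gamma) for p, t in zip(tokens, tokens[1:]))
+    n = len(tokens) - 1
+    return (hits - gamma * n) / math.sqrt(n * gamma * (1 - gamma))
+
+text = "the model generates a long sequence of tokens for detection purposes today".split()
+print(watermark_z_score(text))  # typically near 0 for unwatermarked text, large for watermarked text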
+ comment: 19 pages, 14 figures +
+
+
+
+
+ + ☆ Omni-SMoLA: Boosting Generalist Multimodal Models with Soft Mixture of + Low-rank Experts + + +
+ Large multi-modal models (LMMs) exhibit remarkable performance across +numerous tasks. However, generalist LMMs often suffer from performance +degradation when tuned over a large collection of tasks. Recent research +suggests that Mixture of Experts (MoE) architectures are useful for instruction +tuning, but for LMMs of parameter size around O(50-100B), the prohibitive cost +of replicating and storing the expert models severely limits the number of +experts we can use. We propose Omni-SMoLA, an architecture that uses the Soft +MoE approach to (softly) mix many multimodal low rank experts, and avoids +introducing a significant number of new parameters compared to conventional MoE +models. The core intuition here is that the large model provides a foundational +backbone, while different lightweight experts residually learn specialized +knowledge, either per-modality or multimodally. Extensive experiments +demonstrate that the SMoLA approach helps improve the generalist performance +across a broad range of generative vision-and-language tasks, achieving new +SoTA generalist performance that often matches or outperforms single +specialized LMM baselines, as well as new SoTA specialist performance. + +
+
+
+
+
+ + ☆ The Cost of Compression: Investigating the Impact of Compression on + Parametric Knowledge in Language Models EMNLP 2023 + + +
+ Compressing large language models (LLMs), often consisting of billions of
+parameters, provides faster inference, smaller memory footprints, and enables
+local deployment. Two standard compression techniques are pruning and
+quantization, with the former eliminating redundant connections in model layers
+and the latter representing model parameters with fewer bits. The key tradeoff
+is between the degree of compression and the impact on the quality of the
+compressed model. Existing research on LLM compression primarily focuses on
+performance in terms of general metrics like perplexity or downstream task
+accuracy. More fine-grained metrics, such as those measuring parametric
+knowledge, remain significantly underexplored. To help bridge this gap, we
+present a comprehensive analysis across multiple model families (ENCODER,
+ENCODER-DECODER, and DECODER) using the LAMA and LM-HARNESS benchmarks in order
+to systematically quantify the effect of commonly employed compression
+techniques on model performance. A particular focus is on tradeoffs involving
+parametric knowledge, with the goal of providing practitioners with practical
+insights to help make informed decisions on compression. We release our
+codebase to enable further research.
+ 

+
+ comment: Accepted to EMNLP 2023 Findings +
+
+
+
+
+ + ☆ Hyperparameter Optimization for Large Language Model Instruction-Tuning + + +
+ The fine-tuning of Large Language Models (LLMs) has enabled them to recently
+achieve milestones in natural language processing applications. The emergence
+of ever larger LLMs has paved the way for more efficient fine-tuning methods.
+Among these, the Low-Rank Adaptation (LoRA) method keeps most of the weights of
+the pre-trained LLM frozen while introducing a low-rank decomposition of the
+weight matrix, enabling the tuning of only a very small proportion of the
+network. The performance on downstream tasks of models fine-tuned with LoRA
+heavily relies on a set of hyperparameters including the rank of the
+decomposition. In this work, we investigate the choice of these hyperparameters
+through two main blackbox optimization (BBO) techniques. We examine the whole
+pipeline of performing fine-tuning and validation on a pre-trained LLM as a
+blackbox and efficiently explore the space of hyperparameters with the NOMAD
+algorithm, achieving a boost in performance and human alignment of the tuned
+model.
+ 

+
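+ As background for the hyperparameters being searched, a minimal sketch of the LoRA reparameterisation, where the rank r and scaling alpha are exactly the kind of knobs a blackbox optimizer would tune; shapes and initialisation below are illustrative assumptions:
+import numpy as np
+
+def lora_forward(x, W, A, B, alpha, r):
+    # Frozen pretrained weight W plus a trainable low-rank update (alpha / r) * B @ A.
+    return x @ W.T + (alpha / r) * (x @ A.T @ B.T)
+
+rng = np.random.default_rng(0)
+d_in, d_out, r, alpha = 16, 8, 4, 8.0
+x = rng.normal(size=(2, d_in))           # a small batch of activations
+W = rng.normal(size=(d_out, d_in))       # frozen pretrained weight
+A = rng.normal(size=(r, d_in)) * 0.01    # trainable low-rank factor
+B = np.zeros((d_out, r))                 # zero-initialised so training starts from the pretrained behaviour
+print(lora_forward(x, W, A, B, alpha, r).shape)  # (2, 8)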
+
+
+
+ + ☆ Quick Back-Translation for Unsupervised Machine Translation EMNLP 2023 + + +
+ The field of unsupervised machine translation has seen significant
+advancement from the marriage of the Transformer and the back-translation
+algorithm. The Transformer is a powerful generative model, and back-translation
+leverages the Transformer's high-quality translations for iterative
+self-improvement. However, the Transformer is encumbered by the run-time of
+autoregressive inference during back-translation, and back-translation is
+limited by a lack of synthetic data efficiency. We propose a two-for-one
+improvement to Transformer back-translation: Quick Back-Translation (QBT). QBT
+re-purposes the encoder as a generative model, and uses encoder-generated
+sequences to train the decoder in conjunction with the original autoregressive
+back-translation step, improving data throughput and utilization. Experiments
+on various WMT benchmarks demonstrate that a relatively small number of
+refining steps of QBT improve current unsupervised machine translation models,
+and that QBT dramatically outperforms the standard back-translation-only method
+in terms of training efficiency for comparable translation qualities.
+ 

+
+ comment: Published in EMNLP 2023 Findings. Source code can be found at the + following link: https://github.com/bbrimacombe/Quick-Back-Translation +
+
+
+
+
+ + ☆ Hi-ArG: Exploring the Integration of Hierarchical Argumentation Graphs + in Language Pretraining EMNLP 2023 + + +
+ The knowledge graph is a structure to store and represent knowledge, and +recent studies have discussed its capability to assist language models for +various applications. Some variations of knowledge graphs aim to record +arguments and their relations for computational argumentation tasks. However, +many must simplify semantic types to fit specific schemas, thus losing +flexibility and expression ability. In this paper, we propose the Hierarchical +Argumentation Graph (Hi-ArG), a new structure to organize arguments. We also +introduce two approaches to exploit Hi-ArG, including a text-graph multi-modal +model GreaseArG and a new pre-training framework augmented with graph +information. Experiments on two argumentation tasks have shown that after +further pre-training and fine-tuning, GreaseArG supersedes same-scale language +models on these tasks, while incorporating graph information during further +pre-training can also improve the performance of vanilla language models. Code +for this paper is available at https://github.com/ljcleo/Hi-ArG . + +
+
+ comment: to be published in EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ Persona-Coded Poly-Encoder: Persona-Guided Multi-Stream Conversational + Sentence Scoring ICTAI + + +
+ Recent advances in machine learning and deep learning have led to the +widespread use of Conversational AI in many practical applications. However, it +is still very challenging to leverage auxiliary information that can provide +conversational context or personalized tuning to improve the quality of +conversations. For example, there has only been limited research on using an +individuals persona information to improve conversation quality, and even +state-of-the-art conversational AI techniques are unable to effectively +leverage signals from heterogeneous sources of auxiliary data, such as +multi-modal interaction data, demographics, SDOH data, etc. In this paper, we +present a novel Persona-Coded Poly-Encoder method that leverages persona +information in a multi-stream encoding scheme to improve the quality of +response generation for conversations. To show the efficacy of the proposed +method, we evaluate our method on two different persona-based conversational +datasets, and compared against two state-of-the-art methods. Our experimental +results and analysis demonstrate that our method can improve conversation +quality over the baseline method Poly-Encoder by 3.32% and 2.94% in terms of +BLEU score and HR@1, respectively. More significantly, our method offers a path +to better utilization of multi-modal data in conversational tasks. Lastly, our +study outlines several challenges and future research directions for advancing +personalized conversational AI technology. + +
+
+ comment: The 35th IEEE International Conference on Tools with Artificial + Intelligence (ICTAI) +
+
+
+
+
+ + ♻ ☆ MLLMs-Augmented Visual-Language Representation Learning + + +
+ Visual-language pre-training (VLP) has achieved remarkable success in +multi-modal tasks, largely attributed to the availability of large-scale +image-text datasets. In this work, we demonstrate that multi-modal large +language models (MLLMs) can enhance visual-language representation learning by +improving data quality. Our approach is simple, utilizing MLLMs to extend +multiple captions for each image. To prevent the bias introduced by MLLMs' +hallucinations and intrinsic caption styles, we propose "text shearing" to +maintain the same length for extended captions as that of the original +captions. In image-text retrieval, our method consistently obtains 5.6 ~ 35.0% +and 16.8 ~ 46.1% improvement on R@1 under the fine-tuning and zero-shot +settings, respectively. Notably, we obtain zero-shot results that are +comparable to fine-tuning on target datasets, which encourages more exploration +of the versatile use of MLLMs. + +
+
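+ A minimal sketch of the "text shearing" step described above, assuming a simple whitespace tokenizer; the real method would operate on the captioner's own tokenisation:
+def text_shear(extended_caption, original_caption):
+    # Keep only as many (whitespace) tokens of the MLLM-extended caption as the
+    # original caption has, so caption length distributions stay matched.
+    limit = len(original_caption.split())
+    return " ".join(extended_caption.split()[:limit])
+
+original = "a dog running on the beach"
+extended = "a brown dog joyfully running across a sunny sandy beach at sunset"
+print(text_shear(extended, original))  # "a brown dog joyfully running across"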
+
+
+
+ + ♻ ☆ Dissecting the Runtime Performance of the Training, Fine-tuning, and + Inference of Large Language Models + + +
+ Large Language Models (LLMs) have seen great advances in both academia and
+industry, and their popularity has resulted in numerous open-source frameworks and
+techniques for accelerating LLM pre-training, fine-tuning, and inference.
+Training and deploying LLMs are expensive, as they require considerable computing
+resources and memory, hence many efficient approaches have been developed for
+improving system pipelines as well as operators. However, the runtime
+performance can vary significantly across hardware and software stacks, which
+makes it difficult to choose the best configuration. In this work, we aim to
+benchmark the performance from both macro and micro perspectives. First, we
+benchmark the end-to-end performance of pre-training, fine-tuning, and serving
+LLMs of different sizes, i.e., 7, 13, and 70 billion parameters (7B, 13B, and
+70B), on three 8-GPU platforms with and without individual optimization
+techniques, including ZeRO, quantization, recomputation, and FlashAttention. Then,
+we dive deeper to provide a detailed runtime analysis of the sub-modules,
+including computing and communication operators in LLMs. For end users, our
+benchmark and findings help better understand different optimization
+techniques, training and inference frameworks, together with hardware platforms
+in choosing configurations for deploying LLMs. For researchers, our in-depth
+module-wise analyses discover potential opportunities for future work to
+further optimize the runtime performance of LLMs.
+ 

+
+
+
+
+ + ♻ ☆ An Adversarial Non-Autoregressive Model for Text Generation with + Incomplete Information + + +
+ Non-autoregressive models have been widely studied in the Complete +Information Scenario (CIS), in which the input has complete information of +corresponding output. However, their explorations in the Incomplete Information +Scenario (IIS) are extremely limited. Our analyses reveal that the IIS's +incomplete input information will augment the inherent limitations of existing +non-autoregressive models trained under Maximum Likelihood Estimation. In this +paper, we propose for the IIS an Adversarial Non-autoregressive Transformer +(ANT) which has two features: 1) Position-Aware Self-Modulation to provide more +reasonable hidden representations, and 2) Dependency Feed Forward Network to +strengthen its capacity in dependency modeling. We compare ANT with other +mainstream models in the IIS and demonstrate that ANT can achieve comparable +performance with much fewer decoding iterations. Furthermore, we show its great +potential in various applications like latent interpolation and semi-supervised +learning. + +
+
+
+
+
+ + ♻ ☆ RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment + + +
+ Generative foundation models are susceptible to implicit biases that can +arise from extensive unsupervised training data. Such biases can produce +suboptimal samples, skewed outcomes, and unfairness, with potentially serious +consequences. Consequently, aligning these models with human ethics and +preferences is an essential step toward ensuring their responsible and +effective deployment in real-world applications. Prior research has primarily +employed Reinforcement Learning from Human Feedback (RLHF) to address this +problem, where generative models are fine-tuned with RL algorithms guided by a +human-feedback-informed reward model. However, the inefficiencies and +instabilities associated with RL algorithms frequently present substantial +obstacles to the successful alignment, necessitating the development of a more +robust and streamlined approach. To this end, we introduce a new framework, +Reward rAnked FineTuning (RAFT), designed to align generative models +effectively. Utilizing a reward model and a sufficient number of samples, our +approach selects the high-quality samples, discarding those that exhibit +undesired behavior, and subsequently enhancing the model by fine-tuning on +these filtered samples. Our studies show that RAFT can effectively improve the +model performance in both reward learning and other automated metrics in both +large language models and diffusion models. + +
+
+ comment: 29 pages, 12 figures, Published in Transactions on Machine Learning + Research (TMLR) +
+
+
+
+
+ + ♻ ☆ Does Conceptual Representation Require Embodiment? Insights From Large + Language Models + + +
+ To what extent can language alone give rise to complex concepts, or is +embodied experience essential? Recent advancements in large language models +(LLMs) offer fresh perspectives on this question. Although LLMs are trained on +restricted modalities, they exhibit human-like performance in diverse +psychological tasks. Our study compared representations of 4,442 lexical +concepts between humans and ChatGPTs (GPT-3.5 and GPT-4) across multiple +dimensions, including five key domains: emotion, salience, mental +visualization, sensory, and motor experience. We identify two main findings: 1) +Both models strongly align with human representations in non-sensorimotor +domains but lag in sensory and motor areas, with GPT-4 outperforming GPT-3.5; +2) GPT-4's gains are associated with its additional visual learning, which also +appears to benefit related dimensions like haptics and imageability. These +results highlight the limitations of language in isolation, and that the +integration of diverse modalities of inputs leads to a more human-like +conceptual representation. + +
+
+
+
+
+ + ♻ ☆ H_eval: A new hybrid evaluation metric for automatic speech recognition + tasks + + +
+ Many studies have examined the shortcomings of word error rate (WER) as an
+evaluation metric for automatic speech recognition (ASR) systems. Since WER
+considers only literal word-level correctness, new evaluation metrics based on
+semantic similarity such as semantic distance (SD) and BERTScore have been
+developed. However, we found that these metrics have their own limitations,
+such as a tendency to overly prioritise keywords. We propose H_eval, a new
+hybrid evaluation metric for ASR systems that considers both semantic
+correctness and error rate and performs well in scenarios where
+WER and SD perform poorly. Due to its lighter computation compared to BERTScore, it
+offers a 49-fold reduction in metric computation time. Furthermore, we show that
+H_eval correlates strongly with downstream NLP tasks. Also, to reduce the
+metric calculation time, we built multiple fast and lightweight models using
+distillation techniques.
+ 

+
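+ H_eval's exact formula is not given in the abstract; the sketch below only illustrates one way a hybrid score could blend literal correctness (1 - WER) with a semantic score, with `semantic_sim` standing in for an SD- or BERTScore-style component and `alpha` an assumed mixing weight:
+def wer(ref, hyp):
+    # Word error rate via edit distance (substitutions, insertions, deletions).
+    r, h = ref.split(), hyp.split()
+    d = [[0] * (len(h) + 1) for _ in range(len(r) + 1)]
+    for i in range(len(r) + 1):
+        d[i][0] = i
+    for j in range(len(h) + 1):
+        d[0][j] = j
+    for i in range(1, len(r) + 1):
+        for j in range(1, len(h) + 1):
+            cost = 0 if r[i - 1] == h[j - 1] else 1
+            d[i][j] = min(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost)
+    return d[-1][-1] / max(1, len(r))
+
+def hybrid_score(ref, hyp, semantic_sim, alpha=0.5):
+    # Blend literal correctness with a semantic similarity score (both in [0, 1]).
+    return alpha * max(0.0, 1.0 - wer(ref, hyp)) + (1.0 - alpha) * semantic_sim
+
+print(hybrid_score("turn off the lights", "turn of the light", semantic_sim=0.92))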
+ comment: Accepted in ASRU 2023 +
+
+
+
+
+ + ♻ ☆ Pointwise Mutual Information Based Metric and Decoding Strategy for + Faithful Generation in Document Grounded Dialogs EMNLP 2023 + + +
+ A major concern in using deep learning based generative models for +document-grounded dialogs is the potential generation of responses that are not +\textit{faithful} to the underlying document. Existing automated metrics used +for evaluating the faithfulness of response with respect to the grounding +document measure the degree of similarity between the generated response and +the document's content. However, these automated metrics are far from being +well aligned with human judgments. Therefore, to improve the measurement of +faithfulness, we propose a new metric that utilizes (Conditional) Point-wise +Mutual Information (PMI) between the generated response and the source +document, conditioned on the dialogue. PMI quantifies the extent to which the +document influences the generated response -- with a higher PMI indicating a +more faithful response. We build upon this idea to create a new decoding +technique that incorporates PMI into the response generation process to predict +more faithful responses. Our experiments on the BEGIN benchmark demonstrate an +improved correlation of our metric with human evaluation. We also show that our +decoding technique is effective in generating more faithful responses when +compared to standard decoding techniques on a set of publicly available +document-grounded dialog datasets. + +
+
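+ A minimal sketch of the conditional PMI idea above: score the response with and without the grounding document in the context and take the difference. `lm_logprob` is a hypothetical stand-in for a language-model scoring call; the toy scorer below only makes the example runnable:
+import math
+
+def conditional_pmi(lm_logprob, response, document, dialogue):
+    # PMI(response; document | dialogue) =
+    #   log P(response | dialogue, document) - log P(response | dialogue)
+    # Higher values suggest the document influenced the response more (more faithful).
+    grounded = lm_logprob(response, context=dialogue + "\n" + document)
+    ungrounded = lm_logprob(response, context=dialogue)
+    return grounded - ungrounded
+
+def toy_logprob(text, context):
+    # Toy scorer that just rewards word overlap with the context.
+    overlap = len(set(text.lower().split()) & set(context.lower().split()))
+    return math.log(1 + overlap)
+
+document = "The museum opens at 9 am on weekdays."
+dialogue = "User: When does the museum open?"
+print(conditional_pmi(toy_logprob, "It opens at 9 am on weekdays.", document, dialogue))
+print(conditional_pmi(toy_logprob, "I love pizza.", document, dialogue))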
+ comment: EMNLP 2023 +
+
+
+
+
+ + ♻ ☆ PointLLM: Empowering Large Language Models to Understand Point Clouds + + +
+ The unprecedented advancements in Large Language Models (LLMs) have shown a +profound impact on natural language processing but are yet to fully embrace the +realm of 3D understanding. This paper introduces PointLLM, a preliminary effort +to fill this gap, enabling LLMs to understand point clouds and offering a new +avenue beyond 2D visual data. PointLLM understands colored object point clouds +with human instructions and generates contextually appropriate responses, +illustrating its grasp of point clouds and common sense. Specifically, it +leverages a point cloud encoder with a powerful LLM to effectively fuse +geometric, appearance, and linguistic information. We collect a novel dataset +comprising 660K simple and 70K complex point-text instruction pairs to enable a +two-stage training strategy: aligning latent spaces and subsequently +instruction-tuning the unified model. To rigorously evaluate the perceptual and +generalization capabilities of PointLLM, we establish two benchmarks: +Generative 3D Object Classification and 3D Object Captioning, assessed through +three different methods, including human evaluation, GPT-4/ChatGPT evaluation, +and traditional metrics. Experimental results reveal PointLLM's superior +performance over existing 2D and 3D baselines, with a notable achievement in +human-evaluated object captioning tasks where it surpasses human annotators in +over 50% of the samples. Codes, datasets, and benchmarks are available at +https://github.com/OpenRobotLab/PointLLM . + +
+
+ comment: 28 pages. Empowering large language models with 3D point cloud + understanding, accompanied by a novel dataset and carefully designed + benchmarks. Project page: https://runsenxu.com/projects/PointLLM +
+
+
+
+
+ + ♻ ☆ QuantEase: Optimization-based Quantization for Language Models + + +
+ With the rising popularity of Large Language Models (LLMs), there has been an +increasing interest in compression techniques that enable their efficient +deployment. This study focuses on the Post-Training Quantization (PTQ) of LLMs. +Drawing from recent advances, our work introduces QuantEase, a layer-wise +quantization framework where individual layers undergo separate quantization. +The problem is framed as a discrete-structured non-convex optimization, +prompting the development of algorithms rooted in Coordinate Descent (CD) +techniques. These CD-based methods provide high-quality solutions to the +complex non-convex layer-wise quantization problems. Notably, our CD-based +approach features straightforward updates, relying solely on matrix and vector +operations, circumventing the need for matrix inversion or decomposition. We +also explore an outlier-aware variant of our approach, allowing for retaining +significant weights (outliers) with complete precision. Our proposal attains +state-of-the-art performance in terms of perplexity and zero-shot accuracy in +empirical evaluations across various LLMs and datasets, with relative +improvements up to 15% over methods such as GPTQ. Leveraging careful linear +algebra optimizations, QuantEase can quantize models like Falcon-180B on a +single NVIDIA A100 GPU in $\sim$3 hours. Particularly noteworthy is our +outlier-aware algorithm's capability to achieve near or sub-3-bit quantization +of LLMs with an acceptable drop in accuracy, obviating the need for non-uniform +quantization or grouping techniques, improving upon methods such as SpQR by up +to two times in terms of perplexity. + +
+
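+ For context, a round-to-nearest (RTN) baseline for layer-wise weight quantization; QuantEase itself replaces this simple rounding with coordinate-descent updates that minimise the layer output error, which is not shown here:
+import numpy as np
+
+def quantize_rtn(W, bits=4):
+    # Per-row uniform quantization: map each weight to the nearest of 2**bits levels.
+    levels = 2 ** bits - 1
+    w_min = W.min(axis=1, keepdims=True)
+    scale = (W.max(axis=1, keepdims=True) - w_min) / levels
+    q = np.round((W - w_min) / scale)
+    return q * scale + w_min  # dequantized weights
+
+rng = np.random.default_rng(0)
+W = rng.normal(size=(8, 16))
+W_hat = quantize_rtn(W, bits=3)
+print(float(np.abs(W - W_hat).max()))  # worst-case rounding error for this layer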
+
+
+
+ + ♻ ☆ Token-Level Adaptation of LoRA Adapters for Downstream Task + Generalization + + +
+ This paper introduces a method for adapting LoRA adapters in smaller-sized +language models to arbitrary downstream tasks. Unlike standard +mixture-of-expert architectures, our method employs a gradient-free routing +function to choose a weighted combination of experts without increasing the +compute requirements for training or inference. The results show that +token-level adaptation of LoRA adapters outperforms the base Llama-2-7b model +across mathematical (GSM8K), scientific (ARC-Challenge), reading comprehension +(SQuAD), and coding (CodeAlpaca-20k) tasks. Further evaluations also show that +the average performance of token-level adaptation outperforms individual models +fine-tuned for each of the tasks with the best performance observed in +adaptation of every-other token during inference. The code for this study is +made available through a public repository. + +
+
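+ A minimal sketch of gradient-free, per-token routing over frozen LoRA experts, roughly in the spirit of the abstract; the similarity-based router, expert keys, and shapes are assumptions rather than the paper's exact design:
+import numpy as np
+
+def route_and_mix(h, expert_keys, expert_updates, temperature=1.0):
+    # h: (d,) hidden state of one token; expert_keys: (E, d); expert_updates: E callables.
+    scores = expert_keys @ h / temperature
+    weights = np.exp(scores - scores.max())
+    weights /= weights.sum()                  # softmax routing, no gradients involved
+    update = sum(w * f(h) for w, f in zip(weights, expert_updates))
+    return h + update
+
+d, num_experts, rank = 16, 3, 4
+rng = np.random.default_rng(0)
+keys = rng.normal(size=(num_experts, d))
+experts = []
+for _ in range(num_experts):
+    A, B = rng.normal(size=(rank, d)) * 0.1, rng.normal(size=(d, rank)) * 0.1
+    experts.append(lambda v, A=A, B=B: B @ (A @ v))   # one frozen LoRA update per expert
+print(route_and_mix(rng.normal(size=d), keys, experts).shape)  # (16,)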
+
+
+
+ + ♻ ☆ Llemma: An Open Language Model For Mathematics + + +
+ We present Llemma, a large language model for mathematics. We continue +pretraining Code Llama on the Proof-Pile-2, a mixture of scientific papers, web +data containing mathematics, and mathematical code, yielding Llemma. On the +MATH benchmark Llemma outperforms all known open base models, as well as the +unreleased Minerva model suite on an equi-parameter basis. Moreover, Llemma is +capable of tool use and formal theorem proving without any further finetuning. +We openly release all artifacts, including 7 billion and 34 billion parameter +models, the Proof-Pile-2, and code to replicate our experiments. + +
+
+ comment: Updated references; corrected description of COPRA search budget +
+
+
+
+
+ + ♻ ☆ Retrieving Multimodal Information for Augmented Generation: A Survey + + +
+ As Large Language Models (LLMs) become popular, an important trend has emerged
+of using multimodality to augment the LLMs' generation ability, which
+enables LLMs to better interact with the world. However, there is no unified
+understanding of at which stage and how different modalities should be incorporated. In
+this survey, we review methods that assist and augment generative models by
+retrieving multimodal knowledge, whose formats range from images, codes,
+tables, graphs, to audio. Such methods offer a promising solution to important
+concerns such as factuality, reasoning, interpretability, and robustness. By
+providing an in-depth review, this survey is expected to provide scholars with
+a deeper understanding of the methods' applications and encourage them to adapt
+existing techniques to the fast-growing field of LLMs.
+ 

+
+
+
+
+ + ♻ ☆ RLAIF: Scaling Reinforcement Learning from Human Feedback with AI + Feedback + + +
+ Reinforcement learning from human feedback (RLHF) has proven effective in +aligning large language models (LLMs) with human preferences. However, +gathering high-quality human preference labels can be a time-consuming and +expensive endeavor. RL from AI Feedback (RLAIF), introduced by Bai et al., +offers a promising alternative that leverages a powerful off-the-shelf LLM to +generate preferences in lieu of human annotators. Across the tasks of +summarization, helpful dialogue generation, and harmless dialogue generation, +RLAIF achieves comparable or superior performance to RLHF, as rated by human +evaluators. Furthermore, RLAIF demonstrates the ability to outperform a +supervised fine-tuned baseline even when the LLM preference labeler is the same +size as the policy. In another experiment, directly prompting the LLM for +reward scores achieves superior performance to the canonical RLAIF setup, where +LLM preference labels are first distilled into a reward model. Finally, we +conduct extensive studies on techniques for generating aligned AI preferences. +Our results suggest that RLAIF can achieve human-level performance, offering a +potential solution to the scalability limitations of RLHF. + +
+
+ comment: Added two more tasks and many more experiments and analyses (e.g. + same-size RLAIF, direct RLAIF, cost analysis) +
+
+
+
+
+ + ♻ ☆ Large Language Models of Code Fail at Completing Code with Potential + Bugs NeurIPS 2023 + + +
+ Large language models of code (Code-LLMs) have recently brought tremendous +advances to code completion, a fundamental feature of programming assistance +and code intelligence. However, most existing works ignore the possible +presence of bugs in the code context for generation, which are inevitable in +software development. Therefore, we introduce and study the buggy-code +completion problem, inspired by the realistic scenario of real-time code +suggestion where the code context contains potential bugs -- anti-patterns that +can become bugs in the completed program. To systematically study the task, we +introduce two datasets: one with synthetic bugs derived from semantics-altering +operator changes (buggy-HumanEval) and one with realistic bugs derived from +user submissions to coding problems (buggy-FixEval). We find that the presence +of potential bugs significantly degrades the generation performance of the +high-performing Code-LLMs. For instance, the passing rates of CODEGEN-2B-MONO +on test cases of buggy-HumanEval drop more than 50% given a single potential +bug in the context. Finally, we investigate several post-hoc methods for +mitigating the adverse effect of potential bugs and find that there remains a +significant gap in post-mitigation performance. + +
+
+ comment: 27 pages, accepted to NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ JMedLoRA:Medical Domain Adaptation on Japanese Large Language Models + using Instruction-tuning + + +
+ In the ongoing wave of impact driven by large language models (LLMs) like +ChatGPT, the adaptation of LLMs to medical domain has emerged as a crucial +research frontier. Since mainstream LLMs tend to be designed for +general-purpose applications, constructing a medical LLM through domain +adaptation is a huge challenge. While instruction-tuning is used to fine-tune +some LLMs, its precise roles in domain adaptation remain unknown. Here we show +the contribution of LoRA-based instruction-tuning to performance in Japanese +medical question-answering tasks. In doing so, we employ a multifaceted +evaluation for multiple-choice questions, including scoring based on "Exact +match" and "Gestalt distance" in addition to the conventional accuracy. Our +findings suggest that LoRA-based instruction-tuning can partially incorporate +domain-specific knowledge into LLMs, with larger models demonstrating more +pronounced effects. Furthermore, our results underscore the potential of +adapting English-centric models for Japanese applications in domain adaptation, +while also highlighting the persisting limitations of Japanese-centric models. +This initiative represents a pioneering effort in enabling medical institutions +to fine-tune and operate models without relying on external services. + +
+
+ comment: 8 pages, 1 figures +
+
+
+
+
+ + ♻ ☆ A Question Answering Framework for Decontextualizing User-facing + Snippets from Scientific Documents EMNLP2023 + + +
+ Many real-world applications (e.g., note taking, search) require extracting a +sentence or paragraph from a document and showing that snippet to a human +outside of the source document. Yet, users may find snippets difficult to +understand as they lack context from the original document. In this work, we +use language models to rewrite snippets from scientific documents to be read on +their own. First, we define the requirements and challenges for this +user-facing decontextualization task, such as clarifying where edits occur and +handling references to other documents. Second, we propose a framework that +decomposes the task into three stages: question generation, question answering, +and rewriting. Using this framework, we collect gold decontextualizations from +experienced scientific article readers. We then conduct a range of experiments +across state-of-the-art commercial and open-source language models to identify +how to best provide missing-but-relevant information to models for our task. +Finally, we develop QaDecontext, a simple prompting strategy inspired by our +framework that improves over end-to-end prompting. We conclude with analysis +that finds, while rewriting is easy, question generation and answering remain +challenging for today's models. + +
+
+ comment: 19 pages, 2 figures, 8 tables, EMNLP2023 +
+
+
+
+
+ + ♻ ☆ Stealing the Decoding Algorithms of Language Models + + +
+ A key component of generating text from modern language models (LM) is the +selection and tuning of decoding algorithms. These algorithms determine how to +generate text from the internal probability distribution generated by the LM. +The process of choosing a decoding algorithm and tuning its hyperparameters +takes significant time, manual effort, and computation, and it also requires +extensive human evaluation. Therefore, the identity and hyperparameters of such +decoding algorithms are considered to be extremely valuable to their owners. In +this work, we show, for the first time, that an adversary with typical API +access to an LM can steal the type and hyperparameters of its decoding +algorithms at very low monetary costs. Our attack is effective against popular +LMs used in text generation APIs, including GPT-2, GPT-3 and GPT-Neo. We +demonstrate the feasibility of stealing such information with only a few +dollars, e.g., $\$0.8$, $\$1$, $\$4$, and $\$40$ for the four versions of +GPT-3. + +
+
+
+
+
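The abstract above describes recovering the type and hyperparameters of a decoding algorithm purely from API samples. A toy sketch of that idea, estimating a top-k cutoff by counting the distinct tokens returned by a black-box sampler, is shown below; the API is entirely simulated and the paper's actual attack is considerably more involved.

# Toy sketch: estimate a top-k truncation parameter from repeated samples of a
# black-box next-token API (simulated here; not the paper's attack code).
import random
from collections import Counter

VOCAB = list(range(1000))
TRUE_TOP_K = 40  # the "provider's" secret decoding hyperparameter

def simulated_api_sample() -> int:
    """Stand-in for one API call returning a sampled next token for a fixed prefix."""
    return random.choice(VOCAB[:TRUE_TOP_K])

def estimate_top_k(num_calls: int = 5000) -> int:
    """With enough calls, the number of distinct tokens observed approaches k.
    A real attack must also handle temperature and nucleus (top-p) sampling."""
    seen = Counter(simulated_api_sample() for _ in range(num_calls))
    return len(seen)

print("estimated top-k:", estimate_top_k())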
+ + ♻ ☆ General-Purpose vs. Domain-Adapted Large Language Models for Extraction + of Data from Thoracic Radiology Reports + + +
+ Radiologists produce unstructured data that could be valuable for clinical +care when consumed by information systems. However, variability in style limits +usage. This study compares the performance of a system using a domain-adapted +language model (RadLing) with that of a general-purpose large language model +(GPT-4) in extracting common data elements (CDEs) from thoracic radiology +reports. Three radiologists annotated a retrospective dataset of 1300 thoracic +reports (900 training, 400 test) and mapped them to 21 pre-selected relevant +CDEs. RadLing was used to generate sentence embeddings and to identify CDEs via +cosine similarity, which were then mapped to values using a lightweight mapper. +The GPT-4 system used OpenAI's general-purpose embeddings to identify relevant +CDEs and used GPT-4 to map them to values. The output CDE:value pairs were +compared to the reference standard; an identical match was considered a true +positive. Precision (positive predictive value) was 96% (2700/2824) for RadLing +and 99% (2034/2047) for GPT-4. Recall (sensitivity) was 94% (2700/2876) for +RadLing and 70% (2034/2887) for GPT-4; the difference was statistically +significant (P<.001). RadLing's domain-adapted embeddings were more sensitive +in CDE identification (95% vs 71%), and its lightweight mapper achieved +precision in value assignment comparable to GPT-4 (95.4% vs 95.0%). Overall, +the domain-adapted RadLing system surpassed the GPT-4 system in extracting +common data elements from radiology reports, while offering operational +advantages such as local deployment and lower runtime costs. + +

+
+
+
+
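The CDE-identification step described above, cosine similarity between report-sentence embeddings and data-element embeddings, can be sketched as follows. The embeddings are random stand-ins rather than RadLing or OpenAI outputs, and the thresholding behaviour is only hinted at in a comment.

# Minimal sketch of cosine-similarity matching between report sentences and
# common data elements (CDEs); embeddings below are random placeholders.
import numpy as np

rng = np.random.default_rng(0)
sentence_embeddings = rng.normal(size=(5, 384))   # 5 report sentences (stand-ins)
cde_embeddings = rng.normal(size=(21, 384))       # 21 pre-selected CDEs (stand-ins)

def cosine_matrix(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    a = a / np.linalg.norm(a, axis=1, keepdims=True)
    b = b / np.linalg.norm(b, axis=1, keepdims=True)
    return a @ b.T

sims = cosine_matrix(sentence_embeddings, cde_embeddings)
for i, row in enumerate(sims):
    j = int(row.argmax())
    # A real pipeline would keep a match only above a tuned threshold and then
    # pass the sentence to a lightweight mapper that assigns the CDE's value.
    print(f"sentence {i} -> candidate CDE {j} (cosine {row[j]:.2f})")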
+ + ♻ ☆ Centering the Margins: Outlier-Based Identification of Harmed + Populations in Toxicity Detection EMNLP 2023 + + +
+ The impact of AI models on marginalized communities has traditionally been +measured by identifying performance differences between specified demographic +subgroups. Though this approach aims to center vulnerable groups, it risks +obscuring patterns of harm faced by intersectional subgroups or shared across +multiple groups. To address this, we draw on theories of marginalization from +disability studies and related disciplines, which state that people farther +from the norm face greater adversity, to consider the "margins" in the domain +of toxicity detection. We operationalize the "margins" of a dataset by +employing outlier detection to identify text about people with demographic +attributes distant from the "norm". We find that model performance is +consistently worse for demographic outliers, with mean squared error (MSE) +between outliers and non-outliers up to 70.4% worse across toxicity types. It +is also worse for text outliers, with a MSE up to 68.4% higher for outliers +than non-outliers. We also find text and demographic outliers to be +particularly susceptible to errors in the classification of severe toxicity and +identity attacks. Compared to analysis of disparities using traditional +demographic breakdowns, we find that our outlier analysis frequently surfaces +greater harms faced by a larger, more intersectional group, which suggests that +outlier analysis is particularly beneficial for identifying harms against those +groups. + +
+
+ comment: EMNLP 2023 +
+
+
+
+
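A minimal sketch of the outlier-based analysis described above: flag the examples whose demographic attributes lie farthest from the dataset "norm" with an off-the-shelf detector, then compare mean squared error for outliers and non-outliers. The data, the choice of IsolationForest, and the simulated error model are placeholders, not the paper's setup.

# Sketch: compare a toxicity model's MSE on demographic outliers vs non-outliers
# (synthetic data throughout; any off-the-shelf outlier detector could be used).
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.metrics import mean_squared_error

rng = np.random.default_rng(1)
demographic_features = rng.normal(size=(1000, 8))   # placeholder demographic attributes
true_toxicity = rng.uniform(0, 1, size=1000)        # placeholder gold toxicity scores

# Flag the ~5% of examples farthest from the demographic "norm".
detector = IsolationForest(contamination=0.05, random_state=0)
is_outlier = detector.fit_predict(demographic_features) == -1

# Simulate a model that is noisier on out-of-distribution examples.
noise = rng.normal(0, 0.10, size=1000)
noise[is_outlier] = rng.normal(0, 0.25, size=int(is_outlier.sum()))
predicted_toxicity = np.clip(true_toxicity + noise, 0, 1)

mse_out = mean_squared_error(true_toxicity[is_outlier], predicted_toxicity[is_outlier])
mse_in = mean_squared_error(true_toxicity[~is_outlier], predicted_toxicity[~is_outlier])
print(f"MSE outliers: {mse_out:.4f}  non-outliers: {mse_in:.4f}  "
      f"relative gap: {100 * (mse_out - mse_in) / mse_in:.1f}%")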
+ + ♻ ☆ "Kelly is a Warm Person, Joseph is a Role Model": Gender Biases in + LLM-Generated Reference Letters EMNLP 2023 + + +
+ Large Language Models (LLMs) have recently emerged as an effective tool to +assist individuals in writing various types of content, including professional +documents such as recommendation letters. While it brings convenience, this +application also introduces unprecedented fairness concerns. Model-generated +reference letters might be directly used by users in professional scenarios. If +underlying biases exist in these model-constructed letters, using them without +scrutiny could lead to direct societal harms, such as sabotaging application +success rates for female applicants. In light of this pressing issue, it is +urgent and necessary to comprehensively study fairness issues and associated +harms in this real-world use case. In this paper, we critically examine gender +biases in LLM-generated reference letters. Drawing inspiration from social +science findings, we design evaluation methods to manifest biases along two +dimensions: (1) biases in language style and (2) biases in lexical content. We +further investigate the extent of bias propagation by analyzing the +hallucination bias of models, a term that we define to be bias exacerbation in +model-hallucinated content. Through benchmarking evaluation on two popular +LLMs, ChatGPT and Alpaca, we reveal significant gender biases in LLM-generated +recommendation letters. Our findings not only warn against using LLMs for this +application without scrutiny, but also illuminate the importance of thoroughly +studying hidden biases and harms in LLM-generated professional documents. + +

+
+ comment: Accepted to EMNLP 2023 Findings +
+
+
+
+
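The lexical-content dimension of the evaluation above can be illustrated with a toy check comparing how often words from small "ability" and "warmth" lexicons appear in letters generated for male versus female names. The letters and lexicons below are invented and far smaller than anything a real study would use.

# Toy sketch of a lexical-content bias check (invented letters and lexicons).
from collections import Counter

ability_words = {"brilliant", "leader", "exceptional", "analytical"}
warmth_words = {"warm", "kind", "caring", "pleasant"}

letters = {
    "male": ["Joseph is a brilliant and exceptional leader.",
             "He is an analytical thinker and a role model."],
    "female": ["Kelly is a warm and caring person.",
               "She is kind and pleasant to work with."],
}

def lexicon_rate(texts, lexicon):
    """Fraction of tokens that belong to the given lexicon."""
    counts = Counter(w.strip(".,").lower() for t in texts for w in t.split())
    total = sum(counts.values())
    return sum(counts[w] for w in lexicon) / total

for group, texts in letters.items():
    print(group,
          f"ability: {lexicon_rate(texts, ability_words):.3f}",
          f"warmth: {lexicon_rate(texts, warmth_words):.3f}")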
+ + ♻ ☆ FIND: A Function Description Benchmark for Evaluating Interpretability + Methods + + +
+ Labeling neural network submodules with human-legible descriptions is useful +for many downstream tasks: such descriptions can surface failures, guide +interventions, and perhaps even explain important model behaviors. To date, +most mechanistic descriptions of trained networks have involved small models, +narrowly delimited phenomena, and large amounts of human labor. Labeling all +human-interpretable sub-computations in models of increasing size and +complexity will almost certainly require tools that can generate and validate +descriptions automatically. Recently, techniques that use learned models +in-the-loop for labeling have begun to gain traction, but methods for +evaluating their efficacy are limited and ad-hoc. How should we validate and +compare open-ended labeling tools? This paper introduces FIND (Function +INterpretation and Description), a benchmark suite for evaluating the building +blocks of automated interpretability methods. FIND contains functions that +resemble components of trained neural networks, and accompanying descriptions +of the kind we seek to generate. The functions span textual and numeric +domains, and involve a range of real-world complexities. We evaluate methods +that use pretrained language models (LMs) to produce descriptions of function +behavior in natural language and code. Additionally, we introduce a new +interactive method in which an Automated Interpretability Agent (AIA) generates +function descriptions. We find that an AIA, built from an LM with black-box +access to functions, can infer function structure, acting as a scientist by +forming hypotheses, proposing experiments, and updating descriptions in light +of new data. However, AIA descriptions tend to capture global function behavior +and miss local details. These results suggest that FIND will be useful for +evaluating more sophisticated interpretability methods before they are applied +to real-world models. + +
+
+ comment: 28 pages, 10 figures +
+
+
+
+
+
+
+
+ + Computer Vision and Pattern Recognition 127 + +
+
+
+ + ☆ Dense Optical Tracking: Connecting the Dots + + +
+ Recent approaches to point tracking are able to recover the trajectory of any +scene point through a large portion of a video despite the presence of +occlusions. They are, however, too slow in practice to track every point +observed in a single frame in a reasonable amount of time. This paper +introduces DOT, a novel, simple and efficient method for solving this problem. +It first extracts a small set of tracks from key regions at motion boundaries +using an off-the-shelf point tracking algorithm. Given source and target +frames, DOT then computes rough initial estimates of a dense flow field and +visibility mask through nearest-neighbor interpolation, before refining them +using a learnable optical flow estimator that explicitly handles occlusions and +can be trained on synthetic data with ground-truth correspondences. We show +that DOT is significantly more accurate than current optical flow techniques, +outperforms sophisticated "universal" trackers like OmniMotion, and is on par +with, or better than, the best point tracking algorithms like CoTracker while +being at least two orders of magnitude faster. Quantitative and qualitative +experiments with synthetic and real videos validate the promise of the proposed +approach. Code, data, and videos showcasing the capabilities of our approach +are available in the project webpage: https://16lemoing.github.io/dot . + +
+
+
+
+
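The initialisation step described above, expanding a sparse set of point tracks into a rough dense flow field by nearest-neighbour interpolation, can be sketched in a few lines. The tracks are synthetic, and the learnable refinement network and visibility handling are omitted.

# Sketch of the initialisation only: nearest-neighbour interpolation of sparse
# point tracks into a dense flow field (synthetic tracks, no learned refinement).
import numpy as np
from scipy.spatial import cKDTree

H, W = 120, 160
rng = np.random.default_rng(0)
src_pts = rng.uniform([0, 0], [W, H], size=(200, 2))   # tracked points in the source frame
flow_at_pts = rng.normal(0, 3, size=(200, 2))          # their motion towards the target frame

ys, xs = np.mgrid[0:H, 0:W]
grid = np.stack([xs.ravel(), ys.ravel()], axis=1).astype(float)

# Each pixel copies the flow of its nearest tracked point.
_, nearest = cKDTree(src_pts).query(grid)
dense_flow = flow_at_pts[nearest].reshape(H, W, 2)
print(dense_flow.shape)  # (120, 160, 2): a rough field to be refined by a flow estimator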
+ + ☆ Sequential Modeling Enables Scalable Learning for Large Vision Models + + +
+ We introduce a novel sequential modeling approach which enables learning a +Large Vision Model (LVM) without making use of any linguistic data. To do this, +we define a common format, "visual sentences", in which we can represent raw +images and videos as well as annotated data sources such as semantic +segmentations and depth reconstructions without needing any meta-knowledge +beyond the pixels. Once this wide variety of visual data (comprising 420 +billion tokens) is represented as sequences, the model can be trained to +minimize a cross-entropy loss for next token prediction. By training across +various scales of model architecture and data diversity, we provide empirical +evidence that our models scale effectively. Many different vision tasks can be +solved by designing suitable visual prompts at test time. + +
+
+ comment: Website: https://yutongbai.com/lvm.html +
+
+
+
+
+ + ☆ Making Large Multimodal Models Understand Arbitrary Visual Prompts + + +
+ While existing large vision-language multimodal models focus on whole image +understanding, there is a prominent gap in achieving region-specific +comprehension. Current approaches that use textual coordinates or spatial +encodings often fail to provide a user-friendly interface for visual prompting. +To address this challenge, we introduce a novel multimodal model capable of +decoding arbitrary visual prompts. This allows users to intuitively mark images +and interact with the model using natural cues like a "red bounding box" or +"pointed arrow". Our simple design directly overlays visual markers onto the +RGB image, eliminating the need for complex region encodings, yet achieves +state-of-the-art performance on region-understanding tasks like Visual7W, +PointQA, and Visual Commonsense Reasoning benchmark. Furthermore, we present +ViP-Bench, a comprehensive benchmark to assess the capability of models in +understanding visual prompts across multiple dimensions, enabling future +research in this domain. Code, data, and model are publicly available. + +
+
+ comment: Project page: https://vip-llava.github.io/ +
+
+
+
+
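The 'simple design' described above, overlaying the visual prompt directly on the RGB image before it is passed to the model, is easy to illustrate. The image, box coordinates, and file name below are hypothetical.

# Sketch of the overlay idea: draw a red bounding box directly onto the RGB
# image so a multimodal model can be asked about "the object in the red box".
from PIL import Image, ImageDraw

image = Image.new("RGB", (640, 480), "white")   # stand-in for a real photo
draw = ImageDraw.Draw(image)
draw.rectangle([200, 150, 400, 350], outline="red", width=4)  # hypothetical region
image.save("prompted_image.png")
# The marked image, together with a text question, is then fed to the model.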
+ + ☆ MorpheuS: Neural Dynamic 360° Surface Reconstruction from Monocular + RGB-D Video + + +
+ Neural rendering has demonstrated remarkable success in dynamic scene +reconstruction. Thanks to the expressiveness of neural representations, prior +works can accurately capture the motion and achieve high-fidelity +reconstruction of the target object. Despite this, real-world video scenarios +often feature large unobserved regions where neural representations struggle to +achieve realistic completion. To tackle this challenge, we introduce MorpheuS, +a framework for dynamic 360° surface reconstruction from a casually +captured RGB-D video. Our approach models the target scene as a canonical field +that encodes its geometry and appearance, in conjunction with a deformation +field that warps points from the current frame to the canonical space. We +leverage a view-dependent diffusion prior and distill knowledge from it to +achieve realistic completion of unobserved regions. Experimental results on +various real-world and synthetic datasets show that our method can achieve +high-fidelity 360° surface reconstruction of a deformable object from a +monocular RGB-D video. + +

+
+ comment: Project page: https://hengyiwang.github.io/projects/morpheus +
+
+
+
+
+ + ☆ VideoBooth: Diffusion-based Video Generation with Image Prompts + + +
+ Text-driven video generation witnesses rapid progress. However, merely using +text prompts is not enough to depict the desired subject appearance that +accurately aligns with users' intents, especially for customized content +creation. In this paper, we study the task of video generation with image +prompts, which provide more accurate and direct content control beyond the text +prompts. Specifically, we propose a feed-forward framework VideoBooth, with two +dedicated designs: 1) We propose to embed image prompts in a coarse-to-fine +manner. Coarse visual embeddings from image encoder provide high-level +encodings of image prompts, while fine visual embeddings from the proposed +attention injection module provide multi-scale and detailed encoding of image +prompts. These two complementary embeddings can faithfully capture the desired +appearance. 2) In the attention injection module at fine level, multi-scale +image prompts are fed into different cross-frame attention layers as additional +keys and values. This extra spatial information refines the details in the +first frame and then it is propagated to the remaining frames, which maintains +temporal consistency. Extensive experiments demonstrate that VideoBooth +achieves state-of-the-art performance in generating customized high-quality +videos with subjects specified in image prompts. Notably, VideoBooth is a +generalizable framework where a single model works for a wide range of image +prompts with feed-forward pass. + +
+
+ comment: Project page: https://vchitect.github.io/VideoBooth-project/ +
+
+
+
+
+ + ☆ Towards Generalizable Zero-Shot Manipulation via Translating Human + Interaction Plans + + +
+ We pursue the goal of developing robots that can interact zero-shot with +generic unseen objects via a diverse repertoire of manipulation skills and show +how passive human videos can serve as a rich source of data for learning such +generalist robots. Unlike typical robot learning approaches which directly +learn how a robot should act from interaction data, we adopt a factorized +approach that can leverage large-scale human videos to learn how a human would +accomplish a desired task (a human plan), followed by translating this plan to +the robot's embodiment. Specifically, we learn a human plan predictor that, +given a current image of a scene and a goal image, predicts the future hand and +object configurations. We combine this with a translation module that learns a +plan-conditioned robot manipulation policy, and allows following human plans +for generic manipulation tasks in a zero-shot manner with no deployment-time +training. Importantly, while the plan predictor can leverage large-scale human +videos for learning, the translation module only requires a small amount of +in-domain data, and can generalize to tasks not seen during training. We show +that our learned system can perform over 16 manipulation skills that generalize +to 40 objects, encompassing 100 real-world tasks for table-top manipulation and +diverse in-the-wild manipulation. https://homangab.github.io/hopman/ + +

+
+ comment: Preprint. Under Review +
+
+
+
+
+ + ☆ Automated Material Properties Extraction For Enhanced Beauty Product + Discovery and Makeup Virtual Try-on + + +
+ The multitude of makeup products available can make it challenging to find +the ideal match for desired attributes. An intelligent approach for product +discovery is required to enhance the makeup shopping experience to make it more +convenient and satisfying. However, enabling accurate and efficient product +discovery requires extracting detailed attributes like color and finish type. +Our work introduces an automated pipeline that utilizes multiple customized +machine learning models to extract essential material attributes from makeup +product images. Our pipeline is versatile and capable of handling various +makeup products. To showcase the efficacy of our pipeline, we conduct extensive +experiments on eyeshadow products (both single and multi-shade ones), a +challenging makeup product known for its diverse range of shapes, colors, and +finish types. Furthermore, we demonstrate the applicability of our approach by +successfully extending it to other makeup categories like lipstick and +foundation, showcasing its adaptability and effectiveness across different +beauty products. Additionally, we conduct ablation experiments to demonstrate +the superiority of our machine learning pipeline over human labeling methods in +terms of reliability. Our proposed method showcases its effectiveness in +cross-category product discovery, specifically in recommending makeup products +that perfectly match a specified outfit. Lastly, we also demonstrate the +application of these material attributes in enabling virtual-try-on experiences +which makes makeup shopping experience significantly more engaging. + +
+
+ comment: Presented in Fifth Workshop on Recommender Systems in + Fashion(fashionxrecsys) of ACM Conference on Recommender Systems +
+
+
+
+
+ + ☆ Deep Unlearning: Fast and Efficient Training-free Approach to Controlled + Forgetting + + +
+ Machine unlearning has emerged as a prominent and challenging area of +interest, driven in large part by the rising regulatory demands for industries +to delete user data upon request and the heightened awareness of privacy. +Existing approaches either retrain models from scratch or use several +finetuning steps for every deletion request, often constrained by computational +resource limitations and restricted access to the original training data. In +this work, we introduce a novel class unlearning algorithm designed to +strategically eliminate an entire class or a group of classes from the learned +model. To that end, our algorithm first estimates the Retain Space and the +Forget Space, representing the feature or activation spaces for samples from +classes to be retained and unlearned, respectively. To obtain these spaces, we +propose a novel singular value decomposition-based technique that requires +layer wise collection of network activations from a few forward passes through +the network. We then compute the shared information between these spaces and +remove it from the forget space to isolate class-discriminatory feature space +for unlearning. Finally, we project the model weights in the orthogonal +direction of the class-discriminatory space to obtain the unlearned model. We +demonstrate our algorithm's efficacy on ImageNet using a Vision Transformer +with only $\sim$1.5% drop in retain accuracy compared to the original model +while maintaining under 1% accuracy on the unlearned class samples. Further, +our algorithm consistently performs well when subject to Membership Inference +Attacks showing 7.8% improvement on average across a variety of image +classification datasets and network architectures, as compared to other +baselines while being $\sim$6x more computationally efficient. + +
+
+
+
+
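A toy sketch of the projection idea described above: estimate a low-rank "forget" subspace from activations of the class to be unlearned via SVD, then project a layer's weights onto the orthogonal complement of that subspace. The matrices are random stand-ins, and the full algorithm (retain-space estimation and removal of shared information) is not reproduced.

# Toy numpy sketch: SVD-based forget-space estimation and orthogonal projection.
import numpy as np

rng = np.random.default_rng(0)
W = rng.normal(size=(128, 256))            # a layer's weights (out_dim x in_dim)

# Input activations collected from samples of the class to forget, made roughly
# low-rank here so the extracted subspace is meaningful in this toy example.
forget_acts = (rng.normal(size=(256, 8)) @ rng.normal(size=(8, 64))
               + 0.1 * rng.normal(size=(256, 64)))

# Basis of the forget space from the leading left singular vectors.
U, S, _ = np.linalg.svd(forget_acts, full_matrices=False)
k = int(np.searchsorted(np.cumsum(S**2) / np.sum(S**2), 0.95)) + 1
B = U[:, :k]                                # (256, k) forget-space basis

# Project the weights onto the orthogonal complement of the forget space.
W_unlearned = W @ (np.eye(256) - B @ B.T)
print(W_unlearned.shape, "rank of removed subspace:", k)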
+ + ☆ Adversarial Score Distillation: When score distillation meets GAN + + +
+ Existing score distillation methods are sensitive to classifier-free guidance +(CFG) scale: manifested as over-smoothness or instability at small CFG scales, +while over-saturation at large ones. To explain and analyze these issues, we +revisit the derivation of Score Distillation Sampling (SDS) and decipher +existing score distillation with the Wasserstein Generative Adversarial Network +(WGAN) paradigm. With the WGAN paradigm, we find that existing score +distillation either employs a fixed sub-optimal discriminator or conducts +incomplete discriminator optimization, resulting in the scale-sensitive issue. +We propose the Adversarial Score Distillation (ASD), which maintains an +optimizable discriminator and updates it using the complete optimization +objective. Experiments show that the proposed ASD performs favorably in 2D +distillation and text-to-3D tasks against existing methods. Furthermore, to +explore the generalization ability of our WGAN paradigm, we extend ASD to the +image editing task, which achieves competitive results. The project page and +code are at https://github.com/2y7c3/ASD. + +
+
+
+
+
+ + ☆ Gaussian Grouping: Segment and Edit Anything in 3D Scenes + + +
+ The recent Gaussian Splatting achieves high-quality and real-time novel-view +synthesis of 3D scenes. However, it is solely concentrated on appearance and +geometry modeling, while lacking fine-grained object-level scene understanding. +To address this issue, we propose Gaussian Grouping, which extends Gaussian +Splatting to jointly reconstruct and segment anything in open-world 3D scenes. +We augment each Gaussian with a compact Identity Encoding, allowing the +Gaussians to be grouped according to their object instance or stuff membership +in the 3D scene. Instead of resorting to expensive 3D labels, we supervise the +Identity Encodings during the differentiable rendering by leveraging the 2D +mask predictions by SAM, along with introduced 3D spatial consistency +regularization. Compared to the implicit NeRF representation, we show that the +discrete and grouped 3D Gaussians can reconstruct, segment and edit anything in +3D with high visual quality, fine granularity and efficiency. Based on Gaussian +Grouping, we further propose a local Gaussian Editing scheme, which shows +efficacy in versatile scene editing applications, including 3D object removal, +inpainting, colorization and scene recomposition. Our code and models will be +available at https://github.com/lkeab/gaussian-grouping. + +

+
+ comment: We propose Gaussian Grouping, which extends Gaussian Splatting to + fine-grained open-world 3D scene understanding. Github: + https://github.com/lkeab/gaussian-grouping +
+
+
+
+
+ + ☆ PointBeV: A Sparse Approach to BeV Predictions + + +
+ Bird's-eye View (BeV) representations have emerged as the de-facto shared +space in driving applications, offering a unified space for sensor data fusion +and supporting various downstream tasks. However, conventional models use grids +with fixed resolution and range and face computational inefficiencies due to +the uniform allocation of resources across all cells. To address this, we +propose PointBeV, a novel sparse BeV segmentation model operating on sparse BeV +cells instead of dense grids. This approach offers precise control over memory +usage, enabling the use of long temporal contexts and accommodating +memory-constrained platforms. PointBeV employs an efficient two-pass strategy +for training, enabling focused computation on regions of interest. At inference +time, it can be used with various memory/performance trade-offs and flexibly +adjusts to new specific use cases. PointBeV achieves state-of-the-art results +on the nuScenes dataset for vehicle, pedestrian, and lane segmentation, +showcasing superior performance in static and temporal settings despite being +trained solely with sparse signals. We will release our code along with two new +efficient modules used in the architecture: Sparse Feature Pulling, designed +for the effective extraction of features from images to BeV, and Submanifold +Attention, which enables efficient temporal modeling. Our code is available at +https://github.com/valeoai/PointBeV. + +
+
+ comment: https://github.com/valeoai/PointBeV +
+
+
+
+
+ + ☆ GIFT: Generative Interpretable Fine-Tuning Transformers + + +
+ We present GIFT (Generative Interpretable Fine-tuning Transformers) for +fine-tuning pretrained (often large) Transformer models on downstream tasks in +a parameter-efficient way with built-in interpretability. Our GIFT is a deep +parameter-residual learning method, which addresses two problems in fine-tuning +a pretrained Transformer model: Where to apply the parameter-efficient +fine-tuning (PEFT) to be extremely lightweight yet sufficiently expressive, and +How to learn the PEFT to better exploit the knowledge of the pretrained model +in a direct way? For the former, we select the final projection (linear) layer +in the multi-head self-attention of a Transformer model, and verify its +effectiveness. For the latter, in contrast to the prior art that directly +introduces new model parameters (often in low-rank approximation form) to be +learned in fine-tuning with downstream data, we propose a method for learning +to generate the fine-tuning parameters. Our GIFT is a hyper-Transformer which +takes as input the pretrained parameters of the projection layer to generate +its fine-tuning parameters using a proposed Parameter-to-Cluster Attention +(PaCa). The PaCa results in a simple clustering-based forward explainer that +plays the role of semantic segmentation in testing. In experiments, our +proposed GIFT is tested on the VTAB benchmark and the fine-grained visual +classification (FGVC) benchmark. It obtains significantly better performance +than the prior art. Our code is available at https://github.com/savadikarc/gift + +

+
+ comment: 18 pages, 12 figures +
+
+
+
+
+ + ☆ Rethinking Detection Based Table Structure Recognition for Visually Rich + Documents + + +
+ Table Structure Recognition (TSR) aims at transforming unstructured table +images into structured formats, such as HTML sequences. One type of popular +solution is using detection models to detect components of a table, such as +columns and rows, then applying a rule-based post-processing method to convert +detection results into HTML sequences. However, existing detection-based +studies often have the following limitations. First, these studies usually pay +more attention to improving the detection performance, which does not +necessarily lead to better performance regarding cell-level metrics, such as +TEDS. Second, some solutions over-simplify the problem and can miss some +critical information. Lastly, even though some studies defined the problem to +detect more components to provide as much information as other types of +solutions, these studies ignore the fact this problem definition is a +multi-label detection because row, projected row header and column header can +share identical bounding boxes. Besides, there is often a performance gap +between two-stage and transformer-based detection models regarding the +structure-only TEDS, even though they have similar performance regarding the +COCO metrics. Therefore, we revisit the limitations of existing detection-based +solutions, compare two-stage and transformer-based detection models, and +identify the key design aspects for the success of a two-stage detection model +for the TSR task, including the multi-class problem definition, the aspect +ratio for anchor box generation, and the feature generation of the backbone +network. We applied simple methods to improve these aspects of the Cascade +R-CNN model, achieved state-of-the-art performance, and improved the baseline +Cascade R-CNN model by 19.32%, 11.56% and 14.77% regarding the structure-only +TEDS on SciTSR, FinTabNet, and PubTables1M datasets. + +
+
+ comment: submitted to Pattern Recognition +
+
+
+
+
+ + ☆ Object Detector Differences when using Synthetic and Real Training Data + + +
+ To train well-performing generalizing neural networks, sufficiently large and +diverse datasets are needed. Collecting data while adhering to privacy +legislation becomes increasingly difficult and annotating these large datasets +is both a resource-heavy and time-consuming task. An approach to overcome these +difficulties is to use synthetic data since it is inherently scalable and can +be automatically annotated. However, how training on synthetic data affects the +layers of a neural network is still unclear. In this paper, we train the YOLOv3 +object detector on real and synthetic images from city environments. We perform +a similarity analysis using Centered Kernel Alignment (CKA) to explore the +effects of training on synthetic data on a layer-wise basis. The analysis +captures the architecture of the detector while showing both different and +similar patterns between different models. With this similarity analysis we +want to give insights on how training synthetic data affects each layer and to +give a better understanding of the inner workings of complex neural networks. +The results show that the largest similarity between a detector trained on real +data and a detector trained on synthetic data was in the early layers, and the +largest difference was in the head part. The results also show that no major +difference in performance or similarity could be seen between frozen and +unfrozen backbone. + +
+
+ comment: 27 pages. The Version of Record of this article is published in + Springer Nature Computer Science 2023, and is available online at + https://doi.org/10.1007/s42979-023-01704-5 +
+
+
+
+
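The layer-wise comparison above relies on Centered Kernel Alignment (CKA). A minimal linear-CKA implementation is sketched below, with random matrices standing in for real detector activations.

# Minimal linear CKA between two activation matrices (n_examples x n_features);
# the activations here are random stand-ins, not YOLOv3 features.
import numpy as np

def linear_cka(X: np.ndarray, Y: np.ndarray) -> float:
    X = X - X.mean(axis=0, keepdims=True)   # centre the features
    Y = Y - Y.mean(axis=0, keepdims=True)
    hsic = np.linalg.norm(Y.T @ X, "fro") ** 2
    return float(hsic / (np.linalg.norm(X.T @ X, "fro") * np.linalg.norm(Y.T @ Y, "fro")))

rng = np.random.default_rng(0)
acts_real = rng.normal(size=(500, 256))                            # layer trained on real data
acts_synth = 0.8 * acts_real + 0.2 * rng.normal(size=(500, 256))   # layer trained on synthetic data
print(f"linear CKA: {linear_cka(acts_real, acts_synth):.3f}")       # closer to 1 = more similar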
+ + ☆ VisionaryVR: An Optical Simulation Tool for Evaluating and Optimizing + Vision Correction Solutions in Virtual Reality + + +
+ Developing and evaluating vision science methods require robust and efficient +tools for assessing their performance in various real-world scenarios. This +study presents a novel virtual reality (VR) simulation tool that simulates +real-world optical methods while giving high experimental control to the +experiment. The tool incorporates an experiment controller, to smoothly and +easily handle multiple conditions, a generic eye-tracking controller, that +works with most common VR eye-trackers, a configurable defocus simulator, and a +generic VR questionnaire loader to assess participants' behavior in virtual +reality. This VR-based simulation tool bridges the gap between theoretical and +applied research on new optical methods, corrections, and therapies. It enables +vision scientists to increase their research tools with a robust, realistic, +and fast research environment. + +
+
+
+
+
+ + ☆ Open-vocabulary object 6D pose estimation + + +
+ We introduce the new setting of open-vocabulary object 6D pose estimation, in +which a textual prompt is used to specify the object of interest. In contrast +to existing approaches, in our setting (i) the object of interest is specified +solely through the textual prompt, (ii) no object model (e.g. CAD or video +sequence) is required at inference, (iii) the object is imaged from two +different viewpoints of two different scenes, and (iv) the object was not +observed during the training phase. To operate in this setting, we introduce a +novel approach that leverages a Vision-Language Model to segment the object of +interest from two distinct scenes and to estimate its relative 6D pose. The key +of our approach is a carefully devised strategy to fuse object-level +information provided by the prompt with local image features, resulting in a +feature space that can generalize to novel concepts. We validate our approach +on a new benchmark based on two popular datasets, REAL275 and Toyota-Light, +which collectively encompass 39 object instances appearing in four thousand +image pairs. The results demonstrate that our approach outperforms both a +well-established hand-crafted method and a recent deep learning-based baseline +in estimating the relative 6D pose of objects in different scenes. Project +website: https://jcorsetti.github.io/oryon-website/. + +
+
+ comment: Technical report. 21 pages, 15 figures, 6 tables +
+
+
+
+
+ + ☆ Infrared Image Super-Resolution via GAN + + +
+ The ability of generative models to accurately fit data distributions has +resulted in their widespread adoption and success in fields such as computer +vision and natural language processing. In this chapter, we provide a brief +overview of the application of generative models in the domain of infrared (IR) +image super-resolution, including a discussion of the various challenges and +adversarial training methods employed. We propose potential areas for further +investigation and advancement in the application of generative models for IR +image super-resolution. + +
+
+ comment: Applications of Generative AI, Chapter 28 +
+
+
+
+
+ + ☆ Unsupervised Adaptive Implicit Neural Representation Learning for + Scan-Specific MRI Reconstruction + + +
+ In recent studies on MRI reconstruction, advances have shown significant +promise for further accelerating the MRI acquisition. Most state-of-the-art +methods require a large amount of fully-sampled data to optimise reconstruction +models, which is impractical and expensive under certain clinical settings. On +the other hand, for unsupervised scan-specific reconstruction methods, +overfitting is likely to happen due to insufficient supervision, while +restrictions on acceleration rates and under-sampling patterns further limit +their applicability. To this end, we propose an unsupervised, adaptive +coarse-to-fine framework that enhances reconstruction quality without being +constrained by the sparsity levels or patterns in under-sampling. The framework +employs an implicit neural representation for scan-specific MRI reconstruction, +learning a mapping from multi-dimensional coordinates to their corresponding +signal intensities. Moreover, we integrate a novel learning strategy that +progressively refines the use of acquired k-space signals for self-supervision. +This approach effectively adjusts the proportion of supervising signals from +unevenly distributed information across different frequency bands, thus +mitigating the issue of overfitting while improving the overall reconstruction. +Comprehensive evaluation on a public dataset, including both 2D and 3D data, +has shown that our method outperforms current state-of-the-art scan-specific +MRI reconstruction techniques, for up to 8-fold under-sampling. + +
+
+
+
+
+ + ☆ LightCLIP: Learning Multi-Level Interaction for Lightweight + Vision-Language Models + + +
+ Vision-language pre-training like CLIP has shown promising performance on +various downstream tasks such as zero-shot image classification and image-text +retrieval. Most of the existing CLIP-alike works usually adopt relatively large +image encoders like ResNet50 and ViT, while the lightweight counterparts are +rarely discussed. In this paper, we propose a multi-level interaction paradigm +for training lightweight CLIP models. Firstly, to mitigate the problem that +some image-text pairs are not strictly one-to-one correspondence, we improve +the conventional global instance-level alignment objective by softening the +label of negative samples progressively. Secondly, a relaxed bipartite matching +based token-level alignment objective is introduced for finer-grained alignment +between image patches and textual words. Moreover, based on the observation +that the accuracy of CLIP model does not increase correspondingly as the +parameters of text encoder increase, an extra objective of masked language +modeling (MLM) is leveraged for maximizing the potential of the shortened text +encoder. In practice, an auxiliary fusion module injecting unmasked image +embedding into masked text embedding at different network stages is proposed +for enhancing the MLM. Extensive experiments show that without introducing +additional computational cost during inference, the proposed method achieves a +higher performance on multiple downstream tasks. + +
+
+
+
+
+ + ☆ CellMixer: Annotation-free Semantic Cell Segmentation of Heterogeneous + Cell Populations NeurIPS 2023 + + +
+ In recent years, several unsupervised cell segmentation methods have been +presented, trying to omit the requirement of laborious pixel-level annotations +for the training of a cell segmentation model. Most if not all of these methods +handle the instance segmentation task by focusing on the detection of different +cell instances ignoring their type. While such models prove adequate for +certain tasks, like cell counting, other applications require the +identification of each cell's type. In this paper, we present CellMixer, an +innovative annotation-free approach for the semantic segmentation of +heterogeneous cell populations. Our augmentation-based method enables the +training of a segmentation model from image-level labels of homogeneous cell +populations. Our results show that CellMixer can achieve competitive +segmentation performance across multiple cell types and imaging modalities, +demonstrating the method's scalability and potential for broader applications +in medical imaging, cellular biology, and diagnostics. + +
+
+ comment: Medical Imaging Meets NeurIPS 2023 +
+
+
+
+
+ + ☆ Generalized Label-Efficient 3D Scene Parsing via Hierarchical Feature + Aligned Pre-Training and Region-Aware Fine-tuning + + +
+ Deep neural network models have achieved remarkable progress in 3D scene +understanding while trained in the closed-set setting and with full labels. +However, the major bottleneck for current 3D recognition approaches is that +they do not have the capacity to recognize any unseen novel classes beyond the +training categories in diverse kinds of real-world applications. In the +meantime, current state-of-the-art 3D scene understanding approaches primarily +require high-quality labels to train neural networks, which merely perform well +in a fully supervised manner. This work presents a generalized and simple +framework for dealing with 3D scene understanding when the labeled scenes are +quite limited. To extract knowledge for novel categories from the pre-trained +vision-language models, we propose a hierarchical feature-aligned pre-training +and knowledge distillation strategy to extract and distill meaningful +information from large-scale vision-language models, which helps benefit the +open-vocabulary scene understanding tasks. To leverage the boundary +information, we propose a novel energy-based loss with boundary awareness +benefiting from the region-level boundary predictions. To encourage latent +instance discrimination and to guarantee efficiency, we propose the +unsupervised region-level semantic contrastive learning scheme for point +clouds, using confident predictions of the neural network to discriminate the +intermediate feature embeddings at multiple stages. Extensive experiments with +both indoor and outdoor scenes demonstrated the effectiveness of our approach +in both data-efficient learning and open-world few-shot learning. All codes, +models, and data are made publicly available at: +https://drive.google.com/drive/folders/1M58V-PtR8DBEwD296zJkNg_m2qq-MTAP?usp=sharing. + +
+
+ comment: IEEE Transactions on Pattern Analysis and Machine Intelligence, + Manuscript Info: 22 Pages, 16 Figures, and 8 Tables +
+
+
+
+
+ + ☆ Dual-Domain Multi-Contrast MRI Reconstruction with Synthesis-based + Fusion Network + + +
+ Purpose: To develop an efficient dual-domain reconstruction framework for +multi-contrast MRI, with the focus on minimising cross-contrast misalignment in +both the image and the frequency domains to enhance optimisation. Theory and +Methods: Our proposed framework, based on deep learning, facilitates the +optimisation for under-sampled target contrast using fully-sampled reference +contrast that is quicker to acquire. The method consists of three key steps: 1) +Learning to synthesise data resembling the target contrast from the reference +contrast; 2) Registering the multi-contrast data to reduce inter-scan motion; +and 3) Utilising the registered data for reconstructing the target contrast. +These steps involve learning in both domains with regularisation applied to +ensure their consistency. We also compare the reconstruction performance with +existing deep learning-based methods using a dataset of brain MRI scans. +Results: Extensive experiments demonstrate the superiority of our proposed +framework, for up to an 8-fold acceleration rate, compared to state-of-the-art +algorithms. Comprehensive analysis and ablation studies further present the +effectiveness of the proposed components. Conclusion:Our dual-domain framework +offers a promising approach to multi-contrast MRI reconstruction. It can also +be integrated with existing methods to further enhance the reconstruction. + +
+
+
+
+
+ + ☆ TrackDiffusion: Multi-object Tracking Data Generation via Diffusion + Models + + +
+ Diffusion models have gained prominence in generating data for perception +tasks such as image classification and object detection. However, the potential +in generating high-quality tracking sequences, a crucial aspect in the field of +video perception, has not been fully investigated. To address this gap, we +propose TrackDiffusion, a novel architecture designed to generate continuous +video sequences from the tracklets. TrackDiffusion represents a significant +departure from the traditional layout-to-image (L2I) generation and copy-paste +synthesis focusing on static image elements like bounding boxes by empowering +image diffusion models to encompass dynamic and continuous tracking +trajectories, thereby capturing complex motion nuances and ensuring instance +consistency among video frames. For the first time, we demonstrate that the +generated video sequences can be utilized for training multi-object tracking +(MOT) systems, leading to significant improvement in tracker performance. +Experimental results show that our model significantly enhances instance +consistency in generated video sequences, leading to improved perceptual +metrics. Our approach achieves an improvement of 8.7 in TrackAP and 11.8 in +TrackAP$_{50}$ on the YTVIS dataset, underscoring its potential to redefine the +standards of video data generation for MOT tasks and beyond. + +
+
+
+
+
+ + ☆ SPOT: Self-Training with Patch-Order Permutation for Object-Centric + Learning with Autoregressive Transformers + + +
+ Unsupervised object-centric learning aims to decompose scenes into +interpretable object entities, termed slots. Slot-based auto-encoders stand out +as a prominent method for this task. Within them, crucial aspects include +guiding the encoder to generate object-specific slots and ensuring the decoder +utilizes them during reconstruction. This work introduces two novel techniques, +(i) an attention-based self-training approach, which distills superior +slot-based attention masks from the decoder to the encoder, enhancing object +segmentation, and (ii) an innovative patch-order permutation strategy for +autoregressive transformers that strengthens the role of slot vectors in +reconstruction. The effectiveness of these strategies is showcased +experimentally. The combined approach significantly surpasses prior slot-based +autoencoder methods in unsupervised object segmentation, especially with +complex real-world images. We provide the implementation code at +https://github.com/gkakogeorgiou/spot . + +
+
+
+
+
+ + ☆ EvE: Exploiting Generative Priors for Radiance Field Enrichment + + +
+ Modeling large-scale scenes from unconstrained image collections in-the-wild +has proven to be a major challenge in computer vision. Existing methods +tackling in-the-wild neural rendering operate in a closed-world setting, where +knowledge is limited to a scene's captured images within a training set. We +propose EvE, which is, to the best of our knowledge, the first method +leveraging generative priors to improve in-the-wild scene modeling. We employ +pre-trained generative networks to enrich K-Planes representations with +extrinsic knowledge. To this end, we define an alternating training procedure +to conduct optimization guidance of K-Planes trained on the training set. We +carry out extensive experiments and verify the merit of our method on synthetic +data as well as real tourism photo collections. EvE enhances rendered scenes +with richer details and outperforms the state of the art on the task of novel +view synthesis in-the-wild. Our project page can be found at +https://eve-nvs.github.io . + +
+
+
+
+
+ + ☆ A Recent Survey of Vision Transformers for Medical Image Segmentation + + +
+ Medical image segmentation plays a crucial role in various healthcare +applications, enabling accurate diagnosis, treatment planning, and disease +monitoring. In recent years, Vision Transformers (ViTs) have emerged as a +promising technique for addressing the challenges in medical image +segmentation. In medical images, structures are usually highly interconnected +and globally distributed. ViTs utilize their multi-scale attention mechanism to +model the long-range relationships in the images. However, they do lack +image-related inductive bias and translational invariance, potentially +impacting their performance. Recently, researchers have come up with various +ViT-based approaches that incorporate CNNs in their architectures, known as +Hybrid Vision Transformers (HVTs) to capture local correlation in addition to +the global information in the images. This survey paper provides a detailed +review of the recent advancements in ViTs and HVTs for medical image +segmentation. Along with the categorization of ViT and HVT-based medical image +segmentation approaches we also present a detailed overview of their real-time +applications in several medical image modalities. This survey may serve as a +valuable resource for researchers, healthcare practitioners, and students in +understanding the state-of-the-art approaches for ViT-based medical image +segmentation. + +
+
+
+
+
+ + ☆ Towards Efficient 3D Object Detection in Bird's-Eye-View Space for + Autonomous Driving: A Convolutional-Only Approach + + +
+ 3D object detection in Bird's-Eye-View (BEV) space has recently emerged as a +prevalent approach in the field of autonomous driving. Despite the demonstrated +improvements in accuracy and velocity estimation compared to perspective view +methods, the deployment of BEV-based techniques in real-world autonomous +vehicles remains challenging. This is primarily due to their reliance on +vision-transformer (ViT) based architectures, which introduce quadratic +complexity with respect to the input resolution. To address this issue, we +propose an efficient BEV-based 3D detection framework called BEVENet, which +leverages a convolutional-only architectural design to circumvent the +limitations of ViT models while maintaining the effectiveness of BEV-based +methods. Our experiments show that BEVENet is 3$\times$ faster than +contemporary state-of-the-art (SOTA) approaches on the NuScenes challenge, +achieving a mean average precision (mAP) of 0.456 and a nuScenes detection +score (NDS) of 0.555 on the NuScenes validation dataset, with an inference +speed of 47.6 frames per second. To the best of our knowledge, this study +stands as the first to achieve such significant efficiency improvements for +BEV-based methods, highlighting their enhanced feasibility for real-world +autonomous driving applications. + +
+
+
+
+
+ + ☆ Rethinking the Domain Gap in Near-infrared Face Recognition + + +
+ Heterogeneous face recognition (HFR) involves the intricate task of matching +face images across the visual domains of visible (VIS) and near-infrared (NIR). +While much of the existing literature on HFR identifies the domain gap as a +primary challenge and directs efforts towards bridging it at either the input +or feature level, our work deviates from this trend. We observe that large +neural networks, unlike their smaller counterparts, when pre-trained on large +scale homogeneous VIS data, demonstrate exceptional zero-shot performance in +HFR, suggesting that the domain gap might be less pronounced than previously +believed. By approaching the HFR problem as one of low-data fine-tuning, we +introduce a straightforward framework: comprehensive pre-training, succeeded by +a regularized fine-tuning strategy, that matches or surpasses the current +state-of-the-art on four publicly available benchmarks. Corresponding codes can +be found at https://github.com/michaeltrs/RethinkNIRVIS. + +
+
+ comment: 5 pages, 3 figures, 6 tables +
+
+
+
+
+ + ☆ Learning from One Continuous Video Stream + + +
+ We introduce a framework for online learning from a single continuous video +stream -- the way people and animals learn, without mini-batches, data +augmentation or shuffling. This poses great challenges given the high +correlation between consecutive video frames and there is very little prior +work on it. Our framework allows us to do a first deep dive into the topic and +includes a collection of streams and tasks composed from two existing video +datasets, plus methodology for performance evaluation that considers both +adaptation and generalization. We employ pixel-to-pixel modelling as a +practical and flexible way to switch between pre-training and single-stream +evaluation as well as between arbitrary tasks, without ever requiring changes +to models and always using the same pixel loss. Equipped with this framework we +obtained large single-stream learning gains from pre-training with a novel +family of future prediction tasks, found that momentum hurts, and that the pace +of weight updates matters. The combination of these insights leads to matching +the performance of IID learning with batch size 1, when using the same +architecture and without costly replay buffers. + +
+
+
+
+
+ + ☆ UAVs and Birds: Enhancing Short-Range Navigation through Budgerigar + Flight Studies + + +
+ This study delves into the flight behaviors of Budgerigars (Melopsittacus +undulatus) to gain insights into their flight trajectories and movements. Using +3D reconstruction from stereo video camera recordings, we closely examine the +velocity and acceleration patterns during three flight motions: takeoff, flying +and landing. The findings not only contribute to our understanding of bird +behaviors but also hold significant implications for the advancement of +algorithms in Unmanned Aerial Vehicles (UAVs). The research aims to bridge the +gap between biological principles observed in birds and the application of +these insights in developing more efficient and autonomous UAVs. In the context +of the increasing use of drones, this study focuses on the biologically +inspired principles drawn from bird behaviors, particularly during takeoff, +flight and landing, to enhance UAV capabilities. The dataset created for this +research sheds light on Budgerigars' takeoff, flying, and landing techniques, +emphasizing their ability to control speed across different situations and +surfaces. The study underscores the potential of incorporating these principles +into UAV algorithms, addressing challenges related to short-range navigation, +takeoff, flying, and landing. + +

+
+ comment: 26 pages, 10 figures +
+
+
+
+
+ + ☆ BCN: Batch Channel Normalization for Image Classification + + +
+ Normalization techniques have been widely used in the field of deep learning +due to their ability to enable higher learning rates and to reduce sensitivity +to initialization. However, the effectiveness of popular normalization +techniques is typically limited to specific settings. Unlike the standard Batch +Normalization (BN) and Layer Normalization (LN), where BN computes the mean and +variance along the (N,H,W) dimensions and LN computes the mean and variance +along the (C,H,W) dimensions (N, C, H and W are the batch, channel, spatial +height and width dimensions, respectively), this paper presents a novel +normalization technique called Batch Channel Normalization (BCN). To exploit +both channel and batch dependence and to adaptively combine the advantages of +BN and LN for a given dataset or task, BCN separately normalizes inputs along +the (N, H, W) and (C, H, W) axes, then combines the normalized outputs using +adaptive parameters. As a basic block, BCN can be easily integrated into +existing models for various applications in the field of computer vision. +Empirical results show that the proposed technique can be seamlessly applied to +various versions of CNN or Vision Transformer architectures. The code is +publicly available at +https://github.com/AfifaKhaled/BatchChannel-Normalization + +

+
+
+
+
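A minimal PyTorch reading of the description above, normalising along the (N, H, W) axes and along the (C, H, W) axes and mixing the two results with a learnable weight, is sketched below. It follows the text only; running statistics and the exact form of the adaptive combination in the official implementation may differ.

# Sketch of Batch Channel Normalization as described in the abstract (not the
# official implementation): BN-style and LN-style statistics mixed adaptively.
import torch
import torch.nn as nn

class BCNSketch(nn.Module):
    def __init__(self, num_channels: int, eps: float = 1e-5):
        super().__init__()
        self.eps = eps
        self.mix = nn.Parameter(torch.tensor(0.0))                 # adaptive combination weight
        self.gamma = nn.Parameter(torch.ones(1, num_channels, 1, 1))
        self.beta = nn.Parameter(torch.zeros(1, num_channels, 1, 1))

    def forward(self, x: torch.Tensor) -> torch.Tensor:            # x: (N, C, H, W)
        bn = (x - x.mean(dim=(0, 2, 3), keepdim=True)) / torch.sqrt(
            x.var(dim=(0, 2, 3), keepdim=True, unbiased=False) + self.eps)
        ln = (x - x.mean(dim=(1, 2, 3), keepdim=True)) / torch.sqrt(
            x.var(dim=(1, 2, 3), keepdim=True, unbiased=False) + self.eps)
        w = torch.sigmoid(self.mix)                                 # keep the mix weight in (0, 1)
        return self.gamma * (w * bn + (1 - w) * ln) + self.beta

x = torch.randn(8, 32, 16, 16)
print(BCNSketch(32)(x).shape)  # torch.Size([8, 32, 16, 16])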
+ + ☆ Event Recognition in Laparoscopic Gynecology Videos with Hybrid + Transformers + + +
+ Analyzing laparoscopic surgery videos presents a complex and multifaceted +challenge, with applications including surgical training, intra-operative +surgical complication prediction, and post-operative surgical assessment. +Identifying crucial events within these videos is a significant prerequisite in +a majority of these applications. In this paper, we introduce a comprehensive +dataset tailored for relevant event recognition in laparoscopic gynecology +videos. Our dataset includes annotations for critical events associated with +major intra-operative challenges and post-operative complications. To validate +the precision of our annotations, we assess event recognition performance using +several CNN-RNN architectures. Furthermore, we introduce and evaluate a hybrid +transformer architecture coupled with a customized training-inference framework +to recognize four specific events in laparoscopic surgery videos. Leveraging +the Transformer networks, our proposed architecture harnesses inter-frame +dependencies to counteract the adverse effects of relevant content occlusion, +motion blur, and surgical scene variation, thus significantly enhancing event +recognition accuracy. Moreover, we present a frame sampling strategy designed +to manage variations in surgical scenes and the surgeons' skill level, +resulting in event recognition with high temporal resolution. We empirically +demonstrate the superiority of our proposed methodology in event recognition +compared to conventional CNN-RNN architectures through a series of extensive +experiments. + +
+
+
+
+
+ + ☆ Tracking Object Positions in Reinforcement Learning: A Metric for + Keypoint Detection (extended version) + + +
+ Reinforcement learning (RL) for robot control typically requires a detailed +representation of the environment state, including information about +task-relevant objects not directly measurable. Keypoint detectors, such as +spatial autoencoders (SAEs), are a common approach to extracting a +low-dimensional representation from high-dimensional image data. SAEs aim at +spatial features such as object positions, which are often useful +representations in robotic RL. However, whether an SAE is actually able to +track objects in the scene and thus yields a spatial state representation well +suited for RL tasks has rarely been examined due to a lack of established +metrics. In this paper, we propose to assess the performance of an SAE instance +by measuring how well keypoints track ground truth objects in images. We +present a computationally lightweight metric and use it to evaluate common +baseline SAE architectures on image data from a simulated robot task. We find +that common SAEs differ substantially in their spatial extraction capability. +Furthermore, we validate that SAEs that perform well in our metric achieve +superior performance when used in downstream RL. Thus, our metric is an +effective and lightweight indicator of RL performance before executing +expensive RL training. Building on these insights, we identify three key +modifications of SAE architectures to improve tracking performance. We make our +code available at anonymous.4open.science/r/sae-rl. + +
+
+
+
+
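The metric proposed above measures how well detected keypoints follow ground-truth object positions. A simplified version, assigning each object the keypoint with the smallest mean distance to it and averaging those distances over objects, could look like the sketch below (synthetic trajectories, not the paper's exact formulation).

# Simplified keypoint-tracking metric: per object, take the best-tracking
# keypoint's mean distance, then average over objects (synthetic trajectories).
import numpy as np

rng = np.random.default_rng(0)
T, K, O = 100, 8, 3                                    # frames, keypoints, objects
objects = rng.uniform(0, 64, size=(T, O, 2))           # ground-truth object positions
keypoints = np.concatenate(                             # some keypoints track objects, some drift
    [objects + rng.normal(0, 1.0, size=(T, O, 2)),
     rng.uniform(0, 64, size=(T, K - O, 2))], axis=1)

# dists[k, o] = mean distance of keypoint k to object o over all frames
dists = np.linalg.norm(keypoints[:, :, None, :] - objects[:, None, :, :], axis=-1).mean(axis=0)
score = dists.min(axis=0).mean()                        # lower = better tracking
print(f"mean tracking error (pixels): {score:.2f}")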
+ + ☆ Less is More: Learning Reference Knowledge Using No-Reference Image + Quality Assessment + + +
+ Image Quality Assessment (IQA) with reference images has achieved great +success by imitating the human vision system, in which the image quality is +effectively assessed by comparing the query image with its pristine reference +image. However, for images in the wild, it is quite difficult to access +accurate reference images. We argue that it is possible to learn reference +knowledge under the No-Reference Image Quality Assessment (NR-IQA) setting, and +we show empirically that doing so is effective and efficient. Concretely, by +introducing a novel feature distillation method into IQA, we propose a new +framework to learn comparative knowledge from non-aligned reference images. +Then, to achieve fast convergence and avoid overfitting, we further propose an +inductive bias regularization. Such a framework not only addresses the inherent +limitations of NR-IQA but also improves the feature extraction framework, +enabling it to express richer quality information. Surprisingly, our method +utilizes less input while obtaining a more significant improvement compared to +the teacher models. Extensive experiments on eight standard NR-IQA datasets +demonstrate performance superior to state-of-the-art NR-IQA methods, i.e., +achieving PLCC values of 0.917 (vs. 0.884 on LIVEC) and 0.686 (vs. 0.661 on +LIVEFB). + +

+
+
+
+
+ + ☆ Generative models for visualising abstract social processes: Guiding + streetview image synthesis of StyleGAN2 with indices of deprivation + + +
+ This paper presents a novel application of Generative Adversarial Networks
+(GANs) to study visual aspects of social processes. I train a StyleGAN2 model
+on a custom dataset of 14,564 Google Streetview images of London. After
+training, I invert the images in the training set, finding points in the
+model's latent space that correspond to them, and compare results from three
+inversion techniques. I connect each data point with metadata from the Indices
+of Multiple Deprivation, describing income, health and environmental quality
+in the area where the photographs were taken. It is then possible to map which
+parts of the model's latent space encode visual features that are distinctive
+for health, income and environmental quality, and to condition the synthesis
+of new images on these factors. The synthetic images reflect visual features
+of social processes that were previously unknown and difficult to study,
+describing recurring visual differences between deprived and privileged areas
+in London. GANs are known for their capability to produce a continuous range
+of images that exhibit visual differences. The paper tests how to exploit this
+ability through visual comparisons in still images as well as through an
+interactive website where users can guide image synthesis with sliders. Though
+conditioned synthesis has its limitations and the results are difficult to
+validate, the paper points to the potential for generative models to be
+repurposed as parts of social scientific methods.
+
+
+
+ comment: 10 pages, 3 figures, 1 table, associated website with interactive + interface at http://site.knuutila.net/thisinequalitydoesnotexist +
+
+
+
+
+ + ☆ Domain Adaptive Imitation Learning with Visual Observation NeurIPS 2023 + + +
+ In this paper, we consider domain-adaptive imitation learning with visual +observation, where an agent in a target domain learns to perform a task by +observing expert demonstrations in a source domain. Domain adaptive imitation +learning arises in practical scenarios where a robot, receiving visual sensory +data, needs to mimic movements by visually observing other robots from +different angles or observing robots of different shapes. To overcome the +domain shift in cross-domain imitation learning with visual observation, we +propose a novel framework for extracting domain-independent behavioral features +from input observations that can be used to train the learner, based on dual +feature extraction and image reconstruction. Empirical results demonstrate that +our approach outperforms previous algorithms for imitation learning from visual +observation with domain shift. + +
+
+ comment: Accepted to NeurIPS 2023 +
+
+
+
+
+ + ☆ LiDAR-based curb detection for ground truth annotation in automated + driving validation + + +
+ Curb detection is essential for environmental awareness in Automated Driving
+(AD), as curbs typically delimit drivable and non-drivable areas. Annotated
+data are necessary for developing and validating an AD function. However,
+public datasets with annotated point cloud curbs are scarce. This paper
+presents a method for detecting 3D curbs in a sequence of point clouds
+captured from a LiDAR sensor, which consists of two main steps. First, our
+approach detects the curbs at each scan using a segmentation deep neural
+network. Then, a sequence-level processing step estimates the 3D curbs in the
+reconstructed point cloud using the odometry of the vehicle. From these 3D
+curb points, we obtain polylines structured following the ASAM OpenLABEL
+standard. These detections can be used as pre-annotations in labelling
+pipelines to efficiently generate curb-related ground truth data. We validate
+our approach through an experiment in which different human annotators were
+required to annotate curbs in a group of LiDAR-based sequences with and
+without our automatically generated pre-annotations. The results show that the
+manual annotation time is reduced by 50.99% thanks to our detections, while
+maintaining the same level of data quality.
+
+
+
+
+
+
+ + ☆ DeepDR: Deep Structure-Aware RGB-D Inpainting for Diminished Reality 3DV 2024 + + +
+ Diminished reality (DR) refers to the removal of real objects from the +environment by virtually replacing them with their background. Modern DR +frameworks use inpainting to hallucinate unobserved regions. While recent deep +learning-based inpainting is promising, the DR use case is complicated by the +need to generate coherent structure and 3D geometry (i.e., depth), in +particular for advanced applications, such as 3D scene editing. In this paper, +we propose DeepDR, a first RGB-D inpainting framework fulfilling all +requirements of DR: Plausible image and geometry inpainting with coherent +structure, running at real-time frame rates, with minimal temporal artifacts. +Our structure-aware generative network allows us to explicitly condition color +and depth outputs on the scene semantics, overcoming the difficulty of +reconstructing sharp and consistent boundaries in regions with complex +backgrounds. Experimental results show that the proposed framework can +outperform related work qualitatively and quantitatively. + +
+
+ comment: 11 pages, 8 figures + 13 pages, 10 figures supplementary. Accepted at + 3DV 2024 +
+
+
+
+
+ + ☆ Algorithm-based diagnostic application for diabetic retinopathy + detection + + +
+ Diabetic retinopathy (DR) is a growing health problem worldwide and is a +leading cause of visual impairment and blindness, especially among working +people aged 20-65. Its incidence is increasing along with the number of +diabetes cases, and it is more common in developed countries than in developing +countries. Recent research in the field of diabetic retinopathy diagnosis is +using advanced technologies, such as analysis of images obtained by +ophthalmoscopy. Automatic methods for analyzing eye images based on neural +networks, deep learning and image analysis algorithms can improve the +efficiency of diagnosis. This paper describes an automatic DR diagnosis method +that includes processing and analysis of ophthalmoscopic images of the eye. It +uses morphological algorithms to identify the optic disc and lesions +characteristic of DR, such as microaneurysms, hemorrhages and exudates. +Automated DR diagnosis has the potential to improve the efficiency of early +detection of this disease and contribute to reducing the number of cases of +diabetes-related visual impairment. The final step was to create an application +with a graphical user interface that allowed retinal images taken at +cooperating ophthalmology offices to be uploaded to the server. These images +were then analyzed using a developed algorithm to make a diagnosis. + +
+
+ comment: 18 pages, 9 figures, preprint +
+
+
+
+
+ + ☆ Global Localization: Utilizing Relative Spatio-Temporal Geometric + Constraints from Adjacent and Distant Cameras IROS + + +
+ Re-localizing a camera from a single image in a previously mapped area is +vital for many computer vision applications in robotics and augmented/virtual +reality. In this work, we address the problem of estimating the 6 DoF camera +pose relative to a global frame from a single image. We propose to leverage a +novel network of relative spatial and temporal geometric constraints to guide +the training of a Deep Network for localization. We employ simultaneously +spatial and temporal relative pose constraints that are obtained not only from +adjacent camera frames but also from camera frames that are distant in the +spatio-temporal space of the scene. We show that our method, through these +constraints, is capable of learning to localize when little or very sparse +ground-truth 3D coordinates are available. In our experiments, this is less +than 1% of available ground-truth data. We evaluate our method on 3 common +visual localization datasets and show that it outperforms other direct pose +estimation methods. + +
+
+ comment: To be published in the proceedings of IEEE/RSJ International + Conference on Intelligent Robots and Systems (IROS) 2023 +
+
+
+
+
+ + ☆ Explainable AI in Diagnosing and Anticipating Leukemia Using Transfer + Learning Method + + +
+ This research paper focuses on Acute Lymphoblastic Leukemia (ALL), a form of +blood cancer prevalent in children and teenagers, characterized by the rapid +proliferation of immature white blood cells (WBCs). These atypical cells can +overwhelm healthy cells, leading to severe health consequences. Early and +accurate detection of ALL is vital for effective treatment and improving +survival rates. Traditional diagnostic methods are time-consuming, costly, and +prone to errors. The paper proposes an automated detection approach using +computer-aided diagnostic (CAD) models, leveraging deep learning techniques to +enhance the accuracy and efficiency of leukemia diagnosis. The study utilizes +various transfer learning models like ResNet101V2, VGG19, InceptionV3, and +InceptionResNetV2 for classifying ALL. The methodology includes using the Local +Interpretable Model-Agnostic Explanations (LIME) for ensuring the validity and +reliability of the AI system's predictions. This approach is critical for +overcoming the "black box" nature of AI, where decisions made by models are +often opaque and unaccountable. The paper highlights that the proposed method +using the InceptionV3 model achieved an impressive 98.38% accuracy, +outperforming other tested models. The results, verified by the LIME algorithm, +showcase the potential of this method in accurately identifying ALL, providing +a valuable tool for medical practitioners. The research underscores the impact +of explainable artificial intelligence (XAI) in medical diagnostics, paving the +way for more transparent and trustworthy AI applications in healthcare. + +
+
+
+
+
+ + ☆ Unfolder: Fast localization and image rectification of a document with a + crease from folding in half + + +
+ Presentation of folded documents is not an uncommon case in modern society.
+Digitizing such documents by capturing them with a smartphone camera can be
+tricky since a crease can divide the document contents into separate planes.
+To unfold the document, one could hold the edges, potentially obscuring it in
+a captured image. While there are many geometrical rectification methods, they
+were usually developed for arbitrary bends and folds. We consider such
+algorithms and propose a novel approach, Unfolder, developed specifically for
+images of documents with a crease from folding in half. Unfolder is robust to
+projective distortions of the document image and does not fragment the image
+in the vicinity of a crease after rectification. A new Folded Document Images
+dataset was created to investigate the rectification accuracy of folded (2, 3,
+4, and 8 folds) documents. The dataset includes 1600 images captured when the
+document was placed on a table and when it was held in hand. The Unfolder
+algorithm achieved a recognition error rate of 0.33, which is better than the
+advanced neural network methods DocTr (0.44) and DewarpNet (0.57). The average
+runtime for Unfolder was only 0.25 s/image on an iPhone XR.
+
+
+
+ comment: This is a preprint of the article accepted for publication in the + journal "Computer Optics" +
+
+
+
+
+ + ☆ Learning Unorthogonalized Matrices for Rotation Estimation + + +
+ Estimating 3D rotations is a common procedure for 3D computer vision. The +accuracy depends heavily on the rotation representation. One form of +representation -- rotation matrices -- is popular due to its continuity, +especially for pose estimation tasks. The learning process usually incorporates +orthogonalization to ensure orthonormal matrices. Our work reveals, through +gradient analysis, that common orthogonalization procedures based on the +Gram-Schmidt process and singular value decomposition will slow down training +efficiency. To this end, we advocate removing orthogonalization from the +learning process and learning unorthogonalized `Pseudo' Rotation Matrices +(PRoM). An optimization analysis shows that PRoM converges faster and to a +better solution. By replacing the orthogonalization incorporated representation +with our proposed PRoM in various rotation-related tasks, we achieve +state-of-the-art results on large-scale benchmarks for human pose estimation. + +
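As a rough sketch of the contrast drawn in the abstract (not the authors' implementation), the common 6D representation applies Gram-Schmidt orthogonalization to the network output, whereas an unorthogonalized "pseudo" rotation head simply regresses nine values and skips that step during training; the loss shown is an assumed placeholder.

```python
import torch
import torch.nn.functional as F

def gram_schmidt_6d(x6):
    """Standard 6D -> rotation matrix via Gram-Schmidt (the step PRoM removes)."""
    a1, a2 = x6[..., :3], x6[..., 3:]
    b1 = F.normalize(a1, dim=-1)
    b2 = F.normalize(a2 - (b1 * a2).sum(-1, keepdim=True) * b1, dim=-1)
    b3 = torch.cross(b1, b2, dim=-1)
    return torch.stack([b1, b2, b3], dim=-2)      # (..., 3, 3), orthonormal rows

def prom_head(x9):
    """'Pseudo' rotation matrix: reshape the raw 9 outputs, no orthogonalization."""
    return x9.view(*x9.shape[:-1], 3, 3)

R_gs = gram_schmidt_6d(torch.randn(4, 6))          # orthogonalized baseline
R_prom = prom_head(torch.randn(4, 9))              # unorthogonalized matrices
# Supervise the pseudo matrices directly against ground-truth rotations;
# an orthogonal matrix can still be recovered at test time (e.g. via SVD).
loss = F.mse_loss(R_prom, torch.eye(3).expand(4, 3, 3))
```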
+
+
+
+
+ + ☆ An Encoding Framework for Binarized Images using HyperDimensional + Computing + + +
+ Hyperdimensional Computing (HDC) is a brain-inspired and light-weight machine +learning method. It has received significant attention in the literature as a +candidate to be applied in the wearable internet of things, near-sensor +artificial intelligence applications and on-device processing. HDC is +computationally less complex than traditional deep learning algorithms and +typically achieves moderate to good classification performance. A key aspect +that determines the performance of HDC is the encoding of the input data to the +hyperdimensional (HD) space. This article proposes a novel light-weight +approach relying only on native HD arithmetic vector operations to encode +binarized images that preserves similarity of patterns at nearby locations by +using point of interest selection and local linear mapping. The method reaches +an accuracy of 97.35% on the test set for the MNIST data set and 84.12% for the +Fashion-MNIST data set. These results outperform other studies using baseline +HDC with different encoding approaches and are on par with more complex hybrid +HDC models. The proposed encoding approach also demonstrates a higher +robustness to noise and blur compared to the baseline encoding. + +
+
+
+
+
+ + ☆ Towards Generalizable Referring Image Segmentation via Target Prompt and + Visual Coherence + + +
+ Referring image segmentation (RIS) aims to segment objects in an image
+conditioned on free-form text descriptions. Despite considerable progress, it
+remains challenging for current approaches to perform well on cases with
+varied text expressions or with unseen visual entities, limiting further
+application. In this paper, we present a novel RIS approach, which
+substantially improves the generalization ability by addressing these two
+dilemmas. Specifically, to deal with unconstrained texts, we propose to boost
+a given expression with an explicit and crucial prompt, which complements the
+expression in a unified context, facilitating target capturing in the presence
+of linguistic style changes. Furthermore, we introduce a multi-modal fusion
+aggregation module with visual guidance from a powerful pretrained model to
+leverage spatial relations and pixel coherences to handle the incomplete
+target masks and false-positive irregular clumps which often appear on unseen
+visual entities. Extensive experiments are conducted in the zero-shot
+cross-dataset settings and the proposed approach achieves consistent gains
+compared to the state-of-the-art, e.g., 4.15\%, 5.45\%, and 4.64\% mIoU
+increase on RefCOCO, RefCOCO+ and ReferIt respectively, demonstrating its
+effectiveness. Additionally, the results on GraspNet-RIS show that our
+approach also generalizes well to new scenarios with large domain shifts.
+
+
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ☆ FSGS: Real-Time Few-shot View Synthesis using Gaussian Splatting + + +
+ Novel view synthesis from limited observations remains an important and +persistent task. However, high efficiency in existing NeRF-based few-shot view +synthesis is often compromised to obtain an accurate 3D representation. To +address this challenge, we propose a few-shot view synthesis framework based on +3D Gaussian Splatting that enables real-time and photo-realistic view synthesis +with as few as three training views. The proposed method, dubbed FSGS, handles +the extremely sparse initialized SfM points with a thoughtfully designed +Gaussian Unpooling process. Our method iteratively distributes new Gaussians +around the most representative locations, subsequently infilling local details +in vacant areas. We also integrate a large-scale pre-trained monocular depth +estimator within the Gaussians optimization process, leveraging online +augmented views to guide the geometric optimization towards an optimal +solution. Starting from sparse points observed from limited input viewpoints, +our FSGS can accurately grow into unseen regions, comprehensively covering the +scene and boosting the rendering quality of novel views. Overall, FSGS achieves +state-of-the-art performance in both accuracy and rendering efficiency across +diverse datasets, including LLFF, Mip-NeRF360, and Blender. Project website: +https://zehaozhu.github.io/FSGS/. + +
+
+ comment: Project page: https://zehaozhu.github.io/FSGS/ +
+
+
+
+
+ + ☆ Dolphins: Multimodal Language Model for Driving + + +
+ The quest for fully autonomous vehicles (AVs) capable of navigating complex
+real-world scenarios with human-like understanding and responsiveness
+continues. In this paper, we introduce Dolphins, a novel vision-language model
+architected to imbibe human-like abilities as a conversational driving
+assistant. Dolphins is adept at processing multimodal inputs comprising video
+(or image) data, text instructions, and historical control signals to generate
+informed outputs corresponding to the provided instructions. Building upon the
+open-sourced pretrained Vision-Language Model, OpenFlamingo, we first enhance
+Dolphins's reasoning capabilities through an innovative Grounded Chain of
+Thought (GCoT) process. We then tailor Dolphins to the driving domain by
+constructing driving-specific instruction data and conducting instruction
+tuning. Through the utilization of the BDD-X dataset, we designed and
+consolidated four distinct AV tasks into Dolphins to foster a holistic
+understanding of intricate driving scenarios. As a result, the distinctive
+features of Dolphins are characterized along two dimensions: (1) the ability to
+provide a comprehensive understanding of complex and long-tailed open-world
+driving scenarios and solve a spectrum of AV tasks, and (2) the emergence of
+human-like capabilities including gradient-free instant adaptation via
+in-context learning and error recovery via reflection.
+
+
+
+ comment: The project page is available at https://vlm-driver.github.io/ +
+
+
+
+
+ + ☆ Enhancing Image Captioning with Neural Models + + +
+ This research explores the realm of neural image captioning using deep +learning models. The study investigates the performance of different neural +architecture configurations, focusing on the inject architecture, and proposes +a novel quality metric for evaluating caption generation. Through extensive +experimentation and analysis, this work sheds light on the challenges and +opportunities in image captioning, providing insights into model behavior and +overfitting. The results reveal that while the merge models exhibit a larger +vocabulary and higher ROUGE scores, the inject architecture generates relevant +and concise image captions. The study also highlights the importance of +refining training data and optimizing hyperparameters for improved model +performance. This research contributes to the growing body of knowledge in +neural image captioning and encourages further exploration in the field, +emphasizing the democratization of artificial intelligence. + +
+
+ comment: 14 pages, 14 figures +
+
+
+
+
+ + ☆ A Low-Power Neuromorphic Approach for Efficient Eye-Tracking + + +
+ This paper introduces a neuromorphic methodology for eye tracking, harnessing
+pure event data captured by a Dynamic Vision Sensor (DVS) camera. The framework
+integrates a directly trained Spiking Neural Network (SNN) regression model and
+leverages a state-of-the-art low-power edge neuromorphic processor - Speck,
+collectively aiming to advance the precision and efficiency of eye-tracking
+systems. First, we introduce a representative event-based eye-tracking dataset,
+"Ini-30", which was collected with two glass-mounted DVS cameras from thirty
+volunteers. Then, an SNN model based on Integrate-and-Fire (IAF) neurons, named
+"Retina", is described, featuring only 64k parameters (6.63x fewer than the
+latest approach) and achieving a pupil tracking error of only 3.24 pixels on a
+64x64 DVS input. The continuous regression output is obtained by convolving a
+non-spiking temporal 1D filter across the output spiking layer. Finally, we
+evaluate Retina on the neuromorphic processor, showing an end-to-end power
+between 2.89-4.8 mW and a latency of 5.57-8.01 ms depending on the time window.
+We also benchmark our model against the latest event-based eye-tracking method,
+"3ET", which was built upon event frames. Results show that Retina achieves
+superior precision with 1.24 px less pupil centroid error and reduced
+computational complexity with 35 times fewer MAC operations. We hope this work
+will open avenues for further investigation of closed-loop neuromorphic
+solutions and true event-based training pursuing edge performance.
+
+
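One way to read the decoding step above (a non-spiking temporal 1D filter slid across the output spiking layer to produce a continuous regression) is as an ordinary 1D convolution over time applied to the spike train; the sketch below only illustrates that reading and is not the Retina implementation. The channel count, window length, and kernel size are assumptions.

```python
import torch
import torch.nn as nn

T, C = 100, 2                                   # time steps, regression channels (e.g. pupil x/y)
spikes = (torch.rand(1, C, T) < 0.1).float()    # binary spike train from the SNN output layer

# Non-spiking temporal filter: a learnable 1D convolution slid over the time axis.
temporal_filter = nn.Conv1d(in_channels=C, out_channels=C, kernel_size=9,
                            padding=4, groups=C, bias=False)
continuous_output = temporal_filter(spikes)     # (1, C, T) smooth regression signal
```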
+
+
+
+
+ + ☆ Towards Explaining Satellite Based Poverty Predictions with + Convolutional Neural Networks + + +
+ Deep convolutional neural networks (CNNs) have been shown to predict poverty
+and development indicators from satellite images with surprising accuracy. This
+paper presents a first attempt at analyzing the CNN's responses in detail and
+explaining the basis for the predictions. The CNN model, while trained on
+relatively low-resolution day- and night-time satellite images, is able to
+outperform human subjects who look at high-resolution images in ranking the
+Wealth Index categories. Multiple explainability experiments performed on the
+model indicate the importance of the sizes of objects and of pixel colors in
+the image, and provide a visualization of the importance of different
+structures in input images. A visualization is also provided of the types of
+images that maximize the network's Wealth Index prediction, which offers clues
+about what the CNN bases its predictions on.
+
+
+
+ comment: Accepted to the DSAA 2023 conference, published in IEEE Xplore +
+
+
+
+
+ + ☆ Large-scale Vision-Language Models Learn Super Images for Efficient and + High-Performance Partially Relevant Video Retrieval + + +
+ In this paper, we propose an efficient and high-performance method for +partially relevant video retrieval (PRVR), which aims to retrieve untrimmed +long videos that contain at least one relevant moment to the input text query. +In terms of both efficiency and performance, the overlooked bottleneck of +previous studies is the visual encoding of dense frames. This guides +researchers to choose lightweight visual backbones, yielding sub-optimal +retrieval performance due to their limited capabilities of learned visual +representations. However, it is undesirable to simply replace them with +high-performance large-scale vision-and-language models (VLMs) due to their low +efficiency. To address these issues, instead of dense frames, we focus on super +images, which are created by rearranging the video frames in a $N \times N$ +grid layout. This reduces the number of visual encodings to $\frac{1}{N^2}$ and +compensates for the low efficiency of large-scale VLMs, allowing us to adopt +them as powerful encoders. Surprisingly, we discover that with a simple +query-image attention trick, VLMs generalize well to super images effectively +and demonstrate promising zero-shot performance against SOTA methods +efficiently. In addition, we propose a fine-tuning approach by incorporating a +few trainable modules into the VLM backbones. The experimental results +demonstrate that our approaches efficiently achieve the best performance on +ActivityNet Captions and TVR. + +
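The core data transformation described above, tiling sampled video frames into an $N \times N$ grid so that one image encoding covers $N^2$ frames, can be sketched as follows; frame sampling and the query-image attention trick are omitted.

```python
import numpy as np

def make_super_image(frames, n):
    """Tile n*n frames (each H x W x C) into one (n*H) x (n*W) x C super image."""
    frames = np.asarray(frames[: n * n])
    h, w, c = frames.shape[1:]
    grid = frames.reshape(n, n, h, w, c)      # (grid_row, grid_col, H, W, C)
    grid = grid.transpose(0, 2, 1, 3, 4)      # (grid_row, H, grid_col, W, C)
    return grid.reshape(n * h, n * w, c)

# e.g. 9 frames of 224x224 become a single 672x672 image,
# so the VLM performs one visual encoding instead of nine.
frames = np.zeros((9, 224, 224, 3), dtype=np.uint8)
print(make_super_image(frames, n=3).shape)    # (672, 672, 3)
```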
+
+ comment: 13 pages, 11 figures +
+
+
+
+
+
+ ☆ SCHEME: Scalable Channel Mixer for Vision Transformers
+
+
+
+ Vision Transformers have received significant attention due to their
+impressive performance in many vision tasks. While the token mixer or attention
+block has been studied in great detail, the channel mixer or feature mixing
+block (FFN or MLP) has not been explored in depth, although it accounts for the
+bulk of the parameters and computation in a model. In this work, we study
+whether sparse feature mixing can replace the dense connections and confirm
+this with a block diagonal MLP structure that improves the accuracy by
+supporting larger expansion ratios. To improve the feature clusters formed by
+this structure and thereby further improve the accuracy, a lightweight,
+parameter-free, channel covariance attention (CCA) mechanism is introduced as a
+parallel branch during training. This design of CCA enables gradual feature
+mixing across channel groups during training, whose contribution decays to zero
+as the training progresses to convergence. This allows the CCA block to be
+discarded during inference, thus enabling enhanced performance with no
+additional computational cost. The resulting $\textit{Scalable CHannEl MixEr}$
+(SCHEME) can be plugged into any ViT architecture to obtain a gamut of models
+with different trade-offs between complexity and performance by controlling the
+block diagonal structure size in the MLP. This is shown by the introduction of
+a new family of SCHEMEformer models. Experiments on image classification,
+object detection, and semantic segmentation, with different ViT backbones,
+consistently demonstrate substantial accuracy gains over existing designs,
+especially under lower FLOPs regimes. For example, the SCHEMEformer establishes
+a new SOTA of 79.7% accuracy for ViTs using pure attention mixers on
+ImageNet-1K at 1.77G FLOPs.
+
+
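A block diagonal channel MLP of the kind described above can be approximated by splitting channels into groups and mixing each group independently, e.g. with grouped 1x1 convolutions; the dimensions below and the omission of the CCA branch are assumptions for illustration only.

```python
import torch
import torch.nn as nn

class BlockDiagonalMLP(nn.Module):
    """Channel mixer whose weight matrices are block diagonal: each channel
    group is expanded and projected back independently (grouped 1x1 convs)."""
    def __init__(self, dim=192, groups=4, expansion=8):
        super().__init__()
        hidden = dim * expansion
        self.fc1 = nn.Conv2d(dim, hidden, kernel_size=1, groups=groups)
        self.act = nn.GELU()
        self.fc2 = nn.Conv2d(hidden, dim, kernel_size=1, groups=groups)

    def forward(self, x):                    # x: (B, dim, H, W) token grid
        return self.fc2(self.act(self.fc1(x)))

x = torch.randn(2, 192, 14, 14)
print(BlockDiagonalMLP()(x).shape)           # torch.Size([2, 192, 14, 14])
```

Because each group only mixes within itself, the parameter count at a fixed expansion ratio drops roughly by the number of groups, which is what makes larger expansion ratios affordable.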
+
+ comment: Preprint. Under review +
+
+
+
+
+ + ☆ VIoTGPT: Learning to Schedule Vision Tools towards Intelligent Video + Internet of Things + + +
+ Video Internet of Things (VIoT) has shown full potential in collecting an +unprecedented volume of video data. Learning to schedule perceiving models and +analyzing the collected videos intelligently will be potential sparks for VIoT. +In this paper, to address the challenges posed by the fine-grained and +interrelated vision tool usage of VIoT, we build VIoTGPT, the framework based +on LLMs to correctly interact with humans, query knowledge videos, and invoke +vision models to accomplish complicated tasks. To support VIoTGPT and related +future works, we meticulously crafted the training dataset and established +benchmarks involving 11 representative vision models across three categories +based on semi-automatic annotations. To guide LLM to act as the intelligent +agent towards intelligent VIoT, we resort to ReAct instruction tuning based on +the collected VIoT dataset to learn the tool capability. Quantitative and +qualitative experimental results and analyses demonstrate the effectiveness of +VIoTGPT. + +
+
+
+
+
+ + ☆ Learning to Estimate Critical Gait Parameters from Single-View RGB + Videos with Transformer-Based Attention Network + + +
+ Musculoskeletal diseases and cognitive impairments in patients lead to
+difficulties in movement as well as negative effects on their psychological
+health. Clinical gait analysis, a vital tool for early diagnosis and treatment,
+traditionally relies on expensive optical motion capture systems. Recent
+advances in computer vision and deep learning have opened the door to more
+accessible and cost-effective alternatives. This paper introduces a novel
+spatio-temporal Transformer network to estimate critical gait parameters from
+RGB videos captured by a single-view camera. Empirical evaluations on a public
+dataset of cerebral palsy patients indicate that the proposed framework
+surpasses current state-of-the-art approaches and shows significant
+improvements in predicting general gait parameters (including Walking Speed,
+Gait Deviation Index - GDI, and Knee Flexion Angle at Maximum Extension), while
+utilizing fewer parameters and alleviating the need for manual feature
+extraction.
+
+
+
+
+
+
+ + ☆ Study and Survey on Gesture Recognition Systems + + +
+ In recent years, there has been a considerable amount of research in the +Gesture Recognition domain, mainly owing to the technological advancements in +Computer Vision. Various new applications have been conceptualised and +developed in this field. This paper discusses the implementation of gesture +recognition systems in multiple sectors such as gaming, healthcare, home +appliances, industrial robots, and virtual reality. Different methodologies for +capturing gestures are compared and contrasted throughout this survey. Various +data sources and data acquisition techniques have been discussed. The role of +gestures in sign language has been studied and existing approaches have been +reviewed. Common challenges faced while building gesture recognition systems +have also been explored. + +
+
+ comment: 6 pages, accepted at the ICCUBEA, IEEE 2023 conference +
+
+
+
+
+ + ☆ Partition-based K-space Synthesis for Multi-contrast Parallel Imaging + + +
+ Multi-contrast magnetic resonance imaging is a significant and essential
+medical imaging technique. However, multi-contrast imaging requires longer
+acquisition times and is prone to motion artifacts. In particular, the
+acquisition time for a T2-weighted image is prolonged due to its longer
+repetition time (TR), whereas a T1-weighted image has a shorter TR. Therefore,
+utilizing complementary information across T1- and T2-weighted images is a way
+to decrease the overall imaging time. Previous T1-assisted T2 reconstruction
+methods have mostly focused on the image domain, using whole-image fusion
+approaches. Image-domain reconstruction suffers from high computational
+complexity and limited flexibility. To address this issue, we propose a novel
+multi-contrast imaging method called partition-based k-space synthesis (PKS),
+which can achieve superior reconstruction quality of the T2-weighted image
+through feature fusion. Concretely, we first decompose the fully sampled T1
+k-space data and the under-sampled T2 k-space data into two sub-datasets each.
+Two new objects are then constructed by combining the T1 and T2 sub-data, and
+these two objects are used as the whole data to reconstruct the T2-weighted
+image. Finally, the target T2 is synthesized by extracting the sub-T2 data of
+each part. Experimental results show that our combined technique achieves
+comparable or better results than traditional k-space parallel imaging (SAKE),
+which processes each contrast independently.
+
+
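The abstract leaves the exact partition scheme unspecified; purely to illustrate the idea of splitting T1 and T2 k-space into sub-data and recombining them into two hybrid objects, a sketch under an assumed even/odd phase-encode split (not the paper's definition) could look like this:

```python
import numpy as np

def partition_and_combine(t1_kspace, t2_kspace):
    """Split each k-space into two sub-datasets (here: even/odd phase-encode
    lines, an illustrative choice) and build two hybrid T1/T2 objects."""
    t1_even, t1_odd = t1_kspace[0::2], t1_kspace[1::2]
    t2_even, t2_odd = t2_kspace[0::2], t2_kspace[1::2]
    # Two new objects mixing complementary T1 and T2 sub-data.
    obj_a = np.concatenate([t1_even, t2_odd], axis=0)
    obj_b = np.concatenate([t2_even, t1_odd], axis=0)
    return obj_a, obj_b

t1 = np.random.randn(256, 256) + 1j * np.random.randn(256, 256)
t2 = np.random.randn(256, 256) + 1j * np.random.randn(256, 256)
obj_a, obj_b = partition_and_combine(t1, t2)   # reconstructed jointly downstream
```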
+
+
+
+
+ + ☆ Local monotone operator learning using non-monotone operators: MnM-MOL + + +
+ The recovery of magnetic resonance (MR) images from undersampled measurements +is a key problem that has seen extensive research in recent years. Unrolled +approaches, which rely on end-to-end training of convolutional neural network +(CNN) blocks within iterative reconstruction algorithms, offer state-of-the-art +performance. These algorithms require a large amount of memory during training, +making them difficult to employ in high-dimensional applications. Deep +equilibrium (DEQ) models and the recent monotone operator learning (MOL) +approach were introduced to eliminate the need for unrolling, thus reducing the +memory demand during training. Both approaches require a Lipschitz constraint +on the network to ensure that the forward and backpropagation iterations +converge. Unfortunately, the constraint often results in reduced performance +compared to unrolled methods. The main focus of this work is to relax the +constraint on the CNN block in two different ways. Inspired by +convex-non-convex regularization strategies, we now impose the monotone +constraint on the sum of the gradient of the data term and the CNN block, +rather than constrain the CNN itself to be a monotone operator. This approach +enables the CNN to learn possibly non-monotone score functions, which can +translate to improved performance. In addition, we only restrict the operator +to be monotone in a local neighborhood around the image manifold. Our +theoretical results show that the proposed algorithm is guaranteed to converge +to the fixed point and that the solution is robust to input perturbations, +provided that it is initialized close to the true solution. Our empirical +results show that the relaxed constraints translate to improved performance and +that the approach enjoys robustness to input perturbations similar to MOL. + +
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ☆ SynFundus: Generating a synthetic fundus images dataset with millions of + samples and multi-disease annotations + + +
+ In the field of medical imaging, the scarcity of large-scale datasets due to
+privacy restrictions stands as a significant barrier to developing large models
+for medical applications. To address this issue, we introduce SynFundus-1M, a
+high-quality synthetic dataset with over 1 million retinal fundus images and
+extensive disease and pathology annotations, which is generated by a Denoising
+Diffusion Probabilistic Model. The SynFundus-Generator and SynFundus-1M achieve
+superior Frechet Inception Distance (FID) scores compared to existing methods
+on mainstream public real datasets. Furthermore, evaluation by ophthalmologists
+validates the difficulty of discerning these synthetic images from real ones,
+confirming the authenticity of SynFundus-1M. Through extensive experiments, we
+demonstrate that both CNNs and ViTs can benefit from SynFundus-1M by
+pretraining or direct training. Compared to datasets like ImageNet or EyePACS,
+models trained on SynFundus-1M not only achieve better performance but also
+converge faster on various downstream tasks.
+
+
+
+
+
+
+ + ☆ Text-Guided 3D Face Synthesis -- From Generation to Editing + + +
+ Text-guided 3D face synthesis has achieved remarkable results by leveraging +text-to-image (T2I) diffusion models. However, most existing works focus solely +on the direct generation, ignoring the editing, restricting them from +synthesizing customized 3D faces through iterative adjustments. In this paper, +we propose a unified text-guided framework from face generation to editing. In +the generation stage, we propose a geometry-texture decoupled generation to +mitigate the loss of geometric details caused by coupling. Besides, decoupling +enables us to utilize the generated geometry as a condition for texture +generation, yielding highly geometry-texture aligned results. We further employ +a fine-tuned texture diffusion model to enhance texture quality in both RGB and +YUV space. In the editing stage, we first employ a pre-trained diffusion model +to update facial geometry or texture based on the texts. To enable sequential +editing, we introduce a UV domain consistency preservation regularization, +preventing unintentional changes to irrelevant facial attributes. Besides, we +propose a self-guided consistency weight strategy to improve editing efficacy +while preserving consistency. Through comprehensive experiments, we showcase +our method's superiority in face synthesis. Project page: +https://faceg2e.github.io/. + +
+
+
+
+
+ + ☆ Benchmarking Multi-Domain Active Learning on Image Classification + + +
+ Active learning aims to enhance model performance by strategically labeling +informative data points. While extensively studied, its effectiveness on +large-scale, real-world datasets remains underexplored. Existing research +primarily focuses on single-source data, ignoring the multi-domain nature of +real-world data. We introduce a multi-domain active learning benchmark to +bridge this gap. Our benchmark demonstrates that traditional single-domain +active learning strategies are often less effective than random selection in +multi-domain scenarios. We also introduce CLIP-GeoYFCC, a novel large-scale +image dataset built around geographical domains, in contrast to existing +genre-based domain datasets. Analysis on our benchmark shows that all +multi-domain strategies exhibit significant tradeoffs, with no strategy +outperforming across all datasets or all metrics, emphasizing the need for +future research. + +
+
+
+
+
+ + ☆ Dancing with Images: Video Distillation via Static-Dynamic + Disentanglement + + +
+ Recently, dataset distillation has paved the way towards efficient machine
+learning, especially for image datasets. However, the distillation for videos,
+characterized by an exclusive temporal dimension, remains an underexplored
+domain. In this work, we provide the first systematic study of video
+distillation and introduce a taxonomy to categorize temporal compression. Our
+investigation reveals that the temporal information is usually not well learned
+during distillation, and the temporal dimension of synthetic data contributes
+little. These observations motivate our unified framework for disentangling the
+dynamic and static information in the videos. It first distills the videos into
+still images as static memory and then compensates for the dynamic and motion
+information with a learnable dynamic memory block. Our method achieves
+state-of-the-art results on video datasets at different scales, with notably
+smaller storage expenditure. Our code will be publicly available.
+
+
+
+
+
+
+ + ☆ Efficient Multimodal Semantic Segmentation via Dual-Prompt Learning + + +
+ Multimodal (e.g., RGB-Depth/RGB-Thermal) fusion has shown great potential for
+improving semantic segmentation in complex scenes (e.g., indoor/low-light
+conditions). Existing approaches often fully fine-tune a dual-branch
+encoder-decoder framework with a complicated feature fusion strategy for
+achieving multimodal semantic segmentation, which is training-costly due to the
+massive parameter updates in feature extraction and fusion. To address this
+issue, we propose a surprisingly simple yet effective dual-prompt learning
+network (dubbed DPLNet) for training-efficient multimodal (e.g., RGB-D/T)
+semantic segmentation. The core of DPLNet is to directly adapt a frozen
+pre-trained RGB model to multimodal semantic segmentation, reducing parameter
+updates. For this purpose, we present two prompt learning modules, comprising a
+multimodal prompt generator (MPG) and a multimodal feature adapter (MFA). MPG
+works to fuse the features from different modalities in a compact manner and is
+inserted from shallow to deep stages to generate the multi-level multimodal
+prompts that are injected into the frozen backbone, while MFA adapts the
+prompted multimodal features in the frozen backbone for better multimodal
+semantic segmentation. Since both the MPG and MFA are lightweight, only a few
+trainable parameters (3.88M, 4.4% of the pre-trained backbone parameters) are
+introduced for multimodal feature fusion and learning. Using a simple decoder
+(3.27M parameters), DPLNet achieves new state-of-the-art performance or is on a
+par with other complex approaches on four RGB-D/T semantic segmentation
+datasets while satisfying parameter efficiency. Moreover, we show that DPLNet
+is general and applicable to other multimodal tasks such as salient object
+detection and video semantic segmentation. Without special design, DPLNet
+outperforms many complicated models. Our code will be available at
+github.com/ShaohuaDong2021/DPLNet.
+
+
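As a loose sketch of the prompt-learning idea above (not the authors' code), a multimodal prompt generator can compress the auxiliary-modality features through a small bottleneck and add the result to the frozen RGB backbone's features at a given stage, so only the tiny generator is trained; all layer sizes below are assumptions.

```python
import torch
import torch.nn as nn

class MultimodalPromptGenerator(nn.Module):
    """Small trainable module that fuses RGB and auxiliary (e.g. depth/thermal)
    features into a prompt added to the frozen backbone's RGB features."""
    def __init__(self, dim, bottleneck=16):
        super().__init__()
        self.down = nn.Conv2d(2 * dim, bottleneck, kernel_size=1)
        self.up = nn.Conv2d(bottleneck, dim, kernel_size=1)

    def forward(self, rgb_feat, aux_feat):
        fused = torch.cat([rgb_feat, aux_feat], dim=1)
        prompt = self.up(torch.relu(self.down(fused)))
        return rgb_feat + prompt                  # injected back into the frozen stage

rgb = torch.randn(1, 96, 56, 56)                  # frozen-backbone stage features
aux = torch.randn(1, 96, 56, 56)                  # depth/thermal stage features
prompted = MultimodalPromptGenerator(96)(rgb, aux)
```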
+
+
+
+
+ + ☆ Impact of Data Augmentation on QCNNs + + +
+ In recent years, classical Convolutional Neural Networks (CNNs) have been
+applied successfully to image recognition. Quantum Convolutional Neural
+Networks (QCNNs) have been proposed as a novel generalization of CNNs that uses
+quantum mechanisms. These quantum mechanisms lead to an efficient training
+process in QCNNs by reducing the size of the input from $N$ to $\log_2 N$. This
+paper implements and compares both CNNs and QCNNs by testing losses and
+prediction accuracy on three commonly used datasets. The datasets include the
+MNIST hand-written digits, Fashion-MNIST, and cat/dog face images.
+Additionally, data augmentation (DA), a technique commonly used in CNNs to
+improve classification performance by generating similar images based on the
+original inputs, is also implemented in QCNNs. Surprisingly, the results showed
+that data augmentation did not improve QCNN performance. The reasons and logic
+behind this result are discussed, hoping to expand our understanding of quantum
+machine learning theory.
+
+
+
+ comment: 14 pages, 9 figures +
+
+
+
+
+ + ☆ A Generalizable Deep Learning System for Cardiac MRI + + +
+ Cardiac MRI allows for a comprehensive assessment of myocardial structure, +function, and tissue characteristics. Here we describe a foundational vision +system for cardiac MRI, capable of representing the breadth of human +cardiovascular disease and health. Our deep learning model is trained via +self-supervised contrastive learning, by which visual concepts in cine-sequence +cardiac MRI scans are learned from the raw text of the accompanying radiology +reports. We train and evaluate our model on data from four large academic +clinical institutions in the United States. We additionally showcase the +performance of our models on the UK BioBank, and two additional publicly +available external datasets. We explore emergent zero-shot capabilities of our +system, and demonstrate remarkable performance across a range of tasks; +including the problem of left ventricular ejection fraction regression, and the +diagnosis of 35 different conditions such as cardiac amyloidosis and +hypertrophic cardiomyopathy. We show that our deep learning system is capable +of not only understanding the staggering complexity of human cardiovascular +disease, but can be directed towards clinical problems of interest yielding +impressive, clinical grade diagnostic accuracy with a fraction of the training +data typically required for such tasks. + +
+
+ comment: 21 page main manuscript, 4 figures. Supplementary Appendix and code + will be made available on publication +
+
+
+
+
+ + ☆ Manipulating the Label Space for In-Context Classification + + +
+ After pre-training by generating the next word conditioned on previous words,
+the Language Model (LM) acquires the ability of In-Context Learning (ICL),
+i.e., learning a new task conditioned on the context of the given in-context
+examples (ICEs). Similarly, visually-conditioned language modelling is also
+used to train Vision-Language Models (VLMs) with ICL ability. However, such
+VLMs typically exhibit weaker classification abilities compared to contrastive
+learning-based models like CLIP, since the language modelling objective does
+not directly contrast whether an object is paired with a text. To improve the
+ICL of classification, using more ICEs to provide more knowledge is a
+straightforward way. However, this may largely increase the selection time,
+and, more importantly, the inclusion of additional in-context images tends to
+extend the length of the in-context sequence beyond the processing capacity of
+a VLM. To alleviate these limitations, we propose to manipulate the label space
+of each ICE to increase its knowledge density, allowing fewer ICEs to convey as
+much information as a larger set would. Specifically, we propose two
+strategies, Label Distribution Enhancement and Visual Descriptions Enhancement,
+to improve in-context classification performance on diverse datasets, including
+the classic ImageNet and more fine-grained datasets like CUB-200. For example,
+using our approach on ImageNet, we increase accuracy from 74.70\% in a 4-shot
+setting to 76.21\% with just 2 shots, surpassing CLIP by 0.67\%. On CUB-200,
+our method raises 1-shot accuracy from 48.86\% to 69.05\%, 12.15\% higher than
+CLIP. The code is given in https://anonymous.4open.science/r/MLS_ICC.
+
+
+
+
+
+
+ + ☆ Student Activity Recognition in Classroom Environments using Transfer + Learning + + +
+ The recent advances in artificial intelligence and deep learning facilitate +automation in various applications including home automation, smart +surveillance systems, and healthcare among others. Human Activity Recognition +is one of its emerging applications, which can be implemented in a classroom +environment to enhance safety, efficiency, and overall educational quality. +This paper proposes a system for detecting and recognizing the activities of +students in a classroom environment. The dataset has been structured and +recorded by the authors since a standard dataset for this task was not +available at the time of this study. Transfer learning, a widely adopted method +within the field of deep learning, has proven to be helpful in complex tasks +like image and video processing. Pretrained models including VGG-16, ResNet-50, +InceptionV3, and Xception are used for feature extraction and classification +tasks. Xception achieved an accuracy of 93%, on the novel classroom dataset, +outperforming the other three models in consideration. The system proposed in +this study aims to introduce a safer and more productive learning environment +for students and educators. + +
+
+ comment: 6 pages, 12 figures, accepted at the IEEE International Conference on + Computational Intelligence, Networks and Security (ICCINS) 2023 +
+
+
+
+
+ + ☆ RTQ: Rethinking Video-language Understanding Based on Image-text Model ACM MM 2023 + + +
+ Recent advancements in video-language understanding have been established on +the foundation of image-text models, resulting in promising outcomes due to the +shared knowledge between images and videos. However, video-language +understanding presents unique challenges due to the inclusion of highly complex +semantic details, which result in information redundancy, temporal dependency, +and scene complexity. Current techniques have only partially tackled these +issues, and our quantitative analysis indicates that some of these methods are +complementary. In light of this, we propose a novel framework called RTQ +(Refine, Temporal model, and Query), which addresses these challenges +simultaneously. The approach involves refining redundant information within +frames, modeling temporal relations among frames, and querying task-specific +information from the videos. Remarkably, our model demonstrates outstanding +performance even in the absence of video-language pre-training, and the results +are comparable with or superior to those achieved by state-of-the-art +pre-training methods. + +
+
+ comment: Accepted by ACM MM 2023 as an oral presentation
+
+
+
+
+
+ + ☆ OpenStereo: A Comprehensive Benchmark for Stereo Matching and Strong + Baseline + + +
+ Stereo matching, a pivotal technique in computer vision, plays a crucial role +in robotics, autonomous navigation, and augmented reality. Despite the +development of numerous impressive methods in recent years, replicating their +results and determining the most suitable architecture for practical +application remains challenging. Addressing this gap, our paper introduces a +comprehensive benchmark focusing on practical applicability rather than solely +on performance enhancement. Specifically, we develop a flexible and efficient +stereo matching codebase, called OpenStereo. OpenStereo includes training and +inference codes of more than 12 network models, making it, to our knowledge, +the most complete stereo matching toolbox available. Based on OpenStereo, we +conducted experiments on the SceneFlow dataset and have achieved or surpassed +the performance metrics reported in the original paper. Additionally, we +conduct an in-depth revisitation of recent developments in stereo matching +through ablative experiments. These investigations inspired the creation of +StereoBase, a simple yet strong baseline model. Our extensive comparative +analyses of StereoBase against numerous contemporary stereo matching methods on +the SceneFlow dataset demonstrate its remarkably strong performance. The source +code is available at https://github.com/XiandaGuo/OpenStereo. + +
+
+
+
+
+ + ☆ Learning Anatomically Consistent Embedding for Chest Radiography BMVC 2023 + + +
+ Self-supervised learning (SSL) approaches have recently shown substantial +success in learning visual representations from unannotated images. Compared +with photographic images, medical images acquired with the same imaging +protocol exhibit high consistency in anatomy. To exploit this anatomical +consistency, this paper introduces a novel SSL approach, called PEAC (patch +embedding of anatomical consistency), for medical image analysis. Specifically, +in this paper, we propose to learn global and local consistencies via stable +grid-based matching, transfer pre-trained PEAC models to diverse downstream +tasks, and extensively demonstrate that (1) PEAC achieves significantly better +performance than the existing state-of-the-art fully/self-supervised methods, +and (2) PEAC captures the anatomical structure consistency across views of the +same patient and across patients of different genders, weights, and healthy +statuses, which enhances the interpretability of our method for medical image +analysis. + +
+
+ comment: BMVC 2023, oral +
+
+
+
+
+ + ☆ StyleCrafter: Enhancing Stylized Text-to-Video Generation with Style + Adapter + + +
+ Text-to-video (T2V) models have shown remarkable capabilities in generating +diverse videos. However, they struggle to produce user-desired stylized videos +due to (i) text's inherent clumsiness in expressing specific styles and (ii) +the generally degraded style fidelity. To address these challenges, we +introduce StyleCrafter, a generic method that enhances pre-trained T2V models +with a style control adapter, enabling video generation in any style by +providing a reference image. Considering the scarcity of stylized video +datasets, we propose to first train a style control adapter using style-rich +image datasets, then transfer the learned stylization ability to video +generation through a tailor-made finetuning paradigm. To promote content-style +disentanglement, we remove style descriptions from the text prompt and extract +style information solely from the reference image using a decoupling learning +strategy. Additionally, we design a scale-adaptive fusion module to balance the +influences of text-based content features and image-based style features, which +helps generalization across various text and style combinations. StyleCrafter +efficiently generates high-quality stylized videos that align with the content +of the texts and resemble the style of the reference images. Experiments +demonstrate that our approach is more flexible and efficient than existing +competitors. + +
+
+ comment: Project page: https://gongyeliu.github.io/StyleCrafter.github.io/ ; + GitHub repository: https://github.com/GongyeLiu/StyleCrafter +
+
+
+
+
+ + ☆ Improving Efficiency of DNN-based Relocalization Module for Autonomous + Driving with Server-side Computing + + +
+ In this work, we present a novel framework for camera relocation in +autonomous vehicles, leveraging deep neural networks (DNN). While existing +literature offers various DNN-based camera relocation methods, their deployment +is hindered by their high computational demands during inference. In contrast, +our approach addresses this challenge through edge cloud collaboration. +Specifically, we strategically offload certain modules of the neural network to +the server and evaluate the inference time of data frames under different +network segmentation schemes to guide our offloading decisions. Our findings +highlight the vital role of server-side offloading in DNN-based camera +relocation for autonomous vehicles, and we also discuss the results of data +fusion. Finally, we validate the effectiveness of our proposed framework +through experimental evaluation. + +
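As a toy illustration of the offloading decision described above (evaluating candidate split points of the network and choosing where to hand off to the server), a sketch with an assumed server speed-up and fixed uplink delay might be:

```python
import time
import torch
import torch.nn as nn

modules = nn.ModuleList([nn.Conv2d(3, 16, 3, padding=1),
                         nn.Conv2d(16, 32, 3, padding=1),
                         nn.Conv2d(32, 64, 3, padding=1)])

def split_latency(x, split, server_speedup=4.0, uplink_s=0.01):
    """Run modules[:split] 'on device'; model the rest as server time plus a
    fixed uplink delay. The timing model is purely illustrative."""
    t0 = time.perf_counter()
    for m in modules[:split]:
        x = m(x)
    device_time = time.perf_counter() - t0
    t1 = time.perf_counter()
    for m in modules[split:]:
        x = m(x)
    server_time = (time.perf_counter() - t1) / server_speedup
    return device_time + uplink_s + server_time

x = torch.randn(1, 3, 224, 224)
best = min(range(len(modules) + 1), key=lambda s: split_latency(x.clone(), s))
print("best split index:", best)
```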
+
+
+
+
+ + ☆ Improving Normalization with the James-Stein Estimator + + +
+ Stein's paradox holds considerable sway in high-dimensional statistics, +highlighting that the sample mean, traditionally considered the de facto +estimator, might not be the most efficacious in higher dimensions. To address +this, the James-Stein estimator proposes an enhancement by steering the sample +means toward a more centralized mean vector. In this paper, first, we establish +that normalization layers in deep learning use inadmissible estimators for mean +and variance. Next, we introduce a novel method to employ the James-Stein +estimator to improve the estimation of mean and variance within normalization +layers. We evaluate our method on different computer vision tasks: image +classification, semantic segmentation, and 3D object classification. Through +these evaluations, it is evident that our improved normalization layers +consistently yield superior accuracy across all tasks without extra +computational burden. Moreover, recognizing that a plethora of shrinkage +estimators surpass the traditional estimator in performance, we study two other +prominent shrinkage estimators: Ridge and LASSO. Additionally, we provide +visual representations to intuitively demonstrate the impact of shrinkage on +the estimated layer statistics. Finally, we study the effect of regularization +and batch size on our modified batch normalization. The studies show that our +method is less sensitive to batch size and regularization, improving accuracy +under various setups. + +
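The exact estimator is not given in the abstract. As a loosely hedged illustration of the general idea (shrinking the per-channel batch means toward their grand mean with a James-Stein-style factor before normalizing), one could sketch the following; the shrinkage target and noise-variance estimate are assumptions.

```python
import torch

def js_shrink(means, noise_var):
    """Positive-part James-Stein shrinkage of per-channel means toward their
    grand mean. Illustrative only; the paper's estimator may differ."""
    d = means.numel()
    center = means.mean()
    resid = means - center
    factor = 1.0 - (d - 2) * noise_var / resid.pow(2).sum().clamp_min(1e-12)
    return center + factor.clamp_min(0.0) * resid

def batchnorm_with_js(x, eps=1e-5):
    """Batch-norm style normalization using shrunken channel means. x: (B, C, H, W)."""
    mu = x.mean(dim=(0, 2, 3))                       # ordinary per-channel means
    var = x.var(dim=(0, 2, 3), unbiased=False)
    n = x.shape[0] * x.shape[2] * x.shape[3]
    mu_js = js_shrink(mu, noise_var=var.mean() / n)  # variance of a sample mean
    return (x - mu_js.view(1, -1, 1, 1)) / torch.sqrt(var.view(1, -1, 1, 1) + eps)

out = batchnorm_with_js(torch.randn(8, 32, 16, 16))
```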
+
+
+
+
+ + ☆ Segment Anything Model-guided Collaborative Learning Network for + Scribble-supervised Polyp Segmentation + + +
+ Polyp segmentation plays a vital role in accurately locating polyps at an +early stage, which holds significant clinical importance for the prevention of +colorectal cancer. Various polyp segmentation methods have been developed using +fully-supervised deep learning techniques. However, pixel-wise annotation for +polyp images by physicians during the diagnosis is both time-consuming and +expensive. Moreover, visual foundation models such as the Segment Anything +Model (SAM) have shown remarkable performance. Nevertheless, directly applying +SAM to medical segmentation may not produce satisfactory results due to the +inherent absence of medical knowledge. In this paper, we propose a novel +SAM-guided Collaborative Learning Network (SAM-CLNet) for scribble-supervised +polyp segmentation, enabling a collaborative learning process between our +segmentation network and SAM to boost the model performance. Specifically, we +first propose a Cross-level Enhancement and Aggregation Network (CEA-Net) for +weakly-supervised polyp segmentation. Within CEA-Net, we propose a Cross-level +Enhancement Module (CEM) that integrates the adjacent features to enhance the +representation capabilities of different resolution features. Additionally, a +Feature Aggregation Module (FAM) is employed to capture richer features across +multiple levels. Moreover, we present a box-augmentation strategy that combines +the segmentation maps generated by CEA-Net with scribble annotations to create +more precise prompts. These prompts are then fed into SAM, generating +segmentation SAM-guided masks, which can provide additional supervision to +train CEA-Net effectively. Furthermore, we present an Image-level Filtering +Mechanism to filter out unreliable SAM-guided masks. Extensive experimental +results show that our SAM-CLNet outperforms state-of-the-art weakly-supervised +segmentation methods. + +
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ☆ 3D Face Reconstruction with the Geometric Guidance of Facial Part + Segmentation + + +
+ 3D Morphable Models (3DMMs) provide promising 3D face reconstructions in +various applications. However, existing methods struggle to reconstruct faces +with extreme expressions due to deficiencies in supervisory signals, such as +sparse or inaccurate landmarks. Segmentation information contains effective +geometric contexts for face reconstruction. Certain attempts intuitively depend +on differentiable renderers to compare the rendered silhouettes of +reconstruction with segmentation, which is prone to issues like local optima +and gradient instability. In this paper, we fully utilize the facial part +segmentation geometry by introducing Part Re-projection Distance Loss (PRDL). +Specifically, PRDL transforms facial part segmentation into 2D points and +re-projects the reconstruction onto the image plane. Subsequently, by +introducing grid anchors and computing different statistical distances from +these anchors to the point sets, PRDL establishes geometry descriptors to +optimize the distribution of the point sets for face reconstruction. PRDL +exhibits a clear gradient compared to the renderer-based methods and presents +state-of-the-art reconstruction performance in extensive quantitative and +qualitative experiments. The project will be publicly available. + +
+
+
+
+
+ + ☆ A knowledge-based data-driven (KBDD) framework for all-day + identification of cloud types using satellite remote sensing + + +
+ Cloud types, as a type of meteorological data, are of particular significance
+for evaluating changes in rainfall, heatwaves, water resources, floods and
+droughts, food security and vegetation cover, as well as land use. In order to
+effectively utilize high-resolution geostationary observations, a
+knowledge-based data-driven (KBDD) framework for all-day identification of
+cloud types based on spectral information from Himawari-8/9 satellite sensors
+is designed, and a novel, simple, and efficient network, named CldNet, is
+proposed. Compared with widely used semantic segmentation networks, including
+SegNet, PSPNet, DeepLabV3+, UNet, and ResUnet, our proposed CldNet achieves a
+state-of-the-art accuracy of 80.89±2.18% in identifying cloud types, improving
+over these networks by 32%, 46%, 22%, 2%, and 39%, respectively. With the
+assistance of auxiliary information (e.g., satellite zenith/azimuth angle, solar
+zenith/azimuth angle), the accuracy of CldNet-W using visible and near-infrared
+bands and CldNet-O not using visible and near-infrared bands on the test
+dataset is 82.23±2.14% and 73.21±2.02%, respectively. Meanwhile, the total
+parameters of CldNet are only 0.46M, making it easy for edge deployment. More
+importantly, the trained CldNet without any fine-tuning can predict cloud types
+with higher spatial resolution using satellite spectral data with spatial
+resolution 0.02{\deg}*0.02{\deg}, which indicates that CldNet possesses a
+strong generalization ability. In aggregate, the KBDD framework using CldNet is
+a highly effective cloud-type identification system capable of providing a
+high-fidelity, all-day, spatiotemporal cloud-type database for many climate
+assessment fields.
+

+
+
+
+
+ + ☆ RadioGalaxyNET: Dataset and Novel Computer Vision Algorithms for the + Detection of Extended Radio Galaxies and Infrared Hosts + + +
+ Creating radio galaxy catalogues from next-generation deep surveys requires +automated identification of associated components of extended sources and their +corresponding infrared hosts. In this paper, we introduce RadioGalaxyNET, a +multimodal dataset, and a suite of novel computer vision algorithms designed to +automate the detection and localization of multi-component extended radio +galaxies and their corresponding infrared hosts. The dataset comprises 4,155 +instances of galaxies in 2,800 images with both radio and infrared channels. +Each instance provides information about the extended radio galaxy class, its +corresponding bounding box encompassing all components, the pixel-level +segmentation mask, and the keypoint position of its corresponding infrared host +galaxy. RadioGalaxyNET is the first dataset to include images from the highly +sensitive Australian Square Kilometre Array Pathfinder (ASKAP) radio telescope, +corresponding infrared images, and instance-level annotations for galaxy +detection. We benchmark several object detection algorithms on the dataset and +propose a novel multimodal approach to simultaneously detect radio galaxies and +the positions of infrared hosts. + +
+
+ comment: Accepted for publication in PASA. The paper has 17 pages, 6 figures, + 5 tables +
+
+
+
+
+ + ☆ Developmental Pretraining (DPT) for Image Classification Networks + + +
+ Against the backdrop of the increasing data requirements of Deep Neural
+Networks for object recognition, which are growing more untenable by the day,
+we present Developmental PreTraining (DPT) as a possible solution. DPT is a
+curriculum-based pre-training approach designed to rival traditional
+pre-training techniques that are data-hungry. These traditional approaches also
+introduce unnecessary features that could be misleading when the network is
+employed in a downstream classification task where the data is sufficiently
+different from the pre-training data and is scarce. We design the curriculum
+for DPT by drawing inspiration from human infant visual development. DPT
+employs a phased approach where carefully-selected primitive and universal
+features like edges and shapes are taught to the network participating in our
+pre-training regime. A model that underwent the DPT regime is tested against
+models with randomised weights to evaluate the viability of DPT.
+

+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ☆ QIENet: Quantitative irradiance estimation network using recurrent + neural network based on satellite remote sensing data + + +
+ Global horizontal irradiance (GHI) plays a vital role in estimating solar +energy resources, which are used to generate sustainable green energy. In order +to estimate GHI with high spatial resolution, a quantitative irradiance +estimation network, named QIENet, is proposed. Specifically, the temporal and +spatial characteristics of remote sensing data of the satellite Himawari-8 are +extracted and fused by recurrent neural network (RNN) and convolution +operation, respectively. Not only remote sensing data, but also GHI-related +time information (hour, day, and month) and geographical information (altitude, +longitude, and latitude), are used as the inputs of QIENet. The satellite +spectral channels B07 and B11 - B15 and time are recommended as model inputs +for QIENet according to the spatial distributions of annual solar energy. +Meanwhile, QIENet is able to capture the impact of various clouds on hourly GHI +estimates. More importantly, QIENet does not overestimate ground observations +and can also reduce RMSE by 27.51%/18.00%, increase R2 by 20.17%/9.42%, and +increase r by 8.69%/3.54% compared with ERA5/NSRDB. Furthermore, QIENet is +capable of providing a high-fidelity hourly GHI database with spatial +resolution 0.02{\deg} * 0.02{\deg}(approximately 2km * 2km) for many applied +energy fields. + +
+
+
+
+
+ + ☆ Adaptability of Computer Vision at the Tactical Edge: Addressing + Environmental Uncertainty + + +
+ Computer Vision (CV) systems are increasingly being adopted into Command and
+Control (C2) systems to improve intelligence analysis on the battlefield, the
+tactical edge. CV systems leverage Artificial Intelligence (AI) algorithms to
+help visualize and interpret the environment, enhancing situational awareness.
+However, the adaptability of CV systems at the tactical edge remains
+challenging due to rapidly changing environments and objects which can confuse
+the deployed models. A CV model leveraged in this environment can become
+uncertain in its predictions, as the environment and the objects existing in
+the environment begin to change. Additionally, mission objectives can rapidly
+change, leading to adjustments in technology, camera angles, and image
+resolutions, all of which can negatively affect performance and potentially
+introduce uncertainty into the system. When the training environment and/or
+technology differs from the deployment environment, CV models can perform
+unexpectedly. Unfortunately, most scenarios at the tactical edge do not
+incorporate Uncertainty Quantification (UQ) into their deployed C2 and CV
+systems. This concept paper explores the idea of synchronizing robust data
+operations and model fine-tuning driven by UQ, all at the tactical edge.
+Specifically, this involves curating datasets and training child models based
+on the residuals of predictions, using these child models to calculate
+prediction intervals (PIs), and then using these PIs to calibrate the deployed
+models. By incorporating UQ into the core operations surrounding C2 and CV
+systems at the tactical edge, we can help drive purposeful adaptability on the
+battlefield.
+

+
+ comment: Accepted paper for the 28th annual International Command and Control + Research and Technology Symposium (ICCRTS), Johns Hopkins Applied Physics + Laboratory. Baltimore, MD. (2023) +
+
+
+
+
+ + ♻ ☆ Diagnosing Human-object Interaction Detectors + + +
+ We have witnessed significant progress in human-object interaction (HOI) +detection. The reliance on mAP (mean Average Precision) scores as a summary +metric, however, does not provide sufficient insight into the nuances of model +performance (e.g., why one model is better than another), which can hinder +further innovation in this field. To address this issue, in this paper, we +introduce a diagnosis toolbox to provide detailed quantitative break-down +analysis of HOI detection models, inspired by the success of object detection +diagnosis toolboxes. We first conduct holistic investigations in the pipeline +of HOI detection. By defining a set of errors and the oracles to fix each of +them, we can have a quantitative analysis of the significance of different +errors according to the mAP improvement obtained from fixing each error. We +then delve into two sub-tasks of HOI detection: human-object pair detection and +interaction classification, respectively. For the first detection task, we +compute the coverage of ground-truth human-object pairs as well as the +noisiness level in the detection results. For the second classification task, +we measure a model's performance of differentiating positive and negative +detection results and also classifying the actual interactions when the +human-object pairs are correctly detected. We analyze eight state-of-the-art +HOI detection models and provide valuable diagnosis insights to foster future +research. For instance, our diagnosis shows that state-of-the-art model RLIPv2 +outperforms others mainly because it significantly improves the multi-label +interaction classification accuracy. Our toolbox is applicable for different +methods across different datasets and available at +https://github.com/neu-vi/Diag-HOI. + +
+
+
+
+
+ + ♻ ☆ Revisiting DETR Pre-training for Object Detection + + +
+ Motivated by the remarkable achievements of DETR-based approaches on COCO +object detection and segmentation benchmarks, recent endeavors have been +directed towards elevating their performance through self-supervised +pre-training of Transformers while preserving a frozen backbone. Noteworthy +advancements in accuracy have been documented in certain studies. Our +investigation delved deeply into a representative approach, DETReg, and its +performance assessment in the context of emerging models like +$\mathcal{H}$-Deformable-DETR. Regrettably, DETReg proves inadequate in +enhancing the performance of robust DETR-based models under full data +conditions. To dissect the underlying causes, we conduct extensive experiments +on COCO and PASCAL VOC probing elements such as the selection of pre-training +datasets and strategies for pre-training target generation. By contrast, we +employ an optimized approach named Simple Self-training which leads to marked +enhancements through the combination of an improved box predictor and the +Objects$365$ benchmark. The culmination of these endeavors results in a +remarkable AP score of $59.3\%$ on the COCO val set, outperforming +$\mathcal{H}$-Deformable-DETR + Swin-L without pre-training by $1.4\%$. +Moreover, a series of synthetic pre-training datasets, generated by merging +contemporary image-to-text(LLaVA) and text-to-image (SDXL) models, +significantly amplifies object detection capabilities. + +
+
+
+
+
+ + ♻ ☆ Hard View Selection for Self-Supervised Learning + + +
+ Many Self-Supervised Learning (SSL) methods train their models to be +invariant to different "views" of an image input for which a good data +augmentation pipeline is crucial. While considerable efforts were directed +towards improving pre-text tasks, architectures, or robustness (e.g., Siamese +networks or teacher-softmax centering), the majority of these methods remain +strongly reliant on the random sampling of operations within the image +augmentation pipeline, such as the random resized crop or color distortion +operation. In this paper, we argue that the role of the view generation and its +effect on performance has so far received insufficient attention. To address +this, we propose an easy, learning-free, yet powerful Hard View Selection (HVS) +strategy designed to extend the random view generation to expose the pretrained +model to harder samples during SSL training. It encompasses the following +iterative steps: 1) randomly sample multiple views and create pairs of two +views, 2) run forward passes for each view pair on the currently trained model, +3) adversarially select the pair yielding the worst loss, and 4) run the +backward pass with the selected pair. In our empirical analysis we show that +under the hood, HVS increases task difficulty by controlling the Intersection +over Union of views during pretraining. With only 300-epoch pretraining, HVS is +able to closely rival the 800-epoch DINO baseline which remains very favorable +even when factoring in the slowdown induced by the additional forwards of HVS. +Additionally, HVS consistently achieves accuracy improvements on ImageNet +between 0.4% and 1.9% on linear evaluation and similar improvements on transfer +tasks across multiple SSL methods, such as DINO, SimSiam, iBOT, and SimCLR. + +
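+ The four steps above can be condensed into a short training-loop sketch; the
+`augment` and `ssl_loss` callables are placeholders for whatever augmentation
+pipeline and SSL objective (e.g., a DINO- or SimCLR-style loss) is in use:
+
+    import torch
+
+    def hvs_step(model, ssl_loss, image, augment, optimizer, num_pairs=4):
+        """One Hard View Selection step: sample pairs, keep the hardest, train on it."""
+        # 1) randomly sample several candidate view pairs
+        pairs = [(augment(image), augment(image)) for _ in range(num_pairs)]
+        # 2)-3) forward each pair without gradients and pick the one with the worst loss
+        with torch.no_grad():
+            losses = torch.stack([ssl_loss(model, v1, v2) for v1, v2 in pairs])
+        v1, v2 = pairs[int(losses.argmax())]
+        # 4) backward pass only on the selected (hardest) pair
+        loss = ssl_loss(model, v1, v2)
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+        return loss.detach()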
+
+
+
+
+ + ♻ ☆ Blind Face Restoration for Under-Display Camera via Dictionary Guided + Transformer + + +
+ By hiding the front-facing camera below the display panel, Under-Display +Camera (UDC) provides users with a full-screen experience. However, due to the +characteristics of the display, images taken by UDC suffer from significant +quality degradation. Methods have been proposed to tackle UDC image restoration +and advances have been achieved. There are still no specialized methods and +datasets for restoring UDC face images, which may be the most common problem in +the UDC scene. To this end, considering color filtering, brightness +attenuation, and diffraction in the imaging process of UDC, we propose a +two-stage network UDC Degradation Model Network named UDC-DMNet to synthesize +UDC images by modeling the processes of UDC imaging. Then we use UDC-DMNet and +high-quality face images from FFHQ and CelebA-Test to create UDC face training +datasets FFHQ-P/T and testing datasets CelebA-Test-P/T for UDC face +restoration. We propose a novel dictionary-guided transformer network named +DGFormer. Introducing the facial component dictionary and the characteristics +of the UDC image in the restoration makes DGFormer capable of addressing blind +face restoration in UDC scenarios. Experiments show that our DGFormer and +UDC-DMNet achieve state-of-the-art performance. + +
+
+ comment: To appear in IEEE TCSVT +
+
+
+
+
+ + ♻ ☆ Physics-based Indirect Illumination for Inverse Rendering 3DV 2024 + + +
+ We present a physics-based inverse rendering method that learns the
+illumination, geometry, and materials of a scene from posed multi-view RGB
+images. To model the illumination of a scene, existing inverse rendering works
+either completely ignore the indirect illumination or model it by coarse
+approximations, leading to sub-optimal illumination, geometry, and material
+prediction of the scene. In this work, we propose a physics-based illumination
+model that first locates surface points through an efficient refined sphere
+tracing algorithm, then explicitly traces the incoming indirect lights at each
+surface point based on reflection. Then, we estimate each identified indirect
+light through an efficient neural network. Moreover, we utilize Leibniz's
+integral rule to resolve the non-differentiability caused by boundary lights in
+the proposed illumination model, inspired by differentiable irradiance in
+computer graphics. As a result, the proposed differentiable illumination model
+can be learned end-to-end together with geometry and materials estimation. As a
+side product, our physics-based inverse rendering model also facilitates
+flexible and realistic material editing as well as relighting. Extensive
+experiments on synthetic and real-world datasets demonstrate that the proposed
+method performs favorably against existing inverse rendering methods on novel
+view synthesis and inverse rendering.
+

+
+ comment: 3DV 2024; Project Page: https://denghilbert.github.io/pii +
+
+
+
+
+ + ♻ ☆ Uncertainty Estimation and Out-of-Distribution Detection for Deep + Learning-Based Image Reconstruction using the Local Lipschitz + + +
+ Accurate image reconstruction is at the heart of diagnostics in medical +imaging. Supervised deep learning-based approaches have been investigated for +solving inverse problems including image reconstruction. However, these trained +models encounter unseen data distributions that are widely shifted from +training data during deployment. Therefore, it is essential to assess whether a +given input falls within the training data distribution for diagnostic +purposes. Uncertainty estimation approaches exist but focus on providing an +uncertainty map to radiologists, rather than assessing the training +distribution fit. In this work, we propose a method based on the local +Lipschitz-based metric to distinguish out-of-distribution images from +in-distribution with an area under the curve of 99.94%. Empirically, we +demonstrate a very strong relationship between the local Lipschitz value and +mean absolute error (MAE), supported by a high Spearman's rank correlation +coefficient of 0.8475, which determines the uncertainty estimation threshold +for optimal model performance. Through the identification of false positives, +the local Lipschitz and MAE relationship was used to guide data augmentation +and reduce model uncertainty. Our study was validated using the AUTOMAP +architecture for sensor-to-image Magnetic Resonance Imaging (MRI) +reconstruction. We compare our proposed approach with baseline methods: +Monte-Carlo dropout and deep ensembles, and further analysis included MRI +denoising and Computed Tomography (CT) sparse-to-full view reconstruction using +UNET architectures. We show that our approach is applicable to various +architectures and learned functions, especially in the realm of medical image +reconstruction, where preserving the diagnostic accuracy of reconstructed +images remains paramount. + +
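+ One simple way to estimate such a local Lipschitz value is to probe the
+reconstruction network with small random perturbations and take the largest
+output-to-input change ratio; the perturbation scale and number of probes below
+are illustrative, not the paper's calibrated settings:
+
+    import torch
+
+    def local_lipschitz(model, x, num_probes=8, radius=1e-2):
+        """Estimate max ||f(x + d) - f(x)|| / ||d|| over small random perturbations."""
+        model.eval()
+        with torch.no_grad():
+            y = model(x)
+            ratios = [
+                (model(x + d) - y).norm() / d.norm()
+                for d in (torch.randn_like(x) * radius for _ in range(num_probes))
+            ]
+        return max(ratios)  # larger values flag inputs the model is unstable on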
+
+
+
+
+ + ♻ ☆ Generalized 3D Self-supervised Learning Framework via Prompted + Foreground-Aware Feature Contrast + + +
+ Contrastive learning has recently demonstrated great potential for +unsupervised pre-training in 3D scene understanding tasks. However, most +existing work randomly selects point features as anchors while building +contrast, leading to a clear bias toward background points that often dominate +in 3D scenes. Also, object awareness and foreground-to-background +discrimination are neglected, making contrastive learning less effective. To +tackle these issues, we propose a general foreground-aware feature contrast +FAC++ framework to learn more effective point cloud representations in +pre-training. FAC++ consists of two novel contrast designs to construct more +effective and informative contrast pairs. The first is building positive pairs +within the same foreground segment where points tend to have the same +semantics. The second is that we prevent over-discrimination between 3D +segments/objects and encourage grouped foreground-to-background distinctions at +the segment level with adaptive feature learning in a Siamese correspondence +network, which adaptively learns feature correlations within and across point +cloud views effectively. Moreover, we have designed the foreground-prompted +regional sampling to enhance more balanced foreground-aware learning, which is +termed FAC++. Visualization with point activation maps shows that our contrast +pairs capture clear correspondences among foreground regions during +pre-training. Quantitative experiments also show that FAC++ achieves superior +knowledge transfer and data efficiency in various downstream 3D semantic +segmentation, instance segmentation as well as object detection tasks. All +codes, data, and models are available at: +https://github.com/KangchengLiu/FAC_Foreground_Aware_Contrast + +
+
+ comment: International Journal of Computer Vision, Manuscript Info: 28 Pages, + 14 Figures, and 11 Tables +
+
+
+
+
+ + ♻ ☆ Adversarial Attacks and Defenses on 3D Point Cloud Classification: A + Survey + + +
+ Deep learning has successfully solved a wide range of tasks in 2D vision as a
+dominant AI technique. Recently, deep learning on 3D point clouds is becoming
+increasingly popular for addressing various tasks in this field. Despite
+remarkable achievements, deep learning algorithms are vulnerable to adversarial
+attacks. These attacks are imperceptible to the human eye but can easily fool
+deep neural networks in the testing and deployment stage. To encourage future
+research, this survey summarizes the current progress on adversarial attack and
+defense techniques on point cloud classification. This paper first introduces
+the principles and characteristics of adversarial attacks and summarizes and
+analyzes adversarial example generation methods in recent years. Additionally,
+it provides an overview of defense strategies, organized into data-focused and
+model-focused methods. Finally, it presents several current challenges and
+potential future research directions in this domain.
+

+
+
+
+
+ + ♻ ☆ Assessment of Deep Learning Segmentation for Real-Time Free-Breathing + Cardiac Magnetic Resonance Imaging + + +
+ In recent years, a variety of deep learning networks for cardiac MRI (CMR)
+segmentation have been developed and analyzed. However, nearly all of them are
+focused on cine CMR under breath-hold. In this work, accuracy of deep learning
+methods is assessed for volumetric analysis (via segmentation) of the left
+ventricle in real-time free-breathing CMR at rest and under exercise stress.
+Data from healthy volunteers (n=15) for cine and real-time free-breathing CMR
+were analyzed retrospectively. Segmentations from a commercial software (comDL)
+and a freely available neural network (nnU-Net) were compared to a reference
+created via the manual correction of comDL segmentation. Segmentation of left
+ventricular endocardium (LV), left ventricular myocardium (MYO), and right
+ventricle (RV) is evaluated for both end-systolic and end-diastolic phases and
+analyzed with Dice's coefficient (DC). The volumetric analysis includes LV
+end-diastolic volume (EDV), LV end-systolic volume (ESV), and LV ejection
+fraction (EF). For cine CMR, nnU-Net and comDL achieve a DC above 0.95 for LV
+and above 0.9 for MYO and RV. For real-time CMR, the accuracy of nnU-Net
+exceeds that of comDL overall. For real-time CMR at rest, nnU-Net achieves a DC
+of 0.94 for LV, 0.89 for MYO, and 0.90 for RV; mean absolute differences
+between nnU-Net and reference are 2.9mL for EDV, 3.5mL for ESV and 2.6% for EF.
+For real-time CMR under exercise stress, nnU-Net achieves a DC of 0.92 for LV,
+0.85 for MYO, and 0.83 for RV; mean absolute differences between nnU-Net and
+reference are 11.4mL for EDV, 2.9mL for ESV and 3.6% for EF. Deep learning
+methods designed or trained for cine CMR segmentation can perform well on
+real-time CMR. For real-time free-breathing CMR at rest, the performance of
+deep learning methods is comparable to inter-observer variability in cine CMR
+and is usable for fully automatic segmentation.
+

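+ For reference, the Dice coefficient used in this evaluation has the standard
+form DC = 2|A ∩ B| / (|A| + |B|); a minimal implementation (the 0.5 threshold
+and smoothing term are generic choices, not specific to this study):
+
+    import numpy as np
+
+    def dice_coefficient(pred, target, eps=1e-7):
+        """Dice's coefficient between two binary segmentation masks."""
+        pred = (np.asarray(pred) > 0.5).astype(np.float64)
+        target = (np.asarray(target) > 0.5).astype(np.float64)
+        intersection = (pred * target).sum()
+        return (2.0 * intersection + eps) / (pred.sum() + target.sum() + eps)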
+
+ comment: Martin Schilling and Christina Unterberg-Buchwald contributed equally + to this work +
+
+
+
+
+ + ♻ ☆ MLLMs-Augmented Visual-Language Representation Learning + + +
+ Visual-language pre-training (VLP) has achieved remarkable success in +multi-modal tasks, largely attributed to the availability of large-scale +image-text datasets. In this work, we demonstrate that multi-modal large +language models (MLLMs) can enhance visual-language representation learning by +improving data quality. Our approach is simple, utilizing MLLMs to extend +multiple captions for each image. To prevent the bias introduced by MLLMs' +hallucinations and intrinsic caption styles, we propose "text shearing" to +maintain the same length for extended captions as that of the original +captions. In image-text retrieval, our method consistently obtains 5.6 ~ 35.0% +and 16.8 ~ 46.1% improvement on R@1 under the fine-tuning and zero-shot +settings, respectively. Notably, we obtain zero-shot results that are +comparable to fine-tuning on target datasets, which encourages more exploration +of the versatile use of MLLMs. + +
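+ As described, "text shearing" amounts to trimming each MLLM-extended caption
+back to the original caption's length; a minimal sketch, with whitespace
+tokenization as a simplifying assumption (the actual method may count length
+differently):
+
+    def text_shear(original_caption: str, extended_caption: str) -> str:
+        """Keep the extended caption no longer than the original caption."""
+        max_len = len(original_caption.split())
+        return " ".join(extended_caption.split()[:max_len])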
+
+
+
+
+ + ♻ ☆ Does a Neural Network Really Encode Symbolic Concepts? + + +
+ Recently, a series of studies have tried to extract interactions between
+input variables modeled by a DNN and define such interactions as concepts
+encoded by the DNN. However, strictly speaking, there is still no solid
+guarantee that such interactions indeed represent meaningful concepts.
+Therefore, in this paper, we examine the trustworthiness of interaction
+concepts from four perspectives. Extensive empirical studies have verified that
+a well-trained DNN usually encodes sparse, transferable, and discriminative
+concepts, which is partially aligned with human intuition.
+

+
+
+
+
+ + ♻ ☆ Practical Blind Image Denoising via Swin-Conv-UNet and Data Synthesis SC + + +
+ While recent years have witnessed a dramatic upsurge of exploiting deep
+neural networks toward solving image denoising, existing methods mostly rely on
+simple noise assumptions, such as additive white Gaussian noise (AWGN), JPEG
+compression noise and camera sensor noise, and a general-purpose blind
+denoising method for real images remains unsolved. In this paper, we attempt to
+solve this problem from the perspective of network architecture design and
+training data synthesis. Specifically, for the network architecture design, we
+propose a swin-conv block to incorporate the local modeling ability of the
+residual convolutional layer and the non-local modeling ability of the Swin
+transformer block, and then plug it as the main building block into the
+widely-used image-to-image translation UNet architecture. For the training data
+synthesis, we design a practical noise degradation model which takes into
+consideration different kinds of noise (including Gaussian, Poisson, speckle,
+JPEG compression, and processed camera sensor noises) and resizing, and also
+involves a random shuffle strategy and a double degradation strategy. Extensive
+experiments on AWGN removal and real image denoising demonstrate that the new
+network architecture design achieves state-of-the-art performance and the new
+degradation model can help to significantly improve the practicability. We
+believe our work can provide useful insights into current denoising research.
+

+
+ comment: Codes: https://github.com/cszn/SCUNet +
+
+
+
+
+ + ♻ ☆ Trustworthy Large Models in Vision: A Survey + + +
+ The rapid progress of Large Models (LMs) has recently revolutionized various
+fields of deep learning with remarkable results, ranging from Natural Language
+Processing (NLP) to Computer Vision (CV). However, LMs are increasingly
+challenged and criticized by academia and industry due to their powerful
+performance but untrustworthy behavior, which urgently needs to be alleviated
+by reliable methods. Despite the abundance of literature on trustworthy LMs in
+NLP, a systematic survey specifically delving into the trustworthiness of LMs
+in CV remains absent. In order to mitigate this gap, in this survey we
+summarize four relevant concerns that obstruct the trustworthy usage of LMs in
+vision, including 1) human misuse, 2) vulnerability, 3) inherent issues, and 4)
+interpretability. By highlighting the corresponding challenges,
+countermeasures, and discussion for each topic, we hope this survey will
+facilitate readers' understanding of this field, promote alignment of LMs with
+human expectations and enable trustworthy LMs to serve as welfare rather than
+disaster for human society.
+

+
+
+
+
+ + ♻ ☆ Spatio-Angular Convolutions for Super-resolution in Diffusion MRI + + +
+ Diffusion MRI (dMRI) is a widely used imaging modality, but requires long +scanning times to acquire high resolution datasets. By leveraging the unique +geometry present within this domain, we present a novel approach to dMRI +angular super-resolution that extends upon the parametric continuous +convolution (PCConv) framework. We introduce several additions to the operation +including a Fourier feature mapping, global coordinates, and domain specific +context. Using this framework, we build a fully parametric continuous +convolution network (PCCNN) and compare against existing models. We demonstrate +the PCCNN performs competitively while using significantly less parameters. +Moreover, we show that this formulation generalises well to clinically relevant +downstream analyses such as fixel-based analysis, and neurite orientation +dispersion and density imaging. + +
+
+
+
+
+ + ♻ ☆ AME-CAM: Attentive Multiple-Exit CAM for Weakly Supervised Segmentation + on MRI Brain Tumor + + +
+ Magnetic resonance imaging (MRI) is commonly used for brain tumor +segmentation, which is critical for patient evaluation and treatment planning. +To reduce the labor and expertise required for labeling, weakly-supervised +semantic segmentation (WSSS) methods with class activation mapping (CAM) have +been proposed. However, existing CAM methods suffer from low resolution due to +strided convolution and pooling layers, resulting in inaccurate predictions. In +this study, we propose a novel CAM method, Attentive Multiple-Exit CAM +(AME-CAM), that extracts activation maps from multiple resolutions to +hierarchically aggregate and improve prediction accuracy. We evaluate our +method on the BraTS 2021 dataset and show that it outperforms state-of-the-art +methods. + +
+
+ comment: arXiv admin note: text overlap with arXiv:2306.05476 +
+
+
+
+
+ + ♻ ☆ RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment + + +
+ Generative foundation models are susceptible to implicit biases that can +arise from extensive unsupervised training data. Such biases can produce +suboptimal samples, skewed outcomes, and unfairness, with potentially serious +consequences. Consequently, aligning these models with human ethics and +preferences is an essential step toward ensuring their responsible and +effective deployment in real-world applications. Prior research has primarily +employed Reinforcement Learning from Human Feedback (RLHF) to address this +problem, where generative models are fine-tuned with RL algorithms guided by a +human-feedback-informed reward model. However, the inefficiencies and +instabilities associated with RL algorithms frequently present substantial +obstacles to the successful alignment, necessitating the development of a more +robust and streamlined approach. To this end, we introduce a new framework, +Reward rAnked FineTuning (RAFT), designed to align generative models +effectively. Utilizing a reward model and a sufficient number of samples, our +approach selects the high-quality samples, discarding those that exhibit +undesired behavior, and subsequently enhancing the model by fine-tuning on +these filtered samples. Our studies show that RAFT can effectively improve the +model performance in both reward learning and other automated metrics in both +large language models and diffusion models. + +
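+ A schematic of one RAFT-style data-collection round; `generate` and
+`reward_model` are placeholder callables standing in for the generative model's
+sampler and the learned reward model, and the sample counts are illustrative:
+
+    import torch
+
+    def raft_round(model, reward_model, prompts, generate, num_samples=8, k_best=1):
+        """Sample candidates, rank them by reward, and keep the best for fine-tuning."""
+        dataset = []
+        for prompt in prompts:
+            candidates = generate(model, prompt, num_samples)
+            rewards = torch.tensor([reward_model(prompt, c) for c in candidates])
+            for idx in rewards.topk(k_best).indices:  # keep only the top-rewarded samples
+                dataset.append((prompt, candidates[idx]))
+        return dataset  # ordinary supervised fine-tuning on `dataset` follows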
+
+ comment: 29 pages, 12 figures, Published in Transactions on Machine Learning + Research (TMLR) +
+
+
+
+
+ + ♻ ☆ Split, Merge, and Refine: Fitting Tight Bounding Boxes via + Over-Segmentation and Iterative Search 3DV 2024 + + +
+ Achieving tight bounding boxes of a shape while guaranteeing complete
+boundness is an essential task for efficient geometric operations and
+unsupervised semantic part detection. However, previous methods fail to achieve
+both full coverage and tightness. Neural-network-based methods are not suitable
+for these goals due to the non-differentiability of the objective, while
+classic iterative search methods suffer from their sensitivity to the
+initialization. We propose a novel framework for finding a set of tight
+bounding boxes of a 3D shape via over-segmentation and iterative merging and
+refinement. Our result shows that utilizing effective search methods with
+appropriate objectives is the key to producing bounding boxes with both
+properties. We employ an existing pre-segmentation to split the shape and
+obtain over-segmentation. Then, we apply hierarchical merging with our novel
+tightness-aware merging and stopping criteria. To overcome the sensitivity to
+the initialization, we also define actions to refine the bounding box
+parameters in a Markov Decision Process (MDP) setup with a soft reward function
+promoting a wider exploration. Lastly, we further improve the refinement step
+with Monte Carlo Tree Search (MCTS) based multi-action space exploration.
+Through thorough evaluation on diverse 3D shapes, we demonstrate full coverage,
+tightness, and an adequate number of bounding boxes of our method without
+requiring any training data or supervision. It thus can be applied to various
+downstream tasks in computer vision and graphics.
+

+
+ comment: 3DV 2024 +
+
+
+
+
+ + ♻ ☆ SimPLR: A Simple and Plain Transformer for Object Detection and + Segmentation + + +
+ The ability to detect objects in images at varying scales has played a
+pivotal role in the design of modern object detectors. Despite considerable
+progress in removing hand-crafted components and simplifying the architecture
+with transformers, multi-scale feature maps and/or pyramid design remain a key
+factor for their empirical success. In this paper, we show that this reliance
+on either feature pyramids or a hierarchical backbone is unnecessary and that a
+transformer-based detector with scale-aware attention enables the plain
+detector `SimPLR' whose backbone and detection head are both non-hierarchical
+and operate on single-scale features. The plain architecture allows SimPLR to
+effectively take advantage of self-supervised learning and scaling approaches
+with ViTs, yielding competitive performance compared to hierarchical and
+multi-scale counterparts. We demonstrate through our experiments that when
+scaling to larger ViT backbones, SimPLR shows better performance than
+end-to-end segmentation models (Mask2Former) and plain-backbone detectors
+(ViTDet), while consistently being faster. The code will be released.
+

+
+
+
+
+ + ♻ ☆ HouseCat6D -- A Large-Scale Multi-Modal Category Level 6D Object + Perception Dataset with Household Objects in Realistic Scenarios + + +
+ Estimating 6D object poses is a major challenge in 3D computer vision. +Building on successful instance-level approaches, research is shifting towards +category-level pose estimation for practical applications. Current +category-level datasets, however, fall short in annotation quality and pose +variety. Addressing this, we introduce HouseCat6D, a new category-level 6D pose +dataset. It features 1) multi-modality with Polarimetric RGB and Depth +(RGBD+P), 2) encompasses 194 diverse objects across 10 household categories, +including two photometrically challenging ones, and 3) provides high-quality +pose annotations with an error range of only 1.35 mm to 1.74 mm. The dataset +also includes 4) 41 large-scale scenes with comprehensive viewpoint and +occlusion coverage, 5) a checkerboard-free environment, and 6) dense 6D +parallel-jaw robotic grasp annotations. Additionally, we present benchmark +results for leading category-level pose estimation networks. + +
+
+
+
+
+ + ♻ ☆ A Unified Approach to Interpreting and Boosting Adversarial + Transferability + + +
+ In this paper, we use the interaction inside adversarial perturbations to
+explain and boost the adversarial transferability. We discover and prove the
+negative correlation between the adversarial transferability and the
+interaction inside adversarial perturbations. The negative correlation is
+further verified through different DNNs with various inputs. Moreover, this
+negative correlation can be regarded as a unified perspective to understand
+current transferability-boosting methods. To this end, we prove that some
+classic methods of enhancing the transferability essentially decrease
+interactions inside adversarial perturbations. Based on this, we propose to
+directly penalize interactions during the attacking process, which
+significantly improves the adversarial transferability.
+

+
+
+
+
+ + ♻ ☆ ShapeGPT: 3D Shape Generation with A Unified Multi-modal Language Model + + +
+ The advent of large language models, enabling flexibility through +instruction-driven approaches, has revolutionized many traditional generative +tasks, but large models for 3D data, particularly in comprehensively handling +3D shapes with other modalities, are still under-explored. By achieving +instruction-based shape generations, versatile multimodal generative shape +models can significantly benefit various fields like 3D virtual construction +and network-aided design. In this work, we present ShapeGPT, a shape-included +multi-modal framework to leverage strong pre-trained language models to address +multiple shape-relevant tasks. Specifically, ShapeGPT employs a +word-sentence-paragraph framework to discretize continuous shapes into shape +words, further assembles these words for shape sentences, as well as integrates +shape with instructional text for multi-modal paragraphs. To learn this +shape-language model, we use a three-stage training scheme, including shape +representation, multimodal alignment, and instruction-based generation, to +align shape-language codebooks and learn the intricate correlations among these +modalities. Extensive experiments demonstrate that ShapeGPT achieves comparable +performance across shape-relevant tasks, including text-to-shape, +shape-to-text, shape completion, and shape editing. + +
+
+
+
+
+ + ♻ ☆ Interpreting and Disentangling Feature Components of Various Complexity + from DNNs + + +
+ This paper aims to define, quantify, and analyze the feature complexity that +is learned by a DNN. We propose a generic definition for the feature +complexity. Given the feature of a certain layer in the DNN, our method +disentangles feature components of different complexity orders from the +feature. We further design a set of metrics to evaluate the reliability, the +effectiveness, and the significance of over-fitting of these feature +components. Furthermore, we successfully discover a close relationship between +the feature complexity and the performance of DNNs. As a generic mathematical +tool, the feature complexity and the proposed metrics can also be used to +analyze the success of network compression and knowledge distillation. + +
+
+
+
+
+ + ♻ ☆ Defects of Convolutional Decoder Networks in Frequency Representation + + +
+ In this paper, we prove the representation defects of a cascaded +convolutional decoder network, considering the capacity of representing +different frequency components of an input sample. We conduct the discrete +Fourier transform on each channel of the feature map in an intermediate layer +of the decoder network. Then, we extend the 2D circular convolution theorem to +represent the forward and backward propagations through convolutional layers in +the frequency domain. Based on this, we prove three defects in representing +feature spectrums. First, we prove that the convolution operation, the +zero-padding operation, and a set of other settings all make a convolutional +decoder network more likely to weaken high-frequency components. Second, we +prove that the upsampling operation generates a feature spectrum, in which +strong signals repetitively appear at certain frequencies. Third, we prove that +if the frequency components in the input sample and frequency components in the +target output for regression have a small shift, then the decoder usually +cannot be effectively learned. + +
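+ The 2D circular convolution theorem invoked above says that circular
+(periodic) convolution in the spatial domain is element-wise multiplication in
+the frequency domain; a quick numerical check on a small feature map:
+
+    import torch
+
+    h, w = 8, 8
+    x, k = torch.randn(h, w), torch.randn(h, w)
+
+    # Direct circular convolution: y[i, j] = sum_{a, b} x[a, b] * k[(i - a) % h, (j - b) % w]
+    direct = torch.zeros(h, w)
+    for i in range(h):
+        for j in range(w):
+            for a in range(h):
+                for b in range(w):
+                    direct[i, j] += x[a, b] * k[(i - a) % h, (j - b) % w]
+
+    # The same result via the DFT: Y = X * K element-wise in the frequency domain.
+    via_fft = torch.fft.ifft2(torch.fft.fft2(x) * torch.fft.fft2(k)).real
+    assert torch.allclose(direct, via_fft, atol=1e-4)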
+
+
+
+
+ + ♻ ☆ Bayesian Neural Networks Avoid Encoding Complex and + Perturbation-Sensitive Concepts + + +
+ In this paper, we focus on mean-field variational Bayesian Neural Networks +(BNNs) and explore the representation capacity of such BNNs by investigating +which types of concepts are less likely to be encoded by the BNN. It has been +observed and studied that a relatively small set of interactive concepts +usually emerge in the knowledge representation of a sufficiently-trained neural +network, and such concepts can faithfully explain the network output. Based on +this, our study proves that compared to standard deep neural networks (DNNs), +it is less likely for BNNs to encode complex concepts. Experiments verify our +theoretical proofs. Note that the tendency to encode less complex concepts does +not necessarily imply weak representation power, considering that complex +concepts exhibit low generalization power and high adversarial vulnerability. +The code is available at https://github.com/sjtu-xai-lab/BNN-concepts. + +
+
+
+
+
+ + ♻ ☆ HarsanyiNet: Computing Accurate Shapley Values in a Single Forward + Propagation + + +
+ The Shapley value is widely regarded as a trustworthy attribution metric. +However, when people use Shapley values to explain the attribution of input +variables of a deep neural network (DNN), it usually requires a very high +computational cost to approximate relatively accurate Shapley values in +real-world applications. Therefore, we propose a novel network architecture, +the HarsanyiNet, which makes inferences on the input sample and simultaneously +computes the exact Shapley values of the input variables in a single forward +propagation. The HarsanyiNet is designed on the theoretical foundation that the +Shapley value can be reformulated as the redistribution of Harsanyi +interactions encoded by the network. + +
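+ The theoretical foundation mentioned above rests on the identity that the
+Shapley value of player i equals the sum, over all coalitions S containing i,
+of the Harsanyi interaction (dividend) I(S) divided by |S|; a brute-force check
+on a toy two-player game (this illustrates the identity only, not the
+HarsanyiNet architecture itself):
+
+    from itertools import combinations
+
+    def harsanyi_dividends(v, n):
+        """I(S) = sum over T subset of S of (-1)^(|S| - |T|) * v(T)."""
+        coalitions = [frozenset(c) for r in range(n + 1)
+                      for c in combinations(range(n), r)]
+        return {S: sum((-1) ** (len(S) - len(T)) * v(T)
+                       for r in range(len(S) + 1)
+                       for T in map(frozenset, combinations(S, r)))
+                for S in coalitions}
+
+    def shapley_values(v, n):
+        """phi_i = sum over coalitions S containing i of I(S) / |S|."""
+        I = harsanyi_dividends(v, n)
+        return [sum(I[S] / len(S) for S in I if i in S) for i in range(n)]
+
+    # Toy game: v({}) = 0, v({0}) = 1, v({1}) = 2, v({0, 1}) = 5.
+    v = lambda S: {frozenset(): 0, frozenset({0}): 1,
+                   frozenset({1}): 2, frozenset({0, 1}): 5}[frozenset(S)]
+    print(shapley_values(v, 2))  # [2.0, 3.0], matching the classical Shapley formula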
+
+
+
+
+ + ♻ ☆ RaLiBEV: Radar and LiDAR BEV Fusion Learning for Anchor Box Free Object + Detection Systems + + +
+ In autonomous driving, LiDAR and radar play important roles in the perception +of the surrounding environment. LiDAR provides accurate 3D spatial sensing +information but cannot work in adverse weather like fog. On the other hand, the +radar signal can be diffracted when encountering raindrops or mist particles +thanks to its wavelength, but it suffers from large noise. Recent +state-of-the-art works reveal that fusion of radar and LiDAR can lead to robust +detection in adverse weather. The existing works adopt convolutional neural +network architecture to extract features from each sensor data, then align and +aggregate the two branch features to predict object detection results. However, +these methods have low accuracy of bounding box estimations due to a simple +design of label assignment and fusion strategies. In this paper, we propose a +bird's-eye view fusion learning-based anchor box-free object detection system, +which fuses the feature derived from the radar range-azimuth heatmap and the +LiDAR point cloud to estimate possible objects. Different label assignment +strategies have been designed to facilitate the consistency between the +classification of foreground or background anchor points and the corresponding +bounding box regressions. Furthermore, the performance of the proposed object +detector is further enhanced by employing a novel interactive transformer +module. The superior performance of the methods proposed in this paper has been +demonstrated using the recently published Oxford Radar RobotCar dataset. Our +system's average precision significantly outperforms the state-of-the-art +method by 13.1% and 19.0% at IoU of 0.8 under 'Clear+Foggy' training conditions +for 'Clear' and 'Foggy' testing, respectively. + +
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Consistent Video-to-Video Transfer Using Synthetic Dataset + + +
+ We introduce a novel and efficient approach for text-based video-to-video +editing that eliminates the need for resource-intensive per-video-per-model +finetuning. At the core of our approach is a synthetic paired video dataset +tailored for video-to-video transfer tasks. Inspired by Instruct Pix2Pix's +image transfer via editing instruction, we adapt this paradigm to the video +domain. Extending the Prompt-to-Prompt to videos, we efficiently generate +paired samples, each with an input video and its edited counterpart. Alongside +this, we introduce the Long Video Sampling Correction during sampling, ensuring +consistent long videos across batches. Our method surpasses current methods +like Tune-A-Video, heralding substantial progress in text-based video-to-video +editing and suggesting exciting avenues for further exploration and deployment. + +
+
+
+
+
+ + ♻ ☆ CLiSA: A Hierarchical Hybrid Transformer Model using Orthogonal Cross + Attention for Satellite Image Cloud Segmentation + + +
+ Clouds in optical satellite images are a major concern since their presence
+hinders the ability to carry out accurate analysis as well as processing.
+Presence of clouds also affects the image tasking schedule and results in
+wastage of valuable storage space on ground-based as well as space-based
+systems. Due to these reasons, deriving accurate cloud masks from optical
+remote-sensing images is an important task. Traditional methods for cloud
+detection in satellite images, such as threshold-based and spatial filtering
+approaches, suffer from a lack of accuracy. In recent years, deep learning
+algorithms have emerged as a promising approach to solve image segmentation
+problems as they allow pixel-level classification and semantic-level
+segmentation. In this paper, we introduce a deep-learning model based on a
+hybrid transformer architecture for effective cloud mask generation, named
+CLiSA - Cloud segmentation via Lipschitz Stable Attention network. In this
+context, we propose a concept of orthogonal self-attention combined with a
+hierarchical cross-attention model, and we validate its Lipschitz stability
+theoretically and empirically. We design the whole setup under an adversarial
+setting in the presence of the Lov\'asz-Softmax loss. We demonstrate both
+qualitative and quantitative outcomes for multiple satellite image datasets
+including Landsat-8, Sentinel-2, and Cartosat-2s. Performing a comparative
+study, we show that our model performs favorably against other
+state-of-the-art methods and also provides better generalization in precise
+cloud extraction from satellite multi-spectral (MX) images. We also showcase
+different ablation studies to endorse our choices corresponding to different
+architectural elements and objective functions.
+

+
+ comment: 14 pages, 11 figures, 7 tables +
+
+
+
+
+ + ♻ ☆ GaussianEditor: Swift and Controllable 3D Editing with Gaussian + Splatting + + +
+ 3D editing plays a crucial role in many areas such as gaming and virtual +reality. Traditional 3D editing methods, which rely on representations like +meshes and point clouds, often fall short in realistically depicting complex +scenes. On the other hand, methods based on implicit 3D representations, like +Neural Radiance Field (NeRF), render complex scenes effectively but suffer from +slow processing speeds and limited control over specific scene areas. In +response to these challenges, our paper presents GaussianEditor, an innovative +and efficient 3D editing algorithm based on Gaussian Splatting (GS), a novel 3D +representation. GaussianEditor enhances precision and control in editing +through our proposed Gaussian semantic tracing, which traces the editing target +throughout the training process. Additionally, we propose Hierarchical Gaussian +splatting (HGS) to achieve stabilized and fine results under stochastic +generative guidance from 2D diffusion models. We also develop editing +strategies for efficient object removal and integration, a challenging task for +existing methods. Our comprehensive experiments demonstrate GaussianEditor's +superior control, efficacy, and rapid performance, marking a significant +advancement in 3D editing. Project Page: +https://buaacyw.github.io/gaussian-editor/ + +
+
+ comment: Project Page: https://buaacyw.github.io/gaussian-editor/ +
+
+
+
+
+ + ♻ ☆ VMAF Re-implementation on PyTorch: Some Experimental Results + + +
+ Based on the standard VMAF implementation, we propose an implementation of
+VMAF using the PyTorch framework. For this implementation, comparisons with the
+standard (libvmaf) show a discrepancy $\lesssim 10^{-2}$ in VMAF units. We
+investigate gradient computation when using VMAF as an objective function and
+demonstrate that training with this function does not result in ill-behaved
+gradients. The implementation is then used to train a preprocessing filter. It
+is demonstrated that its performance is superior to the unsharp masking filter.
+The resulting filter is also easy to implement and can be applied in video
+processing tasks to improve video compression. This is confirmed by the results
+of numerical experiments.
+

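+ A generic sketch of using a differentiable quality metric as the objective for
+training a preprocessing filter; `vmaf_torch` below is a placeholder callable
+(not the actual API of the re-implementation), and in practice the score would
+be computed against the encoded/decoded video rather than the filter output
+directly:
+
+    import torch
+
+    class PreprocessingFilter(torch.nn.Module):
+        """A small residual convolutional filter (illustrative architecture)."""
+        def __init__(self):
+            super().__init__()
+            self.conv = torch.nn.Conv2d(3, 3, kernel_size=3, padding=1)
+
+        def forward(self, x):
+            return x + self.conv(x)
+
+    def train_step(filt, vmaf_torch, frames, references, optimizer):
+        """Maximize the (differentiable) quality score of the filtered frames."""
+        loss = -vmaf_torch(filt(frames), references)
+        optimizer.zero_grad()
+        loss.backward()
+        optimizer.step()
+        return loss.item()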
+
+ comment: 6 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Generating high-quality 3DMPCs by adaptive data acquisition and + NeREF-based radiometric calibration with UGV plant phenotyping system + + +
+ Fusion of 3D and MS imaging data has a great potential for high-throughput +plant phenotyping of structural and biochemical as well as physiological traits +simultaneously, which is important for decision support in agriculture and for +crop breeders in selecting the best genotypes. However, lacking of 3D data +integrity of various plant canopy structures and low-quality of MS images +caused by the complex illumination effects make a great challenge, especially +at the proximal imaging scale. Therefore, this study proposed a novel approach +for adaptive data acquisition and radiometric calibration to generate +high-quality 3DMPCs of plants. An efficient NBV planning method based on an UGV +plant phenotyping system with a multi-sensor-equipped robotic arm was proposed +to achieve adaptive data acquisition. The NeREF was employed to predict the DN +values of the hemispherical reference for radiometric calibration. For NBV +planning, the average total time for single plant at a joint speed of 1.55 +rad/s was about 62.8 s, with an average reduction of 18.0% compared to the +unplanned. The integrity of the whole-plant data was improved by an average of +23.6% compared to the fixed viewpoints alone. Compared with the ASD +measurements, the RMSE of the reflectance spectra obtained from 3DMPCs at +different regions of interest was 0.08 with an average decrease of 58.93% +compared to the results obtained from the single-frame of MS images without 3D +radiometric calibration. The 3D-calibrated plant 3DMPCs improved the predictive +accuracy of PLSR for chlorophyll content, with an average increase of 0.07 in +R2 and an average decrease of 21.25% in RMSE. Our approach introduced a fresh +perspective on generating high-quality 3DMPCs of plants under the natural light +condition, enabling more precise analysis of plant morphological and +physiological parameters. + +
+
+
+
+
+ + ♻ ☆ Demystify Transformers & Convolutions in Modern Image Deep Networks + + +
+ Vision transformers have gained popularity recently, leading to the +development of new vision backbones with improved features and consistent +performance gains. However, these advancements are not solely attributable to +novel feature transformation designs; certain benefits also arise from advanced +network-level and block-level architectures. This paper aims to identify the +real gains of popular convolution and attention operators through a detailed +study. We find that the key difference among these feature transformation +modules, such as attention or convolution, lies in their spatial feature +aggregation approach, known as the "spatial token mixer" (STM). To facilitate +an impartial comparison, we introduce a unified architecture to neutralize the +impact of divergent network-level and block-level designs. Subsequently, +various STMs are integrated into this unified framework for comprehensive +comparative analysis. Our experiments on various tasks and an analysis of +inductive bias show a significant performance boost due to advanced +network-level and block-level designs, but performance differences persist +among different STMs. Our detailed analysis also reveals various findings about +different STMs, such as effective receptive fields and invariance tests. All +models and codes used in this study are publicly available at +\url{https://github.com/OpenGVLab/STM-Evaluation}. + +
+
+
+
+
+ + ♻ ☆ PointLLM: Empowering Large Language Models to Understand Point Clouds + + +
+ The unprecedented advancements in Large Language Models (LLMs) have shown a +profound impact on natural language processing but are yet to fully embrace the +realm of 3D understanding. This paper introduces PointLLM, a preliminary effort +to fill this gap, enabling LLMs to understand point clouds and offering a new +avenue beyond 2D visual data. PointLLM understands colored object point clouds +with human instructions and generates contextually appropriate responses, +illustrating its grasp of point clouds and common sense. Specifically, it +leverages a point cloud encoder with a powerful LLM to effectively fuse +geometric, appearance, and linguistic information. We collect a novel dataset +comprising 660K simple and 70K complex point-text instruction pairs to enable a +two-stage training strategy: aligning latent spaces and subsequently +instruction-tuning the unified model. To rigorously evaluate the perceptual and +generalization capabilities of PointLLM, we establish two benchmarks: +Generative 3D Object Classification and 3D Object Captioning, assessed through +three different methods, including human evaluation, GPT-4/ChatGPT evaluation, +and traditional metrics. Experimental results reveal PointLLM's superior +performance over existing 2D and 3D baselines, with a notable achievement in +human-evaluated object captioning tasks where it surpasses human annotators in +over 50% of the samples. Codes, datasets, and benchmarks are available at +https://github.com/OpenRobotLab/PointLLM . + +
+
+ comment: 28 pages. Empowering large language models with 3D point cloud + understanding, accompanied by a novel dataset and carefully designed + benchmarks. Project page: https://runsenxu.com/projects/PointLLM +
+
+
+
+
+ + ♻ ☆ Automatic Diagnosis of Myocarditis Disease in Cardiac MRI Modality using + Deep Transformers and Explainable Artificial Intelligence + + +
+ Myocarditis is a significant cardiovascular disease (CVD) that poses a threat
+to the health of many individuals by causing damage to the myocardium. The
+occurrence of microbes and viruses, including the likes of HIV, plays a crucial
+role in the development of myocarditis disease (MCD). The images produced
+during cardiac magnetic resonance imaging (CMRI) scans are low contrast, which
+can make it challenging to diagnose cardiovascular diseases. On the other hand,
+checking numerous CMRI slices for each CVD patient can be a challenging task
+for medical doctors. To overcome the existing challenges, researchers have
+suggested the use of artificial intelligence (AI)-based computer-aided
+diagnosis systems (CADS). The presented paper outlines a CADS for the detection
+of MCD from CMR images, utilizing deep learning (DL) methods. The proposed CADS
+consists of several steps, including dataset selection, preprocessing, feature
+extraction, classification, and post-processing. First, the Z-Alizadeh dataset
+was selected for the experiments. Subsequently, the CMR images underwent
+various preprocessing steps, including denoising, resizing, as well as data
+augmentation (DA) via CutMix and MixUp techniques. Next, the most recent deep
+pre-trained and transformer models are used for feature extraction and
+classification on the CMR images. The findings of our study reveal that
+transformer models exhibit superior performance in detecting MCD as opposed to
+pre-trained architectures. In terms of DL architectures, the Turbulence Neural
+Transformer (TNT) model exhibited impressive accuracy, reaching 99.73%
+utilizing a 10-fold cross-validation approach. Additionally, to pinpoint areas
+of suspicion for MCD in CMRI images, the Explainable-based Grad Cam method was
+employed.
+

+
+
+
+
+ + ♻ ☆ AgentAvatar: Disentangling Planning, Driving and Rendering for + Photorealistic Avatar Agents + + +
+ In this study, our goal is to create interactive avatar agents that can
+autonomously plan and animate nuanced facial movements realistically, from both
+visual and behavioral perspectives. Given high-level inputs about the
+environment and agent profile, our framework harnesses LLMs to produce a series
+of detailed text descriptions of the avatar agents' facial motions. These
+descriptions are then processed by our task-agnostic driving engine into motion
+token sequences, which are subsequently converted into continuous motion
+embeddings that are further consumed by our standalone neural-based renderer to
+generate the final photorealistic avatar animations. These streamlined
+processes allow our framework to adapt to a variety of non-verbal avatar
+interactions, both monadic and dyadic. Our extensive study, which includes
+experiments on both newly compiled and existing datasets featuring two types of
+agents -- one capable of monadic interaction with the environment, and the
+other designed for dyadic conversation -- validates the effectiveness and
+versatility of our approach. To our knowledge, this work takes a significant
+step forward in combining LLMs and neural rendering for generalized non-verbal
+prediction and photo-realistic rendering of avatar agents.
+

+
+ comment: Project page: https://dorniwang.github.io/AgentAvatar_project/; 26 + pages +
+
+
+
+
+ + ♻ ☆ Stability-Informed Initialization of Neural Ordinary Differential + Equations + + +
+ This paper addresses the training of Neural Ordinary Differential Equations +(neural ODEs), and in particular explores the interplay between numerical +integration techniques, stability regions, step size, and initialization +techniques. It is shown how the choice of integration technique implicitly +regularizes the learned model, and how the solver's corresponding stability +region affects training and prediction performance. From this analysis, a +stability-informed parameter initialization technique is introduced. The +effectiveness of the initialization method is displayed across several learning +benchmarks and industrial applications. + +
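+ As a concrete illustration of the stability-region idea (a simplified sketch, not the paper's initialization scheme), one can check whether h*lambda for the eigenvalues of a linearised vector field falls inside the stability region of the chosen explicit solver:
```python
import numpy as np

def euler_stable(lam, h):
    """Explicit Euler is stable for eigenvalue lam and step size h iff |1 + h*lam| < 1."""
    return abs(1 + h * lam) < 1

def rk4_stable(lam, h):
    """Classic RK4 stability polynomial R(z) = 1 + z + z^2/2 + z^3/6 + z^4/24."""
    z = h * lam
    return abs(1 + z + z**2 / 2 + z**3 / 6 + z**4 / 24) < 1

# Eigenvalues of a (here randomly initialised) linear vector field vs. the solver's region
eigs = np.linalg.eigvals(0.5 * np.random.randn(4, 4))
print([bool(rk4_stable(lam, h=0.1)) for lam in eigs])
```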
+
+
+
+
+ + ♻ ☆ Differentiable Display Photometric Stereo + + +
+ Photometric stereo leverages variations in illumination conditions to +reconstruct surface normals. Display photometric stereo, which employs a +conventional monitor as an illumination source, has the potential to overcome +limitations often encountered in bulky and difficult-to-use conventional +setups. In this paper, we present differentiable display photometric stereo +(DDPS), addressing an often overlooked challenge in display photometric stereo: +the design of display patterns. Departing from using heuristic display +patterns, DDPS learns the display patterns that yield accurate normal +reconstruction for a target system in an end-to-end manner. To this end, we +propose a differentiable framework that couples basis-illumination image +formation with analytic photometric-stereo reconstruction. The differentiable +framework facilitates the effective learning of display patterns via +auto-differentiation. Also, for training supervision, we propose to use 3D +printing for creating a real-world training dataset, enabling accurate +reconstruction on the target real-world setup. Finally, we exploit that +conventional LCD monitors emit polarized light, which allows for the optical +separation of diffuse and specular reflections when combined with a +polarization camera, leading to accurate normal reconstruction. Extensive +evaluation of DDPS shows improved normal-reconstruction accuracy compared to +heuristic patterns and demonstrates compelling properties such as robustness to +pattern initialization, calibration errors, and simplifications in image +formation and reconstruction. + +
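+ The analytic photometric-stereo reconstruction that the differentiable framework builds on is, in its classical Lambertian form, a per-pixel least-squares problem; a minimal sketch (ignoring the polarization-based diffuse/specular separation and the learned display patterns) is:
```python
import numpy as np

def lambertian_photometric_stereo(images, lights):
    """images: (K, H, W) observations under K known directional lights (K, 3).
    Assumes I = albedo * max(0, n . l); returns unit normals (H, W, 3) and albedo."""
    K, H, W = images.shape
    I = images.reshape(K, -1)                       # (K, H*W)
    G, *_ = np.linalg.lstsq(lights, I, rcond=None)  # (3, H*W), G = albedo * n
    albedo = np.linalg.norm(G, axis=0)
    normals = (G / np.clip(albedo, 1e-8, None)).T.reshape(H, W, 3)
    return normals, albedo.reshape(H, W)
```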
+
+
+
+
+ + ♻ ☆ Rethinking Event-based Human Pose Estimation with 3D Event + Representations + + +
+ Human pose estimation is a fundamental and appealing task in computer vision. +Traditional frame-based cameras and videos are commonly applied, yet they +become less reliable in scenarios under high dynamic range or heavy motion +blur. In contrast, event cameras offer a robust solution for navigating these +challenging contexts. Predominant methodologies incorporate event cameras into +learning frameworks by accumulating events into event frames. However, such +methods tend to marginalize the intrinsic asynchronous and high temporal +resolution characteristics of events. This disregard leads to a loss in +essential temporal dimension data, crucial for discerning distinct actions. To +address this issue and to unlock the 3D potential of event information, we +introduce two 3D event representations: the Rasterized Event Point Cloud +(RasEPC) and the Decoupled Event Voxel (DEV). The RasEPC collates events within +concise temporal slices at identical positions, preserving 3D attributes with +statistical cues and markedly mitigating memory and computational demands. +Meanwhile, the DEV representation discretizes events into voxels and projects +them across three orthogonal planes, utilizing decoupled event attention to +retrieve 3D cues from the 2D planes. Furthermore, we develop and release +EV-3DPW, a synthetic event-based dataset crafted to facilitate training and +quantitative analysis in outdoor scenes. On the public real-world DHP19 +dataset, our event point cloud technique excels in real-time mobile +predictions, while the decoupled event voxel method achieves the highest +accuracy. Experiments on EV-3DPW demonstrate the robustness of our +proposed 3D representation methods compared to traditional RGB images and event-frame +techniques under the same backbones. Our code and dataset have been made +publicly available at https://github.com/MasterHow/EventPointPose. +
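+ One plausible, simplified reading of a decoupled event-voxel construction (the paper's exact formulation may differ) is to bin polarity-signed events into a (time, y, x) grid and project it onto the three orthogonal planes:
```python
import numpy as np

def decoupled_event_voxels(events, sensor_hw=(260, 346), n_bins=8):
    """events: (N, 4) array of (x, y, t, polarity). Returns projections of the
    polarity-signed voxel grid onto the (y, x), (t, x) and (t, y) planes."""
    H, W = sensor_hw
    x, y = events[:, 0].astype(int), events[:, 1].astype(int)
    t, p = events[:, 2], events[:, 3]
    tb = np.clip(((t - t.min()) / (t.ptp() + 1e-9) * n_bins).astype(int), 0, n_bins - 1)
    vox = np.zeros((n_bins, H, W))
    np.add.at(vox, (tb, y, x), np.where(p > 0, 1.0, -1.0))
    return vox.sum(axis=0), vox.sum(axis=1), vox.sum(axis=2)
```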
+
+ comment: Extended version of arXiv:2206.04511. The code and dataset are + available at https://github.com/MasterHow/EventPointPose +
+
+
+
+
+ + ♻ ☆ Bayesian Learning with Information Gain Provably Bounds Risk for a + Robust Adversarial Defense ICML 2022 + + +
+ We present a new algorithm to learn a deep neural network model robust +against adversarial attacks. Previous algorithms demonstrate that an adversarially +trained Bayesian Neural Network (BNN) provides improved robustness. We +recognize that the adversarial learning approach for approximating the multi-modal +posterior distribution of a Bayesian model can lead to mode collapse; +consequently, the model's achievements in robustness and performance are +sub-optimal. Instead, we first propose preventing mode collapse to better +approximate the multi-modal posterior distribution. Second, based on the +intuition that a robust model should ignore perturbations and only consider the +informative content of the input, we conceptualize and formulate an information +gain objective to measure and force the information learned from both benign +and adversarial training instances to be similar. Importantly, we prove and +demonstrate that minimizing the information gain objective allows the +adversarial risk to approach the conventional empirical risk. We believe our +efforts provide a step toward a basis for a principled method of adversarially +training BNNs. Our model demonstrates significantly improved robustness--up to +20%--compared with adversarial training and Adv-BNN under PGD attacks with +0.035 distortion on both CIFAR-10 and STL-10 datasets. +
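+ The abstract does not spell out the exact form of the information-gain objective; a hedged sketch of the underlying idea, penalising divergence between the predictive distributions on benign and adversarial inputs, could look like:
```python
import torch
import torch.nn.functional as F

def information_gain_penalty(model, x_benign, x_adv):
    """Encourage the prediction on adversarial inputs to carry no extra information
    relative to benign inputs, approximated here with a symmetric KL between the
    two softmax outputs (an illustrative surrogate, not the paper's exact objective)."""
    p = F.log_softmax(model(x_benign), dim=-1)
    q = F.log_softmax(model(x_adv), dim=-1)
    kl_pq = F.kl_div(q, p, log_target=True, reduction="batchmean")  # KL(P || Q)
    kl_qp = F.kl_div(p, q, log_target=True, reduction="batchmean")  # KL(Q || P)
    return 0.5 * (kl_pq + kl_qp)
```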
+
+ comment: Published at ICML 2022. Code is available at + https://github.com/baogiadoan/IG-BNN +
+
+
+
+
+ + ♻ ☆ SAVE: Spectral-Shift-Aware Adaptation of Image Diffusion Models for + Text-driven Video Editing + + +
+ Text-to-Image (T2I) diffusion models have achieved remarkable success in +synthesizing high-quality images conditioned on text prompts. Recent methods +have tried to replicate the success by either training text-to-video (T2V) +models on a very large number of text-video pairs or adapting T2I models on +text-video pairs independently. Although the latter is computationally less +expensive, it still takes a significant amount of time for per-video adaptation. +To address this issue, we propose SAVE, a novel spectral-shift-aware adaptation +framework, in which we fine-tune the spectral shift of the parameter space +instead of the parameters themselves. Specifically, we take the spectral +decomposition of the pre-trained T2I weights and only update the singular +values while freezing the corresponding singular vectors. In addition, we +introduce a spectral shift regularizer aimed at placing tighter constraints on +larger singular values compared to smaller ones. This form of regularization +enables the model to grasp finer details within the video that align with the +provided textual descriptions. We also offer theoretical justification for our +proposed regularization technique. Since we are only dealing with spectral +shifts, the proposed method reduces the adaptation time significantly (approx. +10 times) and has fewer resource constraints for training. These attributes +make SAVE more suitable for real-world applications, e.g. editing +undesirable content during video streaming. We validate the effectiveness of +SAVE with an extensive experimental evaluation under different settings, e.g. +style transfer, object replacement, privacy preservation, etc. +
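+ The core parameter-efficiency idea, freezing the singular vectors of a pretrained weight and learning only a shift on its singular values, can be sketched as follows (a minimal illustration, not the authors' implementation):
```python
import torch
import torch.nn as nn

class SpectralShiftLinear(nn.Module):
    """Keep U and V of a pretrained weight fixed and train only a shift on the
    singular values, so the number of trainable parameters equals min(out, in)."""
    def __init__(self, pretrained_weight: torch.Tensor):
        super().__init__()
        U, S, Vh = torch.linalg.svd(pretrained_weight, full_matrices=False)
        self.register_buffer("U", U)
        self.register_buffer("S", S)
        self.register_buffer("Vh", Vh)
        self.delta = nn.Parameter(torch.zeros_like(S))  # the only trainable tensor

    def forward(self, x):
        W = self.U @ torch.diag(self.S + self.delta) @ self.Vh
        return x @ W.T
```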
+
+ comment: 11 pages, 10 figures +
+
+
+
+
+ + ♻ ☆ AECIF-Net: An Attention-Enhanced Co-Interactive Fusion Network for + Automated Structural Condition Assessment in Visual Inspection + + +
+ Efficiently monitoring the condition of civil infrastructures necessitates +automating the structural condition assessment in visual inspection. This paper +proposes an Attention-Enhanced Co-Interactive Fusion Network (AECIF-Net) for +automatic structural condition assessment in visual bridge inspection. +AECIF-Net can simultaneously parse structural elements and segment surface +defects on the elements in inspection images. It integrates two task-specific +relearning subnets to extract task-specific features from an overall feature +embedding. A co-interactive feature fusion module further captures the spatial +correlation and facilitates information sharing between tasks. Experimental +results demonstrate that the proposed AECIF-Net outperforms the current +state-of-the-art approaches, achieving promising performance with 92.11% mIoU +for element segmentation and 87.16% mIoU for corrosion segmentation on the test +set of the new benchmark dataset Steel Bridge Condition Inspection Visual +(SBCIV). An ablation study verifies the merits of the designs for AECIF-Net, +and a case study demonstrates its capability to automate structural condition +assessment. + +
+
+ comment: Submitted to Automation in Construction +
+
+
+
+
+ + ♻ ☆ AG-CRC: Anatomy-Guided Colorectal Cancer Segmentation in CT with + Imperfect Anatomical Knowledge + + +
+ When delineating lesions from medical images, a human expert can always keep +in mind the anatomical structure behind the voxels. However, although +high-quality (though not perfect) anatomical information can be retrieved from +computed tomography (CT) scans with modern deep learning algorithms, it is +still an open problem how these automatically generated organ masks can assist +in addressing challenging lesion segmentation tasks, such as the segmentation +of colorectal cancer (CRC). In this paper, we develop a novel Anatomy-Guided +segmentation framework to exploit the auto-generated organ masks to aid CRC +segmentation from CT, namely AG-CRC. First, we obtain multi-organ segmentation +(MOS) masks with existing MOS models (e.g., TotalSegmentor) and further derive +a more robust organ of interest (OOI) mask that may cover most of the +colon-rectum and CRC voxels. Then, we propose an anatomy-guided training patch +sampling strategy by optimizing a heuristic gain function that considers both +the proximity of important regions (e.g., the tumor or organs of interest) and +sample diversity. Third, we design a novel self-supervised learning scheme +inspired by the topology of tubular organs like the colon to boost the model +performance further. Finally, we employ a masked loss scheme to guide the model +to focus solely on the essential learning region. We extensively evaluate the +proposed method on two CRC segmentation datasets, where substantial performance +improvement (5% to 9% in Dice) is achieved over current state-of-the-art +medical image segmentation models, and the ablation studies further evidence +the efficacy of every proposed component. + +
+
+ comment: under review +
+
+
+
+
+ + ♻ ☆ CCEdit: Creative and Controllable Video Editing via Diffusion Models + + +
+ In this paper, we present CCEdit, a versatile generative video editing +framework based on diffusion models. Our approach employs a novel trident +network structure that separates structure and appearance control, ensuring +precise and creative editing capabilities. Utilizing the foundational +ControlNet architecture, we maintain the structural integrity of the video +during editing. The incorporation of an additional appearance branch enables +users to exert fine-grained control over the edited key frame. These two side +branches seamlessly integrate into the main branch, which is constructed upon +existing text-to-image (T2I) generation models, through learnable temporal +layers. The versatility of our framework is demonstrated through a diverse +range of choices in both structure representations and personalized T2I models, +as well as the option to provide the edited key frame. To facilitate +comprehensive evaluation, we introduce the BalanceCC benchmark dataset, +comprising 100 videos and 4 target prompts for each video. Our extensive user +studies compare CCEdit with eight state-of-the-art video editing methods. The +outcomes demonstrate CCEdit's substantial superiority over all other methods. + +
+
+
+
+
+ + ♻ ☆ VideoAssembler: Identity-Consistent Video Generation with Reference + Entities using Diffusion Model + + +
+ Identity-consistent video generation seeks to synthesize videos that are +guided by both textual prompts and reference images of entities. Current +approaches typically utilize cross-attention layers to integrate the appearance +of the entity, which predominantly captures semantic attributes, resulting in +compromised fidelity of entities. Moreover, these methods necessitate iterative +fine-tuning for each new entity encountered, thereby limiting their +applicability. To address these challenges, we introduce VideoAssembler, a +novel end-to-end framework for identity-consistent video generation that can +conduct inference directly when encountering new entities. VideoAssembler is +adept at producing videos that are not only flexible with respect to the input +reference entities but also responsive to textual conditions. Additionally, by +modulating the quantity of input images for the entity, VideoAssembler enables +the execution of tasks ranging from image-to-video generation to sophisticated +video editing. VideoAssembler comprises two principal components: the Reference +Entity Pyramid (REP) encoder and the Entity-Prompt Attention Fusion (EPAF) +module. The REP encoder is designed to infuse comprehensive appearance details +into the denoising stages of the stable diffusion model. Concurrently, the EPAF +module is utilized to integrate text-aligned features effectively. Furthermore, +to mitigate the challenge of scarce data, we present a methodology for the +preprocessing of training data. Our evaluation of the VideoAssembler framework +on the UCF-101, MSR-VTT, and DAVIS datasets indicates that it achieves good +performances in both quantitative and qualitative analyses (346.84 in FVD and +48.01 in IS on UCF-101). Our project page is at +https://gulucaptain.github.io/videoassembler/. + +
+
+
+
+
+ + ♻ ☆ Linear Oscillation: A Novel Activation Function for Vision Transformer + + +
+ Activation functions are the linchpins of deep learning, profoundly +influencing both the representational capacity and training dynamics of neural +networks. They shape not only the nature of representations but also optimize +convergence rates and enhance generalization potential. Appreciating this +critical role, we present the Linear Oscillation (LoC) activation function, +defined as $f(x) = x \times \sin(\alpha x + \beta)$. Distinct from conventional +activation functions which primarily introduce non-linearity, LoC seamlessly +blends linear trajectories with oscillatory deviations. The nomenclature +"Linear Oscillation" is a nod to its unique attribute of infusing linear +activations with harmonious oscillations, capturing the essence of the +"Importance of Confusion". This concept of "controlled confusion" within +network activations is posited to foster more robust learning, particularly in +contexts that necessitate discerning subtle patterns. Our empirical studies +reveal that, when integrated into diverse neural architectures, the LoC +activation function consistently outperforms established counterparts like ReLU +and Sigmoid. The stellar performance exhibited by the avant-garde Vision +Transformer model using LoC further validates its efficacy. This study +illuminates the remarkable benefits of the LoC over other prominent activation +functions. It champions the notion that intermittently introducing deliberate +complexity or "confusion" during training can spur more profound and nuanced +learning. This accentuates the pivotal role of judiciously selected activation +functions in shaping the future of neural network training. + +
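+ The definition f(x) = x * sin(alpha*x + beta) translates directly into a drop-in activation module; whether alpha and beta are fixed or learned is not stated in the abstract, so they are made learnable here purely for illustration:
```python
import torch
import torch.nn as nn

class LinearOscillation(nn.Module):
    """f(x) = x * sin(alpha * x + beta); alpha and beta are learnable here by assumption."""
    def __init__(self, alpha=1.0, beta=0.0):
        super().__init__()
        self.alpha = nn.Parameter(torch.tensor(float(alpha)))
        self.beta = nn.Parameter(torch.tensor(float(beta)))

    def forward(self, x):
        return x * torch.sin(self.alpha * x + self.beta)

# Usage: replace an activation in any block, e.g.
# mlp = nn.Sequential(nn.Linear(64, 64), LinearOscillation(), nn.Linear(64, 10))
```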
+
+ comment: Not Completed Paper +
+
+
+
+
+ + ♻ ☆ Task-Specific Alignment and Multiple Level Transformer for Few-Shot + Action Recognition + + +
+ In the research field of few-shot learning, the main difference between +image-based and video-based tasks is the additional temporal dimension. In recent +years, some works have used Transformers to process frames, obtaining attention +features and enhanced prototypes, with competitive results. +However, some video frames may relate little to the action, and only using +single frame-level or segment-level features may not mine enough information. +We address these problems sequentially through an end-to-end method named +"Task-Specific Alignment and Multiple-level Transformer Network (TSA-MLT)". The +first module (TSA) aims at filtering the action-irrelevant frames for action +duration alignment. An affine transformation of the frame sequence along the time +dimension is used for linear sampling. The second module (MLT) focuses on +multiple-level features of the support prototype and query sample, operating on +features at different levels to mine more information for the alignment. We +adopt a fusion loss according to a fusion distance that fuses the L2 sequence +distance, which focuses on temporal order alignment, and the Optimal Transport +distance, which focuses on measuring the gap between the appearance and +semantics of the videos. Extensive experiments show our method achieves +state-of-the-art results on the HMDB51 and UCF101 datasets and competitive +results on the Kinetics and Something-Something V2 benchmarks. Our +code is available at the URL: https://github.com/cofly2014/tsa-mlt.git +
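+ A hedged sketch of a fusion distance in this spirit, combining a temporal-order-sensitive L2 term with an entropic optimal-transport term over frame features (the paper's exact formulation and weighting are not given in the abstract):
```python
import torch

def sinkhorn_distance(cost, eps=0.1, iters=50):
    """Entropic OT distance between two uniform marginals given a (T, T) cost matrix."""
    n, m = cost.shape
    mu, nu = torch.full((n,), 1.0 / n), torch.full((m,), 1.0 / m)
    K = torch.exp(-cost / eps)
    u = torch.ones_like(mu)
    for _ in range(iters):
        v = nu / (K.T @ u)
        u = mu / (K @ v)
    P = torch.diag(u) @ K @ torch.diag(v)
    return (P * cost).sum()

def fusion_distance(support, query, w=0.5):
    """support, query: (T, d) frame features; fuse order-aligned L2 with OT distance."""
    l2 = ((support - query) ** 2).sum(-1).mean()   # sensitive to temporal order
    cost = torch.cdist(support, query)             # appearance/semantic gap, order-free
    return w * l2 + (1 - w) * sinkhorn_distance(cost)
```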
+
+
+
+
+ + ♻ ☆ Explaining CLIP's performance disparities on data from blind/low vision + users + + +
+ Large multi-modal models (LMMs) hold the potential to usher in a new era of +automated visual assistance for people who are blind or low vision (BLV). Yet, +these models have not been systematically evaluated on data captured by BLV +users. We address this by empirically assessing CLIP, a widely-used LMM likely +to underpin many assistive technologies. Testing 25 CLIP variants in a +zero-shot classification task, we find that their accuracy is 15 percentage +points lower on average for images captured by BLV users than web-crawled +images. This disparity stems from CLIP's sensitivities to 1) image content +(e.g. not recognizing disability objects as well as other objects); 2) image +quality (e.g. not being robust to lighting variation); and 3) text content +(e.g. not recognizing objects described by tactile adjectives as well as visual +ones). We delve deeper with a textual analysis of three common pre-training +datasets: LAION-400M, LAION-2B and DataComp-1B, showing that disability content +is rarely mentioned. We then provide three examples that illustrate how the +performance disparities extend to three downstream models underpinned by CLIP: +OWL-ViT, CLIPSeg and DALL-E2. We find that few-shot learning with as few as 5 +images can mitigate CLIP's quality-of-service disparities for BLV users in some +scenarios, which we discuss alongside a set of other possible mitigations. + +
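+ The zero-shot classification protocol used in this kind of evaluation follows standard CLIP usage; a minimal sketch with the open-source clip package (the image path and label set here are placeholders, not the study's data) is:
```python
import torch
import clip
from PIL import Image

model, preprocess = clip.load("ViT-B/32", device="cpu")
labels = ["a photo of a can of beans", "a photo of a TV remote"]  # placeholder label set
text = clip.tokenize(labels)
image = preprocess(Image.open("blv_photo.jpg")).unsqueeze(0)      # placeholder image path

with torch.no_grad():
    img_f = model.encode_image(image)
    txt_f = model.encode_text(text)
    img_f = img_f / img_f.norm(dim=-1, keepdim=True)
    txt_f = txt_f / txt_f.norm(dim=-1, keepdim=True)
    probs = (100 * img_f @ txt_f.T).softmax(dim=-1)

print(labels[probs.argmax().item()])
```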
+
+
+
+
+ + ♻ ☆ MuseChat: A Conversational Music Recommendation System for Videos + + +
+ Music recommendation for videos attracts growing interest in multi-modal +research. However, existing systems focus primarily on content compatibility, +often ignoring the users' preferences. Their inability to interact with users +for further refinements or to provide explanations leads to a less satisfying +experience. We address these issues with MuseChat, a first-of-its-kind +dialogue-based recommendation system that personalizes music suggestions for +videos. Our system consists of two key functionalities with associated modules: +recommendation and reasoning. The recommendation module takes a video along +with optional information including previously suggested music and the user's +preference as inputs and retrieves appropriate music matching the context. +The reasoning module, equipped with the power of Large Language Model +(Vicuna-7B) and extended to multi-modal inputs, is able to provide a reasonable +explanation for the recommended music. To evaluate the effectiveness of +MuseChat, we build a large-scale dataset, conversational music recommendation +for videos, that simulates a two-turn interaction between a user and a +recommender based on accurate music track information. Experiment results show +that MuseChat achieves significant improvements over existing video-based music +retrieval methods and offers strong interpretability and +interactability. +
+
+
+
+
+
+
+
+ + Information Retrieval 10 + +
+
+
+ + ☆ Context Retrieval via Normalized Contextual Latent Interaction for + Conversational Agent ICDM + + +
+ Conversational agents leveraging AI, particularly deep learning, are emerging +in both academic research and real-world applications. However, these +applications still face challenges, including disrespecting knowledge and +facts, not personalizing to user preferences, and enormous demand for +computational resources during training and inference. Recent research efforts +have been focused on addressing these challenges from various aspects, +including supplementing various types of auxiliary information to the +conversational agents. However, existing methods are still not able to +effectively and efficiently exploit relevant information from these auxiliary +supplements to further unleash the power of the conversational agents and the +language models they use. In this paper, we present a novel method, PK-NCLI, +that is able to accurately and efficiently identify relevant auxiliary +information to improve the quality of conversational responses by learning the +relevance among persona, chat history, and knowledge background through +low-level normalized contextual latent interaction. Our experimental results +indicate that PK-NCLI outperforms the state-of-the-art method, PK-FoCus, by +47.80%/30.61%/24.14% in terms of perplexity, knowledge grounding, and training +efficiency, respectively, and maintained the same level of persona grounding +performance. We also provide a detailed analysis of how different factors, +including language model choices and trade-offs on training weights, would +affect the performance of PK-NCLI. + +
+
+ comment: 2023 IEEE International Conference on Data Mining Workshops (ICDMW) +
+
+
+
+
+ + ☆ Rethinking Detection Based Table Structure Recognition for Visually Rich + Documents + + +
+ Table Structure Recognition (TSR) aims at transforming unstructured table +images into structured formats, such as HTML sequences. One type of popular +solution is using detection models to detect components of a table, such as +columns and rows, then applying a rule-based post-processing method to convert +detection results into HTML sequences. However, existing detection-based +studies often have the following limitations. First, these studies usually pay +more attention to improving the detection performance, which does not +necessarily lead to better performance regarding cell-level metrics, such as +TEDS. Second, some solutions over-simplify the problem and can miss some +critical information. Lastly, even though some studies defined the problem to +detect more components to provide as much information as other types of +solutions, these studies ignore the fact this problem definition is a +multi-label detection because row, projected row header and column header can +share identical bounding boxes. Besides, there is often a performance gap +between two-stage and transformer-based detection models regarding the +structure-only TEDS, even though they have similar performance regarding the +COCO metrics. Therefore, we revisit the limitations of existing detection-based +solutions, compare two-stage and transformer-based detection models, and +identify the key design aspects for the success of a two-stage detection model +for the TSR task, including the multi-class problem definition, the aspect +ratio for anchor box generation, and the feature generation of the backbone +network. We applied simple methods to improve these aspects of the Cascade +R-CNN model, achieved state-of-the-art performance, and improved the baseline +Cascade R-CNN model by 19.32%, 11.56% and 14.77% regarding the structure-only +TEDS on SciTSR, FinTabNet, and PubTables1M datasets. + +
+
+ comment: submitted to Pattern Recognition +
+
+
+
+
+ + ☆ Attack Detection Using Item Vector Shift in Matrix Factorisation + Recommenders + + +
+ This paper proposes a novel method for detecting shilling attacks in Matrix +Factorization (MF)-based Recommender Systems (RS), in which attackers use false +user-item feedback to promote a specific item. Unlike existing methods that +either use supervised learning to distinguish between attack and genuine +profiles or analyse target item rating distributions to detect false ratings, +our method uses an unsupervised technique to detect false ratings by examining +shifts in item preference vectors, exploiting rating deviations and user +characteristics, making it a promising new direction. The experimental results +demonstrate the effectiveness of our approach in various attack scenarios, +including those involving obfuscation techniques. +
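+ The abstract does not give the exact detection statistic; one plausible unsupervised sketch in this spirit flags items whose latent factors move unusually far between matrix-factorisation fits on earlier and later feedback:
```python
import numpy as np

def item_vector_shift(V_before, V_after):
    """V_before, V_after: (n_items, k) item factor matrices from two MF fits.
    Returns indices of items with unusually large cosine shift, plus all shifts."""
    num = (V_before * V_after).sum(axis=1)
    den = np.linalg.norm(V_before, axis=1) * np.linalg.norm(V_after, axis=1) + 1e-12
    shift = 1 - num / den                       # cosine distance per item
    threshold = shift.mean() + 3 * shift.std()  # simple outlier rule (illustrative)
    return np.where(shift > threshold)[0], shift
```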
+
+
+
+
+ + ☆ Event-driven Real-time Retrieval in Web Search + + +
+ Information retrieval in real-time search presents unique challenges distinct +from those encountered in classical web search. These challenges are +particularly pronounced due to the rapid change of user search intent, which is +influenced by the occurrence and evolution of breaking news events, such as +earthquakes, elections, and wars. Previous dense retrieval methods, which +primarily focused on static semantic representation, lack the capacity to +capture immediate search intent, leading to inferior performance in retrieving +the most recent event-related documents in time-sensitive scenarios. To address +this issue, this paper expands the query with event information that represents +real-time search intent. The Event information is then integrated with the +query through a cross-attention mechanism, resulting in a time-context query +representation. We further enhance the model's capacity for event +representation through multi-task training. Since publicly available datasets +such as MS-MARCO do not contain any event information on the query side and +have few time-sensitive queries, we design an automatic data collection and +annotation pipeline to address this issue, which includes ModelZoo-based Coarse +Annotation and LLM-driven Fine Annotation processes. In addition, we share the +training tricks such as two-stage training and hard negative sampling. Finally, +we conduct a set of offline experiments on a million-scale production dataset +to evaluate our approach and deploy an A/B testing in a real online system to +verify the performance. Extensive experimental results demonstrate that our +proposed approach significantly outperforms existing state-of-the-art baseline +methods. + +
+
+
+
+
+ + ☆ Hypergraph Node Representation Learning with One-Stage Message Passing + + +
+ Hypergraphs as an expressive and general structure have attracted +considerable attention from various research domains. Most existing hypergraph +node representation learning techniques are based on graph neural networks, and +thus adopt the two-stage message passing paradigm (i.e. node -> hyperedge -> +node). This paradigm only focuses on local information propagation and does not +effectively take into account global information, resulting in less optimal +representations. Our theoretical analysis of representative two-stage message +passing methods shows that, mathematically, they model different ways of local +message passing through hyperedges, and can be unified into one-stage message +passing (i.e. node -> node). However, they still only model local information. +Motivated by this theoretical analysis, we propose a novel one-stage message +passing paradigm to model both global and local information propagation for +hypergraphs. We integrate this paradigm into HGraphormer, a Transformer-based +framework for hypergraph node representation learning. HGraphormer injects the +hypergraph structure information (local information) into Transformers (global +information) by combining the attention matrix and hypergraph Laplacian. +Extensive experiments demonstrate that HGraphormer outperforms recent +hypergraph learning methods on five representative benchmark datasets on the +semi-supervised hypernode classification task, setting new state-of-the-art +performance, with accuracy improvements between 2.52% and 6.70%. Our code and +datasets are available. + +
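+ How the attention matrix and hypergraph Laplacian are combined is not detailed in the abstract; one plausible sketch, biasing global attention scores with the standard normalised hypergraph adjacency, is:
```python
import torch
import torch.nn.functional as F

def hypergraph_adjacency(H):
    """Normalised hypergraph adjacency from a float incidence matrix H (nodes x hyperedges),
    following the standard Zhou et al. construction used to define the Laplacian."""
    Dv = H.sum(dim=1).clamp(min=1)   # node degrees
    De = H.sum(dim=0).clamp(min=1)   # hyperedge degrees
    Dv_inv_sqrt = torch.diag(Dv.pow(-0.5))
    De_inv = torch.diag(1.0 / De)
    return Dv_inv_sqrt @ H @ De_inv @ H.T @ Dv_inv_sqrt

def structure_biased_attention(Q, K, V, H, lam=1.0):
    """Global self-attention over nodes, biased by local hypergraph structure."""
    A = hypergraph_adjacency(H)
    scores = Q @ K.T / Q.shape[-1] ** 0.5
    return F.softmax(scores + lam * A, dim=-1) @ V
```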
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ Agent-OM: Leveraging Large Language Models for Ontology Matching + + +
+ Ontology matching (OM) enables semantic interoperability between different +ontologies and resolves their conceptual heterogeneity by aligning related +entities. OM systems currently have two prevailing design paradigms: +conventional knowledge-based expert systems and newer machine learning-based +predictive systems. While large language models (LLMs) and LLM-based agents +have become revolutionary in data engineering and have been applied creatively +in various domains, their potential for OM remains underexplored. This study +introduces a novel agent-powered LLM-based design paradigm for OM systems. With +thoughtful consideration of several specific challenges to leverage LLMs for +OM, we propose a generic framework, namely Agent-OM, consisting of two Siamese +agents for retrieval and matching, with a set of simple prompt-based OM tools. +Our framework is implemented in a proof-of-concept system. Evaluations of three +Ontology Alignment Evaluation Initiative (OAEI) tracks over state-of-the-art OM +systems show that our system can achieve very close results to the best +long-standing performance on simple OM tasks and significantly improve the +performance on complex and few-shot OM tasks. + +
+
+ comment: 14 pages, 10 figures, 7 tables +
+
+
+
+
+ + ☆ LLM-TAKE: Theme Aware Keyword Extraction Using Large Language Models + + +
+ Keyword extraction is one of the core tasks in natural language processing. +Classic extraction models are notorious for having a short attention span, which +makes it hard for them to draw relational connections among words and +sentences that are far from each other. This, in turn, makes their usage +prohibitive for generating keywords that are inferred from the context of the +whole text. In this paper, we explore using Large Language Models (LLMs) in +generating keywords for items that are inferred from the items' textual +metadata. Our modeling framework includes several stages that refine the +results by avoiding keywords that are non-informative or sensitive +and by reducing the hallucinations common in LLMs. We call our LLM-based framework +Theme-Aware Keyword Extraction (LLM-TAKE). We propose two variations of the +framework for generating extractive and abstractive themes for products in an +e-commerce setting. We perform an extensive set of experiments on three real data +sets and show that our modeling framework can improve accuracy-based and +diversity-based metrics when compared with benchmark models. +
+
+
+
+
+ + ♻ ☆ A Novel Patent Similarity Measurement Methodology: Semantic Distance and + Technological Distance + + +
+ Patent similarity analysis plays a crucial role in evaluating the risk of +patent infringement. Nonetheless, this analysis is predominantly conducted +manually by legal experts, often resulting in a time-consuming process. Recent +advances in natural language processing technology offer a promising avenue for +automating this process. However, methods for measuring similarity between +patents still rely on experts manually classifying patents. Due to the recent +development of artificial intelligence technology, a lot of research is being +conducted focusing on the semantic similarity of patents using natural language +processing technology. However, it is difficult to accurately analyze patent +data, which are legal documents representing complex technologies, using +existing natural language processing technologies. To address these +limitations, we propose a hybrid methodology that measures the similarity +between patents by considering their semantic similarity, their technical +similarity, and their bibliographic information. Using natural +language processing techniques, we measure semantic similarity based on patent +text and calculate technical similarity through the degree of coexistence of +International Patent Classification (IPC) codes. The similarity of +bibliographic information of a patent is calculated using the special +characteristics of the patent: citation information, inventor information, and +assignee information. We propose a model that assigns reasonable weights to +each similarity method considered. With the help of experts, we performed +manual similarity evaluations on 420 pairs and evaluated the performance of our +model based on this data. We have empirically shown that our method outperforms +recent natural language processing techniques. +
+
+
+
+
+ + ♻ ☆ Robust Basket Recommendation via Noise-tolerated Graph Contrastive + Learning CIKM 2023 + + +
+ The growth of e-commerce has seen a surge in the popularity of platforms like +Amazon, eBay, and Taobao. This has given rise to a unique shopping behavior +involving baskets - sets of items purchased together. As a less studied +interaction mode in the community, the question of how shopping baskets should +complement personalized recommendation systems remains under-explored. While +previous attempts focused on jointly modeling user purchases and baskets, the +distinct semantic nature of these elements can introduce noise when directly +integrated. This noise negatively impacts the model's performance, further +exacerbated by significant noise (e.g., a user is misled to click an item or +recognizes it as uninteresting after consuming it) within both user and basket +behaviors. In order to cope with the above difficulties, we propose a novel +Basket recommendation framework via Noise-tolerated Contrastive Learning, named +BNCL, to handle the noise existing in the cross-behavior integration and +within-behavior modeling. First, we represent the basket-item interactions as +a hypergraph to model the complex basket behavior, where all items appearing +in the same basket are treated as a single hyperedge. Second, cross-behavior +contrastive learning is designed to suppress the noise during the fusion of +diverse behaviors. Next, to further inhibit the within-behavior noise of the +user and basket interactions, we propose to exploit invariant properties of the +recommenders w.r.t. augmentations through within-behavior contrastive learning. +A novel consistency-aware augmentation approach is further designed to better +identify noisy interactions with the consideration of the above two types of +interactions. Our framework BNCL offers a generic training paradigm that is +applicable to different backbones. Extensive experiments on three shopping +transaction datasets verify the effectiveness of our proposed method. +
+
+ comment: CIKM 2023 +
+
+
+
+
+ + ♻ ☆ MuseChat: A Conversational Music Recommendation System for Videos + + +
+ Music recommendation for videos attracts growing interest in multi-modal +research. However, existing systems focus primarily on content compatibility, +often ignoring the users' preferences. Their inability to interact with users +for further refinements or to provide explanations leads to a less satisfying +experience. We address these issues with MuseChat, a first-of-its-kind +dialogue-based recommendation system that personalizes music suggestions for +videos. Our system consists of two key functionalities with associated modules: +recommendation and reasoning. The recommendation module takes a video along +with optional information including previously suggested music and the user's +preference as inputs and retrieves appropriate music matching the context. +The reasoning module, equipped with the power of Large Language Model +(Vicuna-7B) and extended to multi-modal inputs, is able to provide a reasonable +explanation for the recommended music. To evaluate the effectiveness of +MuseChat, we build a large-scale dataset, conversational music recommendation +for videos, that simulates a two-turn interaction between a user and a +recommender based on accurate music track information. Experiment results show +that MuseChat achieves significant improvements over existing video-based music +retrieval methods and offers strong interpretability and +interactability. +
+
+
+
+
+
+
+
+ + Machine Learning 148 + +
+
+
+ + ☆ Making Large Multimodal Models Understand Arbitrary Visual Prompts + + +
+ While existing large vision-language multimodal models focus on whole image +understanding, there is a prominent gap in achieving region-specific +comprehension. Current approaches that use textual coordinates or spatial +encodings often fail to provide a user-friendly interface for visual prompting. +To address this challenge, we introduce a novel multimodal model capable of +decoding arbitrary visual prompts. This allows users to intuitively mark images +and interact with the model using natural cues like a "red bounding box" or +"pointed arrow". Our simple design directly overlays visual markers onto the +RGB image, eliminating the need for complex region encodings, yet achieves +state-of-the-art performance on region-understanding tasks like Visual7W, +PointQA, and Visual Commonsense Reasoning benchmark. Furthermore, we present +ViP-Bench, a comprehensive benchmark to assess the capability of models in +understanding visual prompts across multiple dimensions, enabling future +research in this domain. Code, data, and model are publicly available. + +
+
+ comment: Project page: https://vip-llava.github.io/ +
+
+
+
+
+ + ☆ Towards Generalizable Zero-Shot Manipulation via Translating Human + Interaction Plans + + +
+ We pursue the goal of developing robots that can interact zero-shot with +generic unseen objects via a diverse repertoire of manipulation skills and show +how passive human videos can serve as a rich source of data for learning such +generalist robots. Unlike typical robot learning approaches which directly +learn how a robot should act from interaction data, we adopt a factorized +approach that can leverage large-scale human videos to learn how a human would +accomplish a desired task (a human plan), followed by translating this plan to +the robot's embodiment. Specifically, we learn a human plan predictor that, +given a current image of a scene and a goal image, predicts the future hand and +object configurations. We combine this with a translation module that learns a +plan-conditioned robot manipulation policy, and allows following human plans +for generic manipulation tasks in a zero-shot manner with no deployment-time +training. Importantly, while the plan predictor can leverage large-scale human +videos for learning, the translation module only requires a small amount of +in-domain data, and can generalize to tasks not seen during training. We show +that our learned system can perform over 16 manipulation skills that generalize +to 40 objects, encompassing 100 real-world tasks for table-top manipulation and +diverse in-the-wild manipulation. https://homangab.github.io/hopman/ +
+
+ comment: Preprint. Under Review +
+
+
+
+
+ + ☆ Context Retrieval via Normalized Contextual Latent Interaction for + Conversational Agent ICDM + + +
+ Conversational agents leveraging AI, particularly deep learning, are emerging +in both academic research and real-world applications. However, these +applications still face challenges, including disrespecting knowledge and +facts, not personalizing to user preferences, and enormous demand for +computational resources during training and inference. Recent research efforts +have been focused on addressing these challenges from various aspects, +including supplementing various types of auxiliary information to the +conversational agents. However, existing methods are still not able to +effectively and efficiently exploit relevant information from these auxiliary +supplements to further unleash the power of the conversational agents and the +language models they use. In this paper, we present a novel method, PK-NCLI, +that is able to accurately and efficiently identify relevant auxiliary +information to improve the quality of conversational responses by learning the +relevance among persona, chat history, and knowledge background through +low-level normalized contextual latent interaction. Our experimental results +indicate that PK-NCLI outperforms the state-of-the-art method, PK-FoCus, by +47.80%/30.61%/24.14% in terms of perplexity, knowledge grounding, and training +efficiency, respectively, and maintained the same level of persona grounding +performance. We also provide a detailed analysis of how different factors, +including language model choices and trade-offs on training weights, would +affect the performance of PK-NCLI. + +
+
+ comment: 2023 IEEE International Conference on Data Mining Workshops (ICDMW) +
+
+
+
+
+ + ☆ Explaining Knock-on Effects of Bias Mitigation NeurIPS 2023 + + +
+ In machine learning systems, bias mitigation approaches aim to make outcomes +fairer across privileged and unprivileged groups. Bias mitigation methods work +in different ways and have known "waterfall" effects, e.g., mitigating bias at +one place may manifest bias elsewhere. In this paper, we aim to characterise +impacted cohorts when mitigation interventions are applied. To do so, we treat +intervention effects as a classification task and learn an explainable +meta-classifier to identify cohorts that have altered outcomes. We examine a +range of bias mitigation strategies that work at various stages of the model +life cycle. We empirically demonstrate that our meta-classifier is able to +uncover impacted cohorts. Further, we show that all tested mitigation +strategies negatively impact a non-trivial fraction of cases, i.e., people who +receive unfavourable outcomes solely on account of mitigation efforts. This is +despite improvement in fairness metrics. We use these results as a basis to +argue for more careful audits of static mitigation interventions that go beyond +aggregate metrics. + +
+
+ comment: This paper was accepted at NeurIPS 2023 workshop +
+
+
+
+
+ + ☆ Beyond ChatBots: ExploreLLM for Structured Thoughts and Personalized + Model Responses + + +
+ Large language model (LLM) powered chatbots are primarily text-based today, +and impose a large interactional cognitive load, especially for exploratory or +sensemaking tasks such as planning a trip or learning about a new city. Because +the interaction is textual, users have little scaffolding in the way of +structure, informational "scent", or ability to specify high-level preferences +or goals. We introduce ExploreLLM that allows users to structure thoughts, help +explore different options, navigate through the choices and recommendations, +and to more easily steer models to generate more personalized responses. We +conduct a user study and show that users find it helpful to use ExploreLLM for +exploratory or planning tasks, because it provides a useful schema-like +structure to the task, and guides users in planning. The study also suggests +that users can more easily personalize responses with high-level preferences +with ExploreLLM. Together, ExploreLLM points to a future where users interact +with LLMs beyond the form of chatbots, and instead designed to support complex +user tasks with a tighter integration between natural language and graphical +user interfaces. + +
+
+ comment: 19 pages, 11 figures +
+
+
+
+
+ + ☆ Deep Unlearning: Fast and Efficient Training-free Approach to Controlled + Forgetting + + +
+ Machine unlearning has emerged as a prominent and challenging area of +interest, driven in large part by the rising regulatory demands for industries +to delete user data upon request and the heightened awareness of privacy. +Existing approaches either retrain models from scratch or use several +finetuning steps for every deletion request, often constrained by computational +resource limitations and restricted access to the original training data. In +this work, we introduce a novel class unlearning algorithm designed to +strategically eliminate an entire class or a group of classes from the learned +model. To that end, our algorithm first estimates the Retain Space and the +Forget Space, representing the feature or activation spaces for samples from +classes to be retained and unlearned, respectively. To obtain these spaces, we +propose a novel singular value decomposition-based technique that requires +layer wise collection of network activations from a few forward passes through +the network. We then compute the shared information between these spaces and +remove it from the forget space to isolate class-discriminatory feature space +for unlearning. Finally, we project the model weights in the orthogonal +direction of the class-discriminatory space to obtain the unlearned model. We +demonstrate our algorithm's efficacy on ImageNet using a Vision Transformer +with only $\sim$1.5% drop in retain accuracy compared to the original model +while maintaining under 1% accuracy on the unlearned class samples. Further, +our algorithm consistently performs well when subject to Membership Inference +Attacks showing 7.8% improvement on average across a variety of image +classification datasets and network architectures, as compared to other +baselines while being $\sim$6x more computationally efficient. + +
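+ The abstract sketches an SVD-based construction of retain and forget spaces followed by an orthogonal projection of the weights; a heavily simplified illustration of that projection step (not the paper's algorithm) is:
```python
import torch

def discriminatory_basis(forget_acts, retain_acts, k=32):
    """forget_acts, retain_acts: (N, d) activations. Build SVD bases of both spaces,
    remove the part of the forget basis shared with the retain space, and return
    orthonormal class-discriminatory directions (a simplified sketch)."""
    Uf, _, _ = torch.linalg.svd(forget_acts.T, full_matrices=False)
    Ur, _, _ = torch.linalg.svd(retain_acts.T, full_matrices=False)
    Uf, Ur = Uf[:, :k], Ur[:, :k]
    shared = Ur @ (Ur.T @ Uf)              # component of forget basis lying in retain space
    D, _ = torch.linalg.qr(Uf - shared)    # orthonormal discriminatory directions
    return D

def project_out(W, D):
    """Project a (d_out, d) weight matrix onto the orthogonal complement of span(D)."""
    return W - (W @ D) @ D.T
```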
+
+
+
+
+ + ☆ Mamba: Linear-Time Sequence Modeling with Selective State Spaces + + +
+ Foundation models, now powering most of the exciting applications in deep +learning, are almost universally based on the Transformer architecture and its +core attention module. Many subquadratic-time architectures such as linear +attention, gated convolution and recurrent models, and structured state space +models (SSMs) have been developed to address Transformers' computational +inefficiency on long sequences, but they have not performed as well as +attention on important modalities such as language. We identify that a key +weakness of such models is their inability to perform content-based reasoning, +and make several improvements. First, simply letting the SSM parameters be +functions of the input addresses their weakness with discrete modalities, +allowing the model to selectively propagate or forget information along the +sequence length dimension depending on the current token. Second, even though +this change prevents the use of efficient convolutions, we design a +hardware-aware parallel algorithm in recurrent mode. We integrate these +selective SSMs into a simplified end-to-end neural network architecture without +attention or even MLP blocks (Mamba). Mamba enjoys fast inference (5$\times$ +higher throughput than Transformers) and linear scaling in sequence length, and +its performance improves on real data up to million-length sequences. As a +general sequence model backbone, Mamba achieves state-of-the-art performance +across several modalities such as language, audio, and genomics. On language +modeling, our Mamba-3B model outperforms Transformers of the same size and +matches Transformers twice its size, both in pretraining and downstream +evaluation. + +
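+ The key mechanism, making the SSM parameters functions of the current token, can be illustrated with a deliberately naive sequential recurrence (Mamba itself uses a hardware-aware parallel scan and a different parameterisation):
```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class SelectiveSSM(nn.Module):
    """Minimal selective state-space recurrence: step size and B/C projections
    depend on the current input token (a sketch of the idea, not Mamba itself)."""
    def __init__(self, d_model, d_state=16):
        super().__init__()
        self.A = nn.Parameter(-torch.rand(d_model, d_state))  # stable continuous-time A
        self.to_delta = nn.Linear(d_model, d_model)
        self.to_B = nn.Linear(d_model, d_state)
        self.to_C = nn.Linear(d_model, d_state)

    def forward(self, x):                      # x: (batch, length, d_model)
        b, L, d = x.shape
        h = x.new_zeros(b, d, self.A.shape[1])
        ys = []
        for t in range(L):
            xt = x[:, t]                                      # (b, d)
            delta = F.softplus(self.to_delta(xt)).unsqueeze(-1)  # input-dependent step
            A_bar = torch.exp(delta * self.A)                 # discretised, per token
            B_t = self.to_B(xt).unsqueeze(1)                  # (b, 1, d_state)
            C_t = self.to_C(xt).unsqueeze(1)                  # (b, 1, d_state)
            h = A_bar * h + delta * B_t * xt.unsqueeze(-1)    # selective state update
            ys.append((h * C_t).sum(-1))                      # (b, d)
        return torch.stack(ys, dim=1)
```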
+
+
+
+
+ + ☆ Scalable Meta-Learning with Gaussian Processes + + +
+ Meta-learning is a powerful approach that exploits historical data to quickly +solve new tasks from the same distribution. In the low-data regime, methods +based on the closed-form posterior of Gaussian processes (GP) together with +Bayesian optimization have achieved high performance. However, these methods +are either computationally expensive or introduce assumptions that hinder a +principled propagation of uncertainty between task models. This may disrupt the +balance between exploration and exploitation during optimization. In this +paper, we develop ScaML-GP, a modular GP model for meta-learning that is +scalable in the number of tasks. Our core contribution is a carefully designed +multi-task kernel that enables hierarchical training and task scalability. +Conditioning ScaML-GP on the meta-data exposes its modular nature yielding a +test-task prior that combines the posteriors of meta-task GPs. In synthetic and +real-world meta-learning experiments, we demonstrate that ScaML-GP can learn +efficiently both with few and many meta-tasks. + +
+
+
+
+
+ + ☆ Safe Reinforcement Learning in Tensor Reproducing Kernel Hilbert Space + + +
+ This paper delves into the problem of safe reinforcement learning (RL) in a +partially observable environment with the aim of achieving safe-reachability +objectives. In traditional partially observable Markov decision processes +(POMDP), ensuring safety typically involves estimating the belief in latent +states. However, accurately estimating an optimal Bayesian filter in POMDP to +infer latent states from observations in a continuous state space poses a +significant challenge, largely due to the intractable likelihood. To tackle +this issue, we propose a stochastic model-based approach that guarantees RL +safety almost surely in the face of unknown system dynamics and partial +observation environments. We leveraged the Predictive State Representation +(PSR) and Reproducing Kernel Hilbert Space (RKHS) to represent future +multi-step observations analytically, and the results in this context are +provable. Furthermore, we derived essential operators from the kernel Bayes' +rule, enabling the recursive estimation of future observations using various +operators. Under the assumption of \textit{undercompleness}, a polynomial +sample complexity is established for the RL algorithm for the infinite size of +observation and action spaces, ensuring an $\epsilon-$suboptimal safe policy +guarantee. + +
+
+
+
+
+ + ☆ Removing Biases from Molecular Representations via Information + Maximization + + +
+ High-throughput drug screening -- using cell imaging or gene expression +measurements as readouts of drug effect -- is a critical tool in biotechnology +to assess and understand the relationship between the chemical structure and +biological activity of a drug. Since large-scale screens have to be divided +into multiple experiments, a key difficulty is dealing with batch effects, +which can introduce systematic errors and non-biological associations in the +data. We propose InfoCORE, an Information maximization approach for COnfounder +REmoval, to effectively deal with batch effects and obtain refined molecular +representations. InfoCORE establishes a variational lower bound on the +conditional mutual information of the latent representations given a batch +identifier. It adaptively reweighs samples to equalize their implied batch +distribution. Extensive experiments on drug screening data reveal InfoCORE's +superior performance in a multitude of tasks including molecular property +prediction and molecule-phenotype retrieval. Additionally, we show results for +how InfoCORE offers a versatile framework and resolves general distribution +shifts and issues of data fairness by minimizing correlation with spurious +features or removing sensitive attributes. The code is available at +https://github.com/uhlerlab/InfoCORE. + +
+
+
+
+
+ + ☆ SpaCE: The Spatial Confounding Environment + + +
+ Spatial confounding poses a significant challenge in scientific studies +involving spatial data, where unobserved spatial variables can influence both +treatment and outcome, possibly leading to spurious associations. To address +this problem, we introduce SpaCE: The Spatial Confounding Environment, the +first toolkit to provide realistic benchmark datasets and tools for +systematically evaluating causal inference methods designed to alleviate +spatial confounding. Each dataset includes training data, true counterfactuals, +a spatial graph with coordinates, and smoothness and confounding scores +characterizing the effect of a missing spatial confounder. It also includes +realistic semi-synthetic outcomes and counterfactuals, generated using +state-of-the-art machine learning ensembles, following best practices for +causal inference benchmarks. The datasets cover real treatment and covariates +from diverse domains, including climate, health and social sciences. SpaCE +facilitates an automated end-to-end pipeline, simplifying data loading, +experimental setup, and evaluating machine learning and causal inference +models. The SpaCE project provides several dozens of datasets of diverse sizes +and spatial complexity. It is publicly available as a Python package, +encouraging community feedback and contributions. + +
+
+
+
+
+ + ☆ GIFT: Generative Interpretable Fine-Tuning Transformers + + +
+ We present GIFT (Generative Interpretable Fine-tuning Transformers) for +fine-tuning pretrained (often large) Transformer models on downstream tasks in +a parameter-efficient way with built-in interpretability. Our GIFT is a deep +parameter-residual learning method, which addresses two problems in fine-tuning +a pretrained Transformer model: where to apply parameter-efficient +fine-tuning (PEFT) so that it is extremely lightweight yet sufficiently expressive, and +how to learn the PEFT so that it better exploits the knowledge of the pretrained model +in a direct way. For the former, we select the final projection (linear) layer +in the multi-head self-attention of a Transformer model, and verify its +effectiveness. For the latter, in contrast to the prior art that directly +introduces new model parameters (often in low-rank approximation form) to be +learned in fine-tuning with downstream data, we propose a method for learning +to generate the fine-tuning parameters. Our GIFT is a hyper-Transformer which +takes as input the pretrained parameters of the projection layer to generate its +fine-tuning parameters using a proposed Parameter-to-Cluster Attention (PaCa). +The PaCa results in a simple clustering-based forward explainer that plays the +role of semantic segmentation in testing. In experiments, our proposed GIFT is +tested on the VTAB benchmark and the fine-grained visual classification (FGVC) +benchmark. It obtains significantly better performance than the prior art. Our +code is available at https://github.com/savadikarc/gift +
+
+ comment: 18 pages, 12 figures +
+
+
+
+
+ + ☆ Object Detector Differences when using Synthetic and Real Training Data + + +
+ To train well-performing, generalizing neural networks, sufficiently large and +diverse datasets are needed. Collecting data while adhering to privacy +legislation becomes increasingly difficult and annotating these large datasets +is both a resource-heavy and time-consuming task. An approach to overcome these +difficulties is to use synthetic data since it is inherently scalable and can +be automatically annotated. However, how training on synthetic data affects the +layers of a neural network is still unclear. In this paper, we train the YOLOv3 +object detector on real and synthetic images from city environments. We perform +a similarity analysis using Centered Kernel Alignment (CKA) to explore the +effects of training on synthetic data on a layer-wise basis. The analysis +captures the architecture of the detector while showing both different and +similar patterns between different models. With this similarity analysis we +want to give insights into how training on synthetic data affects each layer and to +give a better understanding of the inner workings of complex neural networks. +The results show that the largest similarity between a detector trained on real +data and a detector trained on synthetic data was in the early layers, and the +largest difference was in the head part. The results also show that no major +difference in performance or similarity could be seen between frozen and +unfrozen backbones. +
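+ Centered Kernel Alignment in its linear form (Kornblith et al.) reduces to a few lines and is the measure behind the layer-wise comparison described above; a minimal sketch:
```python
import numpy as np

def linear_cka(X, Y):
    """Linear CKA between activations X (n, p1) and Y (n, p2) computed on the
    same n inputs; returns a similarity in [0, 1]."""
    X = X - X.mean(axis=0, keepdims=True)
    Y = Y - Y.mean(axis=0, keepdims=True)
    hsic = np.linalg.norm(Y.T @ X, "fro") ** 2
    return hsic / (np.linalg.norm(X.T @ X, "fro") * np.linalg.norm(Y.T @ Y, "fro"))

# e.g. similarity = linear_cka(acts_real_detector, acts_synth_detector)
# where both activation matrices come from the same layer on identical inputs.
```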
+
+ comment: 27 pages. The Version of Record of this article is published in + Springer Nature Computer Science 2023, and is available online at + https://doi.org/10.1007/s42979-023-01704-5 +
+
+
+
+
+ + ☆ Towards Transparency in Coreference Resolution: A Quantum-Inspired + Approach EMNLP 2023 + + +
+ Guided by grammatical structure, words compose to form sentences, and guided +by discourse structure, sentences compose to form dialogues and documents. The +compositional aspect of sentence and discourse units is often overlooked by +machine learning algorithms. A recent initiative called Quantum Natural +Language Processing (QNLP) learns word meanings as points in a Hilbert space +and acts on them via a translation of grammatical structure into Parametrised +Quantum Circuits (PQCs). Previous work extended the QNLP translation to +discourse structure using points in a closure of Hilbert spaces. In this paper, +we evaluate this translation on a Winograd-style pronoun resolution task. We +train a Variational Quantum Classifier (VQC) for binary classification and +implement an end-to-end pronoun resolution system. The simulations executed on +IBMQ software converged with an F1 score of 87.20%. The model outperformed two +out of three classical coreference resolution systems and neared the +state-of-the-art SpanBERT. A mixed quantum-classical model further improved these +results with an F1 score increase of around 6%. + +
+
+ comment: CRAC 2023, the Sixth Workshop on Computational Models of Reference, + Anaphora and Coreference, EMNLP 2023 +
+
+
+
+
+ + ☆ CellMixer: Annotation-free Semantic Cell Segmentation of Heterogeneous + Cell Populations NeurIPS 2023 + + +
+ In recent years, several unsupervised cell segmentation methods have been +presented, trying to omit the requirement of laborious pixel-level annotations +for the training of a cell segmentation model. Most if not all of these methods +handle the instance segmentation task by focusing on the detection of different +cell instances ignoring their type. While such models prove adequate for +certain tasks, like cell counting, other applications require the +identification of each cell's type. In this paper, we present CellMixer, an +innovative annotation-free approach for the semantic segmentation of +heterogeneous cell populations. Our augmentation-based method enables the +training of a segmentation model from image-level labels of homogeneous cell +populations. Our results show that CellMixer can achieve competitive +segmentation performance across multiple cell types and imaging modalities, +demonstrating the method's scalability and potential for broader applications +in medical imaging, cellular biology, and diagnostics. + +
+
+ comment: Medical Imaging Meets NeurIPS 2023 +
+
+
+
+
+ + ☆ Nonparametric Variational Regularisation of Pretrained Transformers + + +
+ The current paradigm of large-scale pre-training and fine-tuning Transformer +large language models has led to significant improvements across the board in +natural language processing. However, such large models are susceptible to +overfitting to their training data, and as a result the models perform poorly +when the domain changes. Also, due to the model's scale, the cost of +fine-tuning the model to the new domain is large. Nonparametric Variational +Information Bottleneck (NVIB) has been proposed as a regulariser for training +cross-attention in Transformers, potentially addressing the overfitting +problem. We extend the NVIB framework to replace all types of attention +functions in Transformers, and show that existing pretrained Transformers can +be reinterpreted as Nonparametric Variational (NV) models using a proposed +identity initialisation. We then show that changing the initialisation +introduces a novel, information-theoretic post-training regularisation in the +attention mechanism, which improves out-of-domain generalisation without any +training. This success supports the hypothesis that pretrained Transformers are +implicitly NV Bayesian models. + +
+
+
+
+
+ + ☆ Resource-constrained knowledge diffusion processes inspired by human + peer learning + + +
+ We consider a setting where a population of artificial learners is given, and +the objective is to optimize aggregate measures of performance, under +constraints on training resources. The problem is motivated by the study of +peer learning in human educational systems. In this context, we study natural +knowledge diffusion processes in networks of interacting artificial learners. +By 'natural', we mean processes that reflect human peer learning where the +students' internal state and learning process are mostly opaque, and the main +degree of freedom lies in the formation of peer learning groups by a +coordinator who can potentially evaluate the learners before assigning them to +peer groups. Among other things, we empirically show that such processes indeed make +effective use of the training resources, and enable the design of modular +neural models that have the capacity to generalize without being prone to +overfitting noisy labels. + +
+
+
+
+
+ + ☆ Simple Transferability Estimation for Regression Tasks UAI2023 + + +
+ We consider transferability estimation, the problem of estimating how well +deep learning models transfer from a source to a target task. We focus on +regression tasks, which received little previous attention, and propose two +simple and computationally efficient approaches that estimate transferability +based on the negative regularized mean squared error of a linear regression +model. We prove novel theoretical results connecting our approaches to the +actual transferability of the optimal target models obtained from the transfer +learning process. Despite their simplicity, our approaches significantly +outperform existing state-of-the-art regression transferability estimators in +both accuracy and efficiency. On two large-scale keypoint regression +benchmarks, our approaches yield 12% to 36% better results on average while +being at least 27% faster than previous state-of-the-art methods. + +
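+ To make the estimator concrete: one of the proposed scores amounts to fitting a regularized linear regression from the source model's features to the target labels and reporting the negative regularized mean squared error. The sketch below is a schematic of that idea only; the exact estimator, regularization, and scaling used in the paper may differ.

```python
import numpy as np

def neg_regularized_mse(features: np.ndarray, targets: np.ndarray,
                        lam: float = 1.0) -> float:
    """Negative regularized MSE of a ridge regression fit.

    features: (n, d) source-model features extracted on target-task inputs.
    targets:  (n,) or (n, k) target-task regression labels.
    Higher (less negative) scores suggest easier transfer to the target task.
    """
    targets = targets.reshape(len(targets), -1)
    n, d = features.shape
    # Closed-form ridge solution: W = (X^T X + lam * I)^{-1} X^T Y
    gram = features.T @ features + lam * np.eye(d)
    weights = np.linalg.solve(gram, features.T @ targets)
    residuals = features @ weights - targets
    mse = np.mean(np.sum(residuals ** 2, axis=1))
    reg = lam * np.sum(weights ** 2) / n
    return -(mse + reg)

# Illustrative usage with synthetic features and keypoint-style targets.
rng = np.random.default_rng(0)
feats = rng.normal(size=(500, 128))
labels = feats[:, :4] @ rng.normal(size=(4, 2)) + 0.1 * rng.normal(size=(500, 2))
print(f"transferability score: {neg_regularized_mse(feats, labels):.4f}")
```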
+
+ comment: 23 pages, This paper published at The 39th Conference on Uncertainty + in Artificial Intelligence (UAI2023) +
+
+
+
+
+ + ☆ Machine Learning for Health symposium 2023 -- Findings track + + +
+ A collection of the accepted Findings papers that were presented at the 3rd +Machine Learning for Health symposium (ML4H 2023), which was held on December +10, 2023, in New Orleans, Louisiana, USA. ML4H 2023 invited high-quality +submissions on relevant problems in a variety of health-related disciplines +including healthcare, biomedicine, and public health. Two submission tracks +were offered: the archival Proceedings track, and the non-archival Findings +track. Proceedings were targeted at mature work with strong technical +sophistication and a high impact on health. The Findings track looked for new +ideas that could spark insightful discussion, serve as valuable resources for +the community, or enable new collaborations. Submissions to the +Proceedings track, if not accepted, were automatically considered for the +Findings track. All the manuscripts submitted to the ML4H Symposium underwent a +double-blind peer-review process. + +
+
+
+
+
+ + ☆ Hashmarks: Privacy-Preserving Benchmarks for High-Stakes AI Evaluation + + +
+ There is a growing need to gain insight into language model capabilities that +relate to sensitive topics, such as bioterrorism or cyberwarfare. However, +traditional open source benchmarks are not fit for the task, due to the +associated practice of publishing the correct answers in human-readable form. +At the same time, enforcing mandatory closed-quarters evaluations might stifle +development and erode trust. In this context, we propose hashmarking, a +protocol for evaluating language models in the open without having to disclose +the correct answers. In its simplest form, a hashmark is a benchmark whose +reference solutions have been cryptographically hashed prior to publication. +Following an overview of the proposed evaluation protocol, we go on to assess +its resilience against traditional attack vectors (e.g. rainbow table attacks), +as well as against failure modes unique to increasingly capable generative +models. + +
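+ In its simplest form, the protocol can be realized with a salted cryptographic hash of each normalized reference answer: only the (salt, digest) pairs are published, and a model response is scored by hashing it and comparing. The snippet below is a minimal sketch of that idea; the normalization rule and per-item salting policy are illustrative assumptions rather than the paper's specification.

```python
import hashlib
import secrets

def normalize(answer: str) -> str:
    # Canonicalize before hashing so trivial formatting differences do not matter.
    return " ".join(answer.lower().split())

def publish_hashmark(reference_answers: list[str]) -> list[dict]:
    """Create publishable (salt, digest) pairs; plaintext answers stay private."""
    entries = []
    for answer in reference_answers:
        salt = secrets.token_hex(16)  # per-item salt defeats precomputed rainbow tables
        digest = hashlib.sha256((salt + normalize(answer)).encode()).hexdigest()
        entries.append({"salt": salt, "digest": digest})
    return entries

def score(candidate: str, entry: dict) -> bool:
    digest = hashlib.sha256((entry["salt"] + normalize(candidate)).encode()).hexdigest()
    return digest == entry["digest"]

hashmark = publish_hashmark(["paris"])
print(score("Paris", hashmark[0]))   # True
print(score("London", hashmark[0]))  # False
```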
+
+
+
+
+ + ☆ One to beat them all: "RYU" -- a unifying framework for the + construction of safe balls + + +
+ In this paper, we put forth a novel framework (named "RYU") for the +construction of "safe" balls, i.e. regions that provably contain the dual +solution of a target optimization problem. We concentrate on the standard setup +where the cost function is the sum of two terms: a closed, proper, convex +Lipschitz-smooth function and a closed, proper, convex function. The RYU +framework is shown to generalize or improve upon all the results proposed in +the last decade for the considered family of optimization problems. + +
+
+ comment: 19 pages, 1 table +
+
+
+
+
+ + ☆ EvE: Exploiting Generative Priors for Radiance Field Enrichment + + +
+ Modeling large-scale scenes from unconstrained image collections in-the-wild +has proven to be a major challenge in computer vision. Existing methods +tackling in-the-wild neural rendering operate in a closed-world setting, where +knowledge is limited to a scene's captured images within a training set. We +propose EvE, which is, to the best of our knowledge, the first method +leveraging generative priors to improve in-the-wild scene modeling. We employ +pre-trained generative networks to enrich K-Planes representations with +extrinsic knowledge. To this end, we define an alternating training procedure +to conduct optimization guidance of K-Planes trained on the training set. We +carry out extensive experiments and verify the merit of our method on synthetic +data as well as real tourism photo collections. EvE enhances rendered scenes +with richer details and outperforms the state of the art on the task of novel +view synthesis in-the-wild. Our project page can be found at +https://eve-nvs.github.io . + +
+
+
+
+
+ + ☆ Forecasting Trends in Food Security: a Reservoir Computing Approach + + +
+ Early warning systems are an essential tool for effective humanitarian +action. Advance warnings on impending disasters facilitate timely and targeted +responses which help save lives, livelihoods, and scarce financial resources. In +this work we present a new quantitative methodology to forecast levels of food +consumption for 60 consecutive days, at the sub-national level, in four +countries: Mali, Nigeria, Syria, and Yemen. The methodology is built on +publicly available data from the World Food Programme's integrated global +hunger monitoring system which collects, processes, and displays daily updates +on key food security metrics, conflict, weather events, and other drivers of +food insecurity across 90 countries (https://hungermap.wfp.org/). In this +study, we assessed the performance of various models including ARIMA, XGBoost, +LSTMs, CNNs, and Reservoir Computing (RC), by comparing their Root Mean Squared +Error (RMSE) metrics. This comprehensive analysis spanned classical +statistical, machine learning, and deep learning approaches. Our findings +highlight Reservoir Computing as a particularly well-suited model in the field +of food security given both its notable resistance to over-fitting on limited +data samples and its efficient training capabilities. The methodology we +introduce establishes the groundwork for a global, data-driven early warning +system designed to anticipate and detect food insecurity. + +
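+ For readers unfamiliar with Reservoir Computing, the sketch below shows the core of an echo state network: a fixed random recurrent reservoir driven by the input series, with only a linear (ridge) readout trained. It is a generic illustration and not the authors' model; the toy series and hyperparameters (reservoir size, spectral radius, leak rate) are arbitrary assumptions.

```python
import numpy as np

rng = np.random.default_rng(42)

# Toy univariate series standing in for a food-consumption indicator.
t = np.arange(1000)
series = np.sin(0.05 * t) + 0.1 * rng.normal(size=t.size)

n_res, rho, leak, ridge = 300, 0.9, 0.3, 1e-4
W_in = rng.uniform(-0.5, 0.5, size=(n_res, 1))
W = rng.normal(size=(n_res, n_res))
W *= rho / np.max(np.abs(np.linalg.eigvals(W)))   # rescale to target spectral radius

def run_reservoir(u):
    """Drive the fixed reservoir with the input sequence and collect states."""
    states, x = [], np.zeros(n_res)
    for v in u:
        pre = np.tanh(W_in @ np.array([v]) + W @ x)
        x = (1 - leak) * x + leak * pre            # leaky integration
        states.append(x.copy())
    return np.array(states)

# One-step-ahead targets; only the linear readout is trained (ridge regression).
X = run_reservoir(series[:-1])
y = series[1:]
readout = np.linalg.solve(X.T @ X + ridge * np.eye(n_res), X.T @ y)

pred = X @ readout
rmse = np.sqrt(np.mean((pred - y) ** 2))
print(f"train RMSE: {rmse:.3f}")
```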
+
+ comment: 22 pages, 11 figures +
+
+
+
+
+ + ☆ Practical Path-based Bayesian Optimization NeurIPS 2023 + + +
+ There has been a surge in interest in data-driven experimental design with +applications to chemical engineering and drug manufacturing. Bayesian +optimization (BO) has proven to be adaptable to such cases, since we can model +the reactions of interest as expensive black-box functions. Sometimes, the cost +of these black-box functions can be separated into two parts: (a) the cost of +the experiment itself, and (b) the cost of changing the input parameters. In +this short paper, we extend the SnAKe algorithm to deal with both types of +costs simultaneously. We further propose extensions to the case of a maximum +allowable input change, as well as to the multi-objective setting. + +
+
+ comment: 6 main pages, 12 with references and appendix. 4 figures, 2 tables. + To appear in NeurIPS 2023 Workshop on Adaptive Experimental Design and Active + Learning in the Real World +
+
+
+
+
+ + ☆ Investigating a domain adaptation approach for integrating different + measurement instruments in a longitudinal clinical registry + + +
+ In a longitudinal clinical registry, different measurement instruments might +have been used for assessing individuals at different time points. To combine +them, we investigate deep learning techniques for obtaining a joint latent +representation, to which the items of different measurement instruments are +mapped. This corresponds to domain adaptation, an established concept in +computer science for image data. Using the proposed approach as an example, we +evaluate the potential of domain adaptation in a longitudinal cohort setting +with a rather small number of time points, motivated by an application with +different motor function measurement instruments in a registry of spinal +muscular atrophy (SMA) patients. There, we model trajectories in the latent +representation by ordinary differential equations (ODEs), where person-specific +ODE parameters are inferred from baseline characteristics. The goodness of fit +and complexity of the ODE solutions then allow us to judge the measurement +instrument mappings. We subsequently explore how alignment can be improved by +incorporating corresponding penalty terms into model fitting. To systematically +investigate the effect of differences between measurement instruments, we +consider several scenarios based on modified SMA data, including scenarios +where a mapping should be feasible in principle and scenarios where no perfect +mapping is available. While misalignment increases in more complex scenarios, +some structure is still recovered, even if the availability of measurement +instruments depends on patient state. A reasonable mapping is feasible also in +the more complex real SMA dataset. These results indicate that domain +adaptation might be more generally useful in statistical modeling for +longitudinal registry data. + +
+
+ comment: 18 pages, 4 figures +
+
+
+
+
+ + ☆ Improving Plasticity in Online Continual Learning via Collaborative + Learning + + +
+ Online Continual Learning (CL) solves the problem of learning the +ever-emerging new classification tasks from a continuous data stream. Unlike +its offline counterpart, in online CL, the training data can only be seen once. +Most existing online CL research regards catastrophic forgetting (i.e., model +stability) as almost the only challenge. In this paper, we argue that the +model's capability to acquire new knowledge (i.e., model plasticity) is another +challenge in online CL. While replay-based strategies have been shown to be +effective in alleviating catastrophic forgetting, there is a notable gap in +research attention toward improving model plasticity. To this end, we propose +Collaborative Continual Learning (CCL), a collaborative learning based strategy +to improve the model's capability in acquiring new concepts. Additionally, we +introduce Distillation Chain (DC), a novel collaborative learning scheme to +boost the training of the models. We adapted CCL-DC to existing representative +online CL works. Extensive experiments demonstrate that even if the learners +are well-trained with state-of-the-art online CL methods, our strategy can +still improve model plasticity dramatically, and thereby improve the overall +performance by a large margin. + +
+
+ comment: Under review +
+
+
+
+
+ + ☆ Tracking Object Positions in Reinforcement Learning: A Metric for + Keypoint Detection (extended version) + + +
+ Reinforcement learning (RL) for robot control typically requires a detailed +representation of the environment state, including information about +task-relevant objects not directly measurable. Keypoint detectors, such as +spatial autoencoders (SAEs), are a common approach to extracting a +low-dimensional representation from high-dimensional image data. SAEs aim at +spatial features such as object positions, which are often useful +representations in robotic RL. However, whether an SAE is actually able to +track objects in the scene and thus yields a spatial state representation well +suited for RL tasks has rarely been examined due to a lack of established +metrics. In this paper, we propose to assess the performance of an SAE instance +by measuring how well keypoints track ground truth objects in images. We +present a computationally lightweight metric and use it to evaluate common +baseline SAE architectures on image data from a simulated robot task. We find +that common SAEs differ substantially in their spatial extraction capability. +Furthermore, we validate that SAEs that perform well in our metric achieve +superior performance when used in downstream RL. Thus, our metric is an +effective and lightweight indicator of RL performance before executing +expensive RL training. Building on these insights, we identify three key +modifications of SAE architectures to improve tracking performance. We make our +code available at anonymous.4open.science/r/sae-rl. + +
+
+
+
+
+ + ☆ Explainable Fraud Detection with Deep Symbolic Classification + + +
+ There is a growing demand for explainable, transparent, and data-driven +models within the domain of fraud detection. Decisions made by fraud detection +models need to be explainable in the event of a customer dispute. Additionally, +the decision-making process in the model must be transparent to win the trust +of regulators and business stakeholders. At the same time, fraud detection +solutions can benefit from data due to the noisy, dynamic nature of fraud and +the availability of large historical data sets. Finally, fraud detection is +notorious for its class imbalance: there are typically several orders of +magnitude more legitimate transactions than fraudulent ones. In this paper, we +present Deep Symbolic Classification (DSC), an extension of the Deep Symbolic +Regression framework to classification problems. DSC casts classification as a +search problem in the space of all analytic functions composed of a vocabulary +of variables, constants, and operations and optimizes for an arbitrary +evaluation metric directly. The search is guided by a deep neural network +trained with reinforcement learning. Because the functions are mathematical +expressions that are in closed-form and concise, the model is inherently +explainable both at the level of a single classification decision and the +model's decision process. Furthermore, the class imbalance problem is +successfully addressed by optimizing for metrics that are robust to class +imbalance such as the F1 score. This eliminates the need for oversampling and +undersampling techniques that plague traditional approaches. Finally, the model +allows one to explicitly balance prediction accuracy and +explainability. An evaluation on the PaySim data set demonstrates competitive +predictive performance with state-of-the-art models, while surpassing them in +terms of explainability. This establishes DSC as a promising model for fraud +detection systems. + +
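+ To make the idea concrete, the sketch below evaluates one candidate closed-form expression as a classifier and scores it with F1, the kind of imbalance-robust reward such a search could maximize. The expression, the feature names, the threshold rule, and the synthetic data are made up for illustration and are not taken from DSC or PaySim.

```python
import numpy as np

def f1_score(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    """F1 score computed from scratch; robust reward under class imbalance."""
    tp = np.sum((y_pred == 1) & (y_true == 1))
    fp = np.sum((y_pred == 1) & (y_true == 0))
    fn = np.sum((y_pred == 0) & (y_true == 1))
    if tp == 0:
        return 0.0
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    return 2 * precision * recall / (precision + recall)

def candidate_expression(x: np.ndarray) -> np.ndarray:
    # A hypothetical concise expression over two transaction features,
    # e.g. amount (x[:, 0]) and time since last transaction (x[:, 1]).
    return np.log1p(x[:, 0]) - 0.8 * np.sqrt(x[:, 1])

rng = np.random.default_rng(0)
X = rng.exponential(scale=[100.0, 5.0], size=(10_000, 2))
y = (rng.random(10_000) < 0.01).astype(int)                # ~1% positives: heavy imbalance

scores = candidate_expression(X)
y_hat = (scores > np.quantile(scores, 0.99)).astype(int)   # flag the top 1% of scores
print(f"reward (F1) for this candidate: {f1_score(y, y_hat):.3f}")
```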
+
+ comment: 12 pages, 3 figures, To be published in the 3rd International + Workshop on Explainable AI in Finance of the 4th ACM International Conference + on AI in Finance (ICAIF, https://ai-finance.org/) +
+
+
+
+
+ + ☆ Adaptive Parameter-Free Robust Learning using Latent Bernoulli Variables + + +
+ We present an efficient parameter-free approach for statistical learning from +corrupted training sets. We identify corrupted and non-corrupted samples using +latent Bernoulli variables, and therefore formulate the robust learning problem +as maximization of the likelihood where latent variables are marginalized out. +The resulting optimization problem is solved via variational inference using an +efficient Expectation-Maximization based method. The proposed approach improves +over the state-of-the-art by automatically inferring the corruption level and +identifying outliers, while adding minimal computational overhead. We +demonstrate our robust learning method on a wide variety of machine learning +tasks including online learning and deep learning where it exhibits the ability to +adapt to different levels of noise and attain high prediction accuracy. + +
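+ A toy version of the idea: each sample carries a latent Bernoulli indicator of being corrupted; EM alternates between inferring the posterior probability of corruption and refitting the model with those probabilities as weights, while the corruption level itself is re-estimated. The Gaussian-inlier / flat-outlier mixture below is a deliberate simplification for illustration, not the paper's general variational formulation.

```python
import numpy as np
from scipy.stats import norm

rng = np.random.default_rng(1)
x = np.concatenate([rng.normal(0.0, 1.0, size=900),       # clean samples
                    rng.normal(8.0, 3.0, size=100)])       # 10% corrupted samples

mu, sigma, eps = x.mean(), x.std(), 0.5                    # crude initialization
outlier_density = 1.0 / (x.max() - x.min())                # flat "corruption" density

for _ in range(100):
    # E-step: posterior probability that each sample's latent Bernoulli
    # variable indicates corruption.
    p_in = (1.0 - eps) * norm.pdf(x, mu, sigma)
    p_out = eps * outlier_density
    r = p_out / (p_in + p_out)
    # M-step: refit the inlier model with (1 - r) as weights; update corruption level.
    w = 1.0 - r
    mu = np.sum(w * x) / np.sum(w)
    sigma = np.sqrt(np.sum(w * (x - mu) ** 2) / np.sum(w))
    eps = r.mean()

print(f"inlier mean ~ {mu:.2f}, std ~ {sigma:.2f}, inferred corruption level ~ {eps:.2f}")
```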
+
+ comment: 14 pages, 14 figures, and 2 tables +
+
+
+
+
+ + ☆ Pathway to a fully data-driven geotechnics: lessons from materials + informatics + + +
+ This paper elucidates the challenges and opportunities inherent in +integrating data-driven methodologies into geotechnics, drawing inspiration +from the success of materials informatics. Highlighting the intricacies of soil +complexity, heterogeneity, and the lack of comprehensive data, the discussion +underscores the pressing need for community-driven database initiatives and +open science movements. By leveraging the transformative power of deep +learning, particularly in feature extraction from high-dimensional data and the +potential of transfer learning, we envision a paradigm shift towards a more +collaborative and innovative geotechnics field. The paper concludes with a +forward-looking stance, emphasizing the revolutionary potential brought about +by advanced computational tools like large language models in reshaping +geotechnics informatics. + +
+
+
+
+
+ + ☆ Interior Point Constrained Reinforcement Learning with Global + Convergence Guarantees + + +
+ We consider discounted infinite horizon constrained Markov decision processes +(CMDPs) where the goal is to find an optimal policy that maximizes the expected +cumulative reward subject to expected cumulative constraints. Motivated by the +application of CMDPs in online learning of safety-critical systems, we focus on +developing an algorithm that ensures constraint satisfaction during learning. +To this end, we develop a zeroth-order interior point approach based on the log +barrier function of the CMDP. Under the commonly assumed conditions of Fisher +non-degeneracy and bounded transfer error of the policy parameterization, we +establish the theoretical properties of the algorithm. In particular, in +contrast to existing CMDP approaches that ensure policy feasibility only upon +convergence, our algorithm guarantees feasibility of the policies during the +learning process and converges to the optimal policy with a sample complexity +of $O(\varepsilon^{-6})$. In comparison to the state-of-the-art policy +gradient-based algorithm, C-NPG-PDA, our algorithm requires an additional +$O(\varepsilon^{-2})$ samples to ensure policy feasibility during learning with the +same Fisher-non-degenerate parameterization. + +
+
+ comment: 34 pages, no figures +
+
+
+
+
+ + ☆ Domain Adaptive Imitation Learning with Visual Observation NeurIPS 2023 + + +
+ In this paper, we consider domain-adaptive imitation learning with visual +observation, where an agent in a target domain learns to perform a task by +observing expert demonstrations in a source domain. Domain adaptive imitation +learning arises in practical scenarios where a robot, receiving visual sensory +data, needs to mimic movements by visually observing other robots from +different angles or observing robots of different shapes. To overcome the +domain shift in cross-domain imitation learning with visual observation, we +propose a novel framework for extracting domain-independent behavioral features +from input observations that can be used to train the learner, based on dual +feature extraction and image reconstruction. Empirical results demonstrate that +our approach outperforms previous algorithms for imitation learning from visual +observation with domain shift. + +
+
+ comment: Accepted to NeurIPS 2023 +
+
+
+
+
+ + ☆ Target-agnostic Source-free Domain Adaptation for Regression Tasks ICDE 2024 + + +
+ Unsupervised domain adaptation (UDA) seeks to bridge the domain gap between +the target and source using unlabeled target data. Source-free UDA removes the +requirement for labeled source data at the target to preserve data privacy and +storage. However, existing work on source-free UDA assumes knowledge of the domain gap +distribution, and hence is limited to either target-aware settings or classification +tasks. To overcome this limitation, we propose TASFAR, a novel target-agnostic source-free +domain adaptation approach for regression tasks. Using prediction confidence, +TASFAR estimates a label density map as the target label distribution, which is +then used to calibrate the source model on the target domain. We have conducted +extensive experiments on four regression tasks with various domain gaps, +namely, pedestrian dead reckoning for different users, image-based people +counting in different scenes, housing-price prediction at different districts, +and taxi-trip duration prediction from different departure points. TASFAR is +shown to substantially outperform state-of-the-art source-free UDA +approaches, reducing errors by 22% on average across the four tasks, and to achieve +accuracy notably comparable to that of source-based UDA without using source data. + +
+
+ comment: Accepted by ICDE 2024 +
+
+
+
+
+ + ☆ A Preconditioned Interior Point Method for Support Vector Machines Using + an ANOVA-Decomposition and NFFT-Based Matrix-Vector Products + + +
+ In this paper we consider the numerical solution to the soft-margin support +vector machine optimization problem. This problem is typically solved using the +SMO algorithm, given the high computational complexity of traditional +optimization algorithms when dealing with large-scale kernel matrices. In this +work, we propose employing an NFFT-accelerated matrix-vector product using an +ANOVA decomposition for the feature space that is used within an interior point +method for the overall optimization problem. As this method requires the +solution of a linear system of saddle point form we suggest a preconditioning +approach that is based on low-rank approximations of the kernel matrix together +with a Krylov subspace solver. We compare the accuracy of the ANOVA-based +kernel with the default LIBSVM implementation. We investigate the performance +of the different preconditioners as well as the accuracy of the ANOVA kernel on +several large-scale datasets. + +
+
+ comment: Official Code https://github.com/wagnertheresa/NFFTSVMipm +
+
+
+
+
+ + ☆ RIS-Based On-the-Air Semantic Communications -- a Diffractional Deep + Neural Network Approach + + +
+ Semantic communication has gained significant attention recently due to its +advantages in achieving higher transmission efficiency by focusing on semantic +information instead of bit-level information. However, current AI-based +semantic communication methods require digital hardware for implementation. +With the rapid advancement of reconfigurable intelligent surfaces (RISs), a +new approach called on-the-air diffractional deep neural networks (D$^2$NN) can +be utilized to enable semantic communications on the wave domain. This paper +proposes a new paradigm of RIS-based on-the-air semantic communications, where +the computational process occurs inherently as wireless signals pass through +RISs. We present the system model and discuss the data and control flows of +this scheme, followed by a performance analysis using image transmission as an +example. In comparison to traditional hardware-based approaches, RIS-based +semantic communications offer appealing features, such as light-speed +computation, low computational power requirements, and the ability to handle +multiple tasks simultaneously. + +
+
+ comment: 17 pages, 5 figures, accepted by IEEE WCM +
+
+
+
+
+ + ☆ Spatio-Temporal-Decoupled Masked Pre-training for Traffic Forecasting + + +
+ Accurate forecasting of multivariate traffic flow time series remains +challenging due to substantial spatio-temporal heterogeneity and complex +long-range correlative patterns. To address this, we propose +Spatio-Temporal-Decoupled Masked Pre-training (STD-MAE), a novel framework that +employs masked autoencoders to learn and encode complex spatio-temporal +dependencies via pre-training. Specifically, we use two decoupled masked +autoencoders to reconstruct the traffic data along spatial and temporal axes +using a self-supervised pre-training approach. These mask reconstruction +mechanisms capture the long-range correlations in space and time separately. +The learned hidden representations are then used to augment the downstream +spatio-temporal traffic predictor. A series of quantitative and qualitative +evaluations on four widely-used traffic benchmarks (PEMS03, PEMS04, PEMS07, and +PEMS08) are conducted to verify the state-of-the-art performance, with STD-MAE +explicitly enhancing the downstream spatio-temporal models' ability to capture +long-range intricate spatial and temporal patterns. Codes are available at +https://github.com/Jimmy-7664/STD_MAE. + +
+
+
+
+
+ + ☆ Bayesian causal discovery from unknown general interventions + + +
+ We consider the problem of learning causal Directed Acyclic Graphs (DAGs) +using combinations of observational and interventional experimental data. +Current methods tailored to this setting assume that interventions either +destroy parent-child relations of the intervened (target) nodes or only alter +such relations without modifying the parent sets, even when the intervention +targets are unknown. We relax this assumption by proposing a Bayesian method +for causal discovery from general interventions, which allow for modifications +of the parent sets of the unknown targets. Even in this framework, DAGs and +general interventions may be identifiable only up to some equivalence classes. +We provide graphical characterizations of such interventional Markov +equivalence and devise compatible priors for Bayesian inference that guarantee +score equivalence of indistinguishable structures. We then develop a Markov +Chain Monte Carlo (MCMC) scheme to approximate the posterior distribution over +DAGs, intervention targets and induced parent sets. Finally, we evaluate the +proposed methodology on both simulated and real protein expression data. + +
+
+
+
+
+ + ☆ VEXIR2Vec: An Architecture-Neutral Embedding Framework for Binary + Similarity + + +
+ We propose VEXIR2Vec, a code embedding framework for finding similar +functions in binaries. Our representations rely on VEX IR, the intermediate +representation used by binary analysis tools like Valgrind and angr. Our +proposed embeddings encode both syntactic and semantic information to represent +a function, and are both application- and architecture-independent. We also +propose POV, a custom Peephole Optimization engine that normalizes the VEX IR +for effective similarity analysis. We design several optimizations like +copy/constant propagation, constant folding, common subexpression elimination +and load-store elimination in POV. + We evaluate our framework on two experiments -- diffing and searching -- +involving binaries targeting different architectures, compiled using different +compilers and versions, optimization sequences, and obfuscations. We show +results on several standard projects and on real-world vulnerabilities. Our +results show that VEXIR2Vec achieves superior precision and recall values +compared to the state-of-the-art works. Our framework is highly scalable and is +built as a multi-threaded, parallel library using only open-source tools. +VEXIR2Vec achieves about a $3.2 \times$ speedup over the closest competitor, and +orders-of-magnitude speedups over other tools. + +
+
+
+
+
+ + ☆ On the Out-Of-Distribution Robustness of Self-Supervised Representation + Learning for Phonocardiogram Signals + + +
+ Objective: Despite the recent increase in research activity, deep-learning +models have not yet been widely accepted in medicine. The shortage of +high-quality annotated data often hinders the development of robust and +generalizable models, which do not suffer from degraded effectiveness when +presented with newly-collected, out-of-distribution (OOD) datasets. Methods: +Contrastive Self-Supervised Learning (SSL) offers a potential solution to the +scarcity of labeled data as it takes advantage of unlabeled data to increase +model effectiveness and robustness. In this research, we propose applying +contrastive SSL for detecting abnormalities in phonocardiogram (PCG) samples by +learning a generalized representation of the signal. Specifically, we perform +an extensive comparative evaluation of a wide range of audio-based +augmentations and evaluate trained classifiers on multiple datasets across +different downstream tasks. Results: We experimentally demonstrate that, +depending on its training distribution, the effectiveness of a fully-supervised +model can degrade up to 32% when evaluated on unseen data, while SSL models +only lose up to 10% or even improve in some cases. Conclusions: Contrastive SSL +pretraining can assist in providing robust classifiers which can generalize to +unseen, OOD data, without relying on time- and labor-intensive annotation +processes by medical experts. Furthermore, the proposed extensive evaluation +protocol sheds light on the most promising and appropriate augmentations for +robust PCG signal processing. Significance: We provide researchers and +practitioners with a roadmap towards producing robust models for PCG +classification, in addition to an open-source codebase for developing novel +approaches. + +
+
+ comment: PREPRINT Manuscript under review +
+
+
+
+
+ + ☆ REDUCR: Robust Data Downsampling Using Class Priority Reweighting + + +
+ Modern machine learning models are becoming increasingly expensive to train +for real-world image and text classification tasks, where massive web-scale +data is collected in a streaming fashion. To reduce the training cost, online +batch selection techniques have been developed to choose the most informative +datapoints. However, these techniques can suffer from poor worst-class +generalization performance due to class imbalance and distributional shifts. +This work introduces REDUCR, a robust and efficient data downsampling method +that uses class priority reweighting. REDUCR reduces the training data while +preserving worst-class generalization performance. REDUCR assigns priority +weights to datapoints in a class-aware manner using an online learning +algorithm. We demonstrate the data efficiency and robust performance of REDUCR +on vision and text classification tasks. On web-scraped datasets with +imbalanced class distributions, REDUCR significantly improves worst-class test +accuracy (and average accuracy), surpassing state-of-the-art methods by around +15%. + +
+
+ comment: Preprint +
+
+
+
+
+ + ☆ Backbone-based Dynamic Graph Spatio-Temporal Network for Epidemic + Forecasting + + +
+ Accurate epidemic forecasting is a critical task in controlling disease +transmission. Many deep learning-based models focus only on static or dynamic +graphs when constructing spatial information, ignoring their relationship. +Additionally, these models often rely on recurrent structures, which can lead +to error accumulation and computational time consumption. To address the +aforementioned problems, we propose a novel model called Backbone-based Dynamic +Graph Spatio-Temporal Network (BDGSTN). Intuitively, the continuous and smooth +changes in graph structure make adjacent graph structures share a basic +pattern. To capture this property, we use adaptive methods to generate static +backbone graphs containing the primary information and temporal models to +generate dynamic temporal graphs of epidemic data, fusing them to generate a +backbone-based dynamic graph. To overcome potential limitations associated with +recurrent structures, we introduce a linear model DLinear to handle temporal +dependencies and combine it with dynamic graph convolution for epidemic +forecasting. Extensive experiments on two datasets demonstrate that BDGSTN +outperforms baseline models and ablation comparison further verifies the +effectiveness of model components. Furthermore, we analyze and measure the +significance of backbone and temporal graphs by using information metrics from +different aspects. Finally, we compare model parameter volume and training time +to confirm the superior complexity and efficiency of BDGSTN. + +
+
+
+
+
+ + ☆ MultiView Independent Component Analysis with Delays + + +
+ Linear Independent Component Analysis (ICA) is a blind source separation +technique that has been used in various domains to identify independent latent +sources from observed signals. In order to obtain a higher signal-to-noise +ratio, the presence of multiple views of the same sources can be used. In this +work, we present MultiView Independent Component Analysis with Delays (MVICAD). +This algorithm builds on the MultiView ICA model by allowing sources to be +delayed versions of some shared sources: sources are shared across views up to +some unknown latencies that are view- and source-specific. Using simulations, +we demonstrate that MVICAD leads to better unmixing of the sources. Moreover, +as ICA is often used in neuroscience, we show that latencies are age-related +when applied to Cam-CAN, a large-scale magnetoencephalography (MEG) dataset. +These results demonstrate that the MVICAD model can reveal rich effects on +neural signals without human supervision. + +
+
+
+
+
+ + ☆ Interpretable Meta-Learning of Physical Systems + + +
+ Machine learning methods can be a valuable aid in the scientific process, but +they need to face challenging settings where data come from inhomogeneous +experimental conditions. Recent meta-learning methods have made significant +progress in multi-task learning, but they rely on black-box neural networks, +resulting in high computational costs and limited interpretability. Leveraging +the structure of the learning problem, we argue that multi-environment +generalization can be achieved using a simpler learning model, with an affine +structure with respect to the learning task. Crucially, we prove that this +architecture can identify the physical parameters of the system, enabling +interpretable learning. We demonstrate the competitive generalization +performance and the low computational cost of our method by comparing it to +state-of-the-art algorithms on physical systems, ranging from toy models to +complex, non-analytical systems. The interpretability of our method is +illustrated with original applications to physical-parameter-induced adaptation +and to adaptive control. + +
+
+
+
+
+ + ☆ A Bayesian approach for prompt optimization in pre-trained language + models + + +
+ A prompt is a sequence of symbols or tokens, selected from a vocabulary +according to some rule, which is prepended/concatenated to a textual query. A +key problem is how to select the sequence of tokens: in this paper we formulate +it as a combinatorial optimization problem. The high dimensionality of the +token space, compounded by the length of the prompt sequence, requires a very +efficient solution. We propose a Bayesian optimization method, +executed in a continuous embedding of the combinatorial space. We +focus on hard prompt tuning (HPT), which directly searches for discrete +tokens to be added to the text input without requiring access to the large +language model (LLM) and can also be used when the LLM is available only as a +black box. This is critically important if LLMs are made available in the Model +as a Service (MaaS) manner, as in GPT-4. The current manuscript is focused on +the optimization of discrete prompts for classification tasks. Discrete +prompts give rise to a difficult combinatorial optimization problem which easily +becomes intractable given the dimension of the token space in realistic +applications. The optimization method considered in this paper is Bayesian +optimization (BO), which has become the dominant approach in black-box +optimization for its sample efficiency along with its modular structure and +versatility. We use BoTorch, a library for Bayesian optimization +research built on top of PyTorch. Albeit preliminary and obtained using a +'vanilla' version of BO, the experiments with RoBERTa on six benchmarks show +good performance across a variety of tasks and enable an analysis of the +trade-off between the size of the search space, accuracy, and wall-clock time. + +
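+ A schematic of the overall loop, not the authors' BoTorch pipeline: search in a continuous embedding of token sequences, fit a Gaussian process to observed (embedding, score) pairs, pick the next candidate by expected improvement, and map it back to the nearest discrete tokens for evaluation. The sketch below uses scikit-learn's GP as a stand-in for BoTorch, and the embedding matrix, scorer, and prompt length are hypothetical placeholders.

```python
import numpy as np
from scipy.stats import norm
from sklearn.gaussian_process import GaussianProcessRegressor

rng = np.random.default_rng(0)
vocab_size, dim, prompt_len = 500, 16, 4
embeddings = rng.normal(size=(vocab_size, dim))            # stand-in token embeddings

def nearest_tokens(z):
    """Project a continuous point back to a discrete prompt (one token per slot)."""
    z = z.reshape(prompt_len, dim)
    return [int(np.argmin(np.linalg.norm(embeddings - row, axis=1))) for row in z]

def black_box_score(token_ids):
    """Hypothetical downstream score of the prompt; replace with a real evaluation."""
    return -np.sum((np.array(token_ids) - 42) ** 2) / 1e4

# Initial random design in the continuous space.
Z = rng.normal(size=(8, prompt_len * dim))
y = np.array([black_box_score(nearest_tokens(z)) for z in Z])

gp = GaussianProcessRegressor(normalize_y=True)
for _ in range(20):
    gp.fit(Z, y)
    cand = rng.normal(size=(256, prompt_len * dim))         # random candidate pool
    mu, std = gp.predict(cand, return_std=True)
    imp = mu - y.max()
    ei = imp * norm.cdf(imp / (std + 1e-9)) + std * norm.pdf(imp / (std + 1e-9))
    z_next = cand[np.argmax(ei)]                             # expected improvement
    Z = np.vstack([Z, z_next])
    y = np.append(y, black_box_score(nearest_tokens(z_next)))

print("best score found:", y.max())
```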
+
+
+
+
+ + ☆ Auto-encoding GPS data to reveal individual and collective behaviour + + +
+ We propose an innovative and generic methodology to analyse individual and +collective behaviour through individual trajectory data. The work is motivated +by the analysis of GPS trajectories of fishing vessels collected from +regulatory tracking data in the context of marine biodiversity conservation and +ecosystem-based fisheries management. We build a low-dimensional latent +representation of trajectories using convolutional neural networks as +non-linear mapping. This is done by training a conditional variational +auto-encoder taking into account covariates. The posterior distributions of the +latent representations can be linked to the characteristics of the actual +trajectories. The latent distributions of the trajectories are compared with +the Bhattacharyya coefficient, which is well-suited for comparing +distributions. Using this coefficient, we analyse the variation of the +individual behaviour of each vessel during time. For collective behaviour +analysis, we build proximity graphs and use an extension of the stochastic +block model for multiple networks. This model results in a clustering of the +individuals based on their set of trajectories. The application to French +fishing vessels enables us to obtain groups of vessels whose individual and +collective behaviours exhibit spatio-temporal patterns over the period +2014-2018. + +
+
+
+
+
+ + ☆ Meta-Diversity Search in Complex Systems, A Recipe for Artificial + Open-Endedness ? + + +
+ Can we build an artificial system that would be able to generate endless +surprises if run "forever" in Minecraft? While there is not a single path +toward solving that grand challenge, this article presents what we believe to +be some working ingredients for the endless generation of novel increasingly +complex artifacts in Minecraft. Our framework for an open-ended system includes +two components: a complex system used to recursively grow and complexify +artifacts over time, and a discovery algorithm that leverages the concept of +meta-diversity search. Since complex systems have been shown to enable the emergence +of considerable complexity from a set of simple rules, we believe them to be +great candidates to generate all sorts of artifacts in Minecraft. Yet, the space +of possible artifacts that can be generated by these systems is often unknown, +challenging to characterize and explore. Therefore automating the long-term +discovery of novel and increasingly complex artifacts in these systems is an +exciting research field. To approach these challenges, we formulate the problem +of meta-diversity search where an artificial "discovery assistant" +incrementally learns a diverse set of representations to characterize behaviors +and searches to discover diverse patterns within each of them. A successful +discovery assistant should continuously seek novel sources of diversity +while being able to quickly specialize the search toward a new unknown type of +diversity. To implement those ideas in the Minecraft environment, we simulate +an artificial "chemistry" system based on the Lenia continuous cellular automaton +for generating artifacts, as well as an artificial "discovery assistant" +(called Holmes) for the artifact-discovery process. Holmes incrementally learns +a hierarchy of modular representations to characterize divergent sources of +diversity and uses a goal-based intrinsically-motivated exploration as the +diversity search strategy. + +
+
+
+
+
+ + ☆ An Encoding Framework for Binarized Images using HyperDimensional + Computing + + +
+ Hyperdimensional Computing (HDC) is a brain-inspired and light-weight machine +learning method. It has received significant attention in the literature as a +candidate to be applied in the wearable internet of things, near-sensor +artificial intelligence applications and on-device processing. HDC is +computationally less complex than traditional deep learning algorithms and +typically achieves moderate to good classification performance. A key aspect +that determines the performance of HDC is the encoding of the input data to the +hyperdimensional (HD) space. This article proposes a novel light-weight +approach relying only on native HD arithmetic vector operations to encode +binarized images that preserves similarity of patterns at nearby locations by +using point of interest selection and local linear mapping. The method reaches +an accuracy of 97.35% on the test set for the MNIST data set and 84.12% for the +Fashion-MNIST data set. These results outperform other studies using baseline +HDC with different encoding approaches and are on par with more complex hybrid +HDC models. The proposed encoding approach also demonstrates a higher +robustness to noise and blur compared to the baseline encoding. + +
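+ For context, a baseline HDC image encoder (not the paper's point-of-interest selection and local linear mapping scheme) binds a random position hypervector with a value hypervector for each pixel and bundles the results; classes are prototypes formed by bundling training encodings and queried by cosine similarity. A minimal bipolar sketch, with toy data in place of MNIST:

```python
import numpy as np

rng = np.random.default_rng(0)
D, H, W = 10_000, 28, 28                          # hypervector dimension, image size

pos_hvs = rng.choice([-1, 1], size=(H * W, D))    # one random hypervector per pixel position
val_hvs = rng.choice([-1, 1], size=(2, D))        # hypervectors for pixel values 0 and 1

def encode(img: np.ndarray) -> np.ndarray:
    """Encode a binarized image: bind position with value, bundle, then binarize."""
    flat = img.reshape(-1).astype(int)
    bound = pos_hvs * val_hvs[flat]               # elementwise (bipolar) binding
    bundled = bound.sum(axis=0)
    return np.where(bundled >= 0, 1, -1)          # majority (sign) bundling

def cosine(a, b):
    return a @ b / (np.linalg.norm(a) * np.linalg.norm(b))

# Toy "training": class prototypes are bundles of encoded training images.
imgs = (rng.random((10, H, W)) > 0.5).astype(int)
labels = np.tile([0, 1], 5)
prototypes = [np.sign(sum(encode(im) for im, l in zip(imgs, labels) if l == c) + 0.1)
              for c in (0, 1)]

query = imgs[0]
pred = int(np.argmax([cosine(encode(query), p) for p in prototypes]))
print("predicted class:", pred)
```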
+
+
+
+
+ + ☆ PEFTDebias : Capturing debiasing information using PEFTs EMNLP 2023 + + +
+ The increasing use of foundation models highlights the urgent need to address +and eliminate implicit biases present in them that arise during pretraining. In +this paper, we introduce PEFTDebias, a novel approach that employs +parameter-efficient fine-tuning (PEFT) to mitigate the biases within foundation +models. PEFTDebias consists of two main phases: an upstream phase for acquiring +debiasing parameters along a specific bias axis, and a downstream phase where +these parameters are incorporated into the model and frozen during the +fine-tuning process. By evaluating on four datasets across two bias axes namely +gender and race, we find that downstream biases can be effectively reduced with +PEFTs. In addition, we show that these parameters possess axis-specific +debiasing characteristics, enabling their effective transferability in +mitigating biases in various downstream tasks. To ensure reproducibility, we +release the code to do our experiments. + +
+
+ comment: EMNLP 2023 +
+
+
+
+
+ + ☆ From Mutual Information to Expected Dynamics: New Generalization Bounds + for Heavy-Tailed SGD NeurIPS 2023 + + +
+ Understanding the generalization abilities of modern machine learning +algorithms has been a major research topic over the past decades. In recent +years, the learning dynamics of Stochastic Gradient Descent (SGD) have been +related to heavy-tailed dynamics. This has been successfully applied to +generalization theory by exploiting the fractal properties of those dynamics. +However, the derived bounds depend on mutual information (decoupling) terms +that are beyond the reach of computability. In this work, we prove +generalization bounds over the trajectory of a class of heavy-tailed dynamics, +without those mutual information terms. Instead, we introduce a geometric +decoupling term by comparing the learning dynamics (depending on the empirical +risk) with an expected one (depending on the population risk). We further +upper-bound this geometric term, by using techniques from the heavy-tailed and +the fractal literature, making it fully computable. Moreover, as an attempt to +tighten the bounds, we propose a PAC-Bayesian setting based on perturbed +dynamics, in which the same geometric term plays a crucial role and can still +be bounded using the techniques described above. + +
+
+ comment: Accepted in the NeurIPS 2023 Workshop Heavy Tails in Machine Learning +
+
+
+
+
+ + ☆ A framework for mining lifestyle profiles through multi-dimensional and + high-order mobility feature clustering + + +
+ Human mobility demonstrates a high degree of regularity, which facilitates +the discovery of lifestyle profiles. Existing research has yet to fully utilize +the regularities embedded in high-order features extracted from human mobility +records in such profiling. This study proposes a progressive feature extraction +strategy that mines high-order mobility features from users' moving trajectory +records from the spatial, temporal, and semantic dimensions. Specific features +are extracted such as travel motifs, rhythms decomposed by discrete Fourier +transform (DFT) of mobility time series, and vectorized place semantics by +word2vec, respectively to the three dimensions, and they are further clustered +to reveal the users' lifestyle characteristics. An experiment using a +trajectory dataset of over 500k users in Shenzhen, China yields seven user +clusters with different lifestyle profiles that can be well interpreted by +common sense. The results suggest the possibility of fine-grained user +profiling through cross-order trajectory feature engineering and clustering. + +
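+ As one concrete piece of the pipeline, the temporal "rhythm" features can be obtained from a discrete Fourier transform of a user's visit-count time series, keeping the dominant periodic components (one would expect the 24-hour and 7-day cycles to surface). The hour-level resolution and the toy series below are illustrative assumptions, not the study's data.

```python
import numpy as np

def rhythm_features(hourly_counts: np.ndarray, top_k: int = 3):
    """Return the dominant periods (in hours) and their amplitudes via the DFT."""
    spectrum = np.fft.rfft(hourly_counts - hourly_counts.mean())
    freqs = np.fft.rfftfreq(hourly_counts.size, d=1.0)    # cycles per hour
    amps = np.abs(spectrum)
    idx = np.argsort(amps)[::-1][:top_k]
    idx = idx[freqs[idx] > 0]                              # ignore the DC bin
    return [(1.0 / freqs[i], amps[i]) for i in idx]

# Toy four-week hourly series with daily and weekly regularity plus noise.
rng = np.random.default_rng(0)
hours = np.arange(24 * 7 * 4)
counts = 2 + np.sin(2 * np.pi * hours / 24) + 0.5 * np.sin(2 * np.pi * hours / (24 * 7))
counts += 0.2 * rng.normal(size=hours.size)

for period, amp in rhythm_features(counts):
    print(f"period ~ {period:.1f} h, amplitude {amp:.1f}")
```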
+
+
+
+
+ + ☆ A Causality-Aware Pattern Mining Scheme for Group Activity Recognition + in a Pervasive Sensor Space + + +
+ Human activity recognition (HAR) is a key challenge in pervasive computing +and its solutions have been presented based on various disciplines. +Specifically, for HAR in a smart space without privacy and accessibility +issues, data streams generated by deployed pervasive sensors are leveraged. In +this paper, we focus on a group activity by which a group of users perform a +collaborative task without user identification and propose an efficient group +activity recognition scheme which extracts causality patterns from pervasive +sensor event sequences generated by a group of users to achieve recognition +accuracy as good as that of the state-of-the-art graphical model. To filter out +irrelevant noise events from a given data stream, a set of rules is leveraged +to highlight causally related events. Then, a pattern-tree algorithm extracts +frequent causal patterns by means of a growing tree structure. Based on the +extracted patterns, a weighted sum-based pattern matching algorithm computes +the likelihoods of stored group activities to the given test event sequence by +means of matched event pattern counts for group activity recognition. We +evaluate the proposed scheme using the data collected from our testbed and +CASAS datasets where users perform their tasks on a daily basis and validate +its effectiveness in a real environment. Experimental results show that the +proposed scheme achieves higher recognition accuracy with only a small amount of +runtime overhead compared to the existing schemes. + +
+
+
+
+
+ + ☆ GFN-SR: Symbolic Regression with Generative Flow Networks NeurIPS 2023 + + +
+ Symbolic regression (SR) is an area of interpretable machine learning that +aims to identify mathematical expressions, often composed of simple functions, +that best fit in a given set of covariates $X$ and response $y$. In recent +years, deep symbolic regression (DSR) has emerged as a popular method in the +field by leveraging deep reinforcement learning to solve the complicated +combinatorial search problem. In this work, we propose an alternative framework +(GFN-SR) to approach SR with deep learning. We model the construction of an +expression tree as traversing through a directed acyclic graph (DAG) so that +GFlowNet can learn a stochastic policy to generate such trees sequentially. +Enhanced with an adaptive reward baseline, our method is capable of generating +a diverse set of best-fitting expressions. Notably, we observe that GFN-SR +outperforms other SR algorithms in noisy data regimes, owing to its ability to +learn a distribution of rewards over a space of candidate solutions. + +
+
+ comment: Accepted by the NeurIPS 2023 AI4Science Workshop +
+
+
+
+
+ + ☆ LinguaLinked: A Distributed Large Language Model Inference System for + Mobile Devices + + +
+ Deploying Large Language Models (LLMs) locally on mobile devices presents a +significant challenge due to their extensive memory requirements. In this +paper, we introduce LinguaLinked, a system for decentralized, distributed LLM +inference on mobile devices. LinguaLinked enables collaborative execution of +the inference task across multiple trusted devices. LinguaLinked ensures data +privacy by processing information locally. LinguaLinked uses three key +strategies. First, an optimized model assignment technique segments LLMs and +uses linear optimization to align segments with each device's capabilities. +Second, an optimized data transmission mechanism ensures efficient and +structured data flow between model segments while also maintaining the +integrity of the original model structure. Finally, LinguaLinked incorporates a +runtime load balancer that actively monitors and redistributes tasks among +mobile devices to prevent bottlenecks, enhancing the system's overall +efficiency and responsiveness. We demonstrate that LinguaLinked facilitates +efficient LLM inference while maintaining consistent throughput and minimal +latency through extensive testing across various mobile devices, from high-end +to low-end Android devices. In our evaluations, compared to the baseline, +LinguaLinked achieves an inference performance acceleration of $1.11\times$ to +$1.61\times$ in single-threaded settings, $1.73\times$ to $2.65\times$ with +multi-threading. Additionally, runtime load balancing yields an overall +inference acceleration of $1.29\times$ to $1.32\times$. + +
+
+ comment: 16 pages, 8 figures +
+
+
+
+
+ + ☆ Local monotone operator learning using non-monotone operators: MnM-MOL + + +
+ The recovery of magnetic resonance (MR) images from undersampled measurements +is a key problem that has seen extensive research in recent years. Unrolled +approaches, which rely on end-to-end training of convolutional neural network +(CNN) blocks within iterative reconstruction algorithms, offer state-of-the-art +performance. These algorithms require a large amount of memory during training, +making them difficult to employ in high-dimensional applications. Deep +equilibrium (DEQ) models and the recent monotone operator learning (MOL) +approach were introduced to eliminate the need for unrolling, thus reducing the +memory demand during training. Both approaches require a Lipschitz constraint +on the network to ensure that the forward and backpropagation iterations +converge. Unfortunately, the constraint often results in reduced performance +compared to unrolled methods. The main focus of this work is to relax the +constraint on the CNN block in two different ways. Inspired by +convex-non-convex regularization strategies, we now impose the monotone +constraint on the sum of the gradient of the data term and the CNN block, +rather than constrain the CNN itself to be a monotone operator. This approach +enables the CNN to learn possibly non-monotone score functions, which can +translate to improved performance. In addition, we only restrict the operator +to be monotone in a local neighborhood around the image manifold. Our +theoretical results show that the proposed algorithm is guaranteed to converge +to the fixed point and that the solution is robust to input perturbations, +provided that it is initialized close to the true solution. Our empirical +results show that the relaxed constraints translate to improved performance and +that the approach enjoys robustness to input perturbations similar to MOL. + +
+
+ comment: 10 pages, 7 figures +
+
+
+
+
+ + ☆ Optimal Sample Complexity of Contrastive Learning + + +
+ Contrastive learning is a highly successful technique for learning +representations of data from labeled tuples, specifying the distance relations +within the tuple. We study the sample complexity of contrastive learning, i.e. +the minimum number of labeled tuples sufficient for getting high generalization +accuracy. We give tight bounds on the sample complexity in a variety of +settings, focusing on arbitrary distance functions, both general +$\ell_p$-distances, and tree metrics. Our main result is an (almost) optimal +bound on the sample complexity of learning $\ell_p$-distances for integer $p$. +For any $p \ge 1$ we show that $\tilde \Theta(\min(nd,n^2))$ labeled tuples are +necessary and sufficient for learning $d$-dimensional representations of +$n$-point datasets. Our results hold for an arbitrary distribution of the input +samples and are based on giving the corresponding bounds on the +Vapnik-Chervonenkis/Natarajan dimension of the associated problems. We further +show that the theoretical bounds on sample complexity obtained via VC/Natarajan +dimension can have strong predictive power for experimental results, in +contrast with the folklore belief about a substantial gap between the +statistical learning theory and the practice of deep learning. + +
+
+
+
+
+ + ☆ Streaming Bayesian Modeling for predicting Fat-Tailed Customer Lifetime + Value + + +
+ We develop an online learning MCMC approach applicable to hierarchical +Bayesian models and GLMs. We also develop a fat-tailed LTV model that +generalizes over several kinds of fat and thin tails. We demonstrate both +developments on commercial LTV data from a large mobile app. +
+
+ comment: Work in progress +
+
+
+
+
+ + ☆ Benchmarking Multi-Domain Active Learning on Image Classification + + +
+ Active learning aims to enhance model performance by strategically labeling +informative data points. While extensively studied, its effectiveness on +large-scale, real-world datasets remains underexplored. Existing research +primarily focuses on single-source data, ignoring the multi-domain nature of +real-world data. We introduce a multi-domain active learning benchmark to +bridge this gap. Our benchmark demonstrates that traditional single-domain +active learning strategies are often less effective than random selection in +multi-domain scenarios. We also introduce CLIP-GeoYFCC, a novel large-scale +image dataset built around geographical domains, in contrast to existing +genre-based domain datasets. Analysis on our benchmark shows that all +multi-domain strategies exhibit significant tradeoffs, with no strategy +outperforming across all datasets or all metrics, emphasizing the need for +future research. + +
+
+
+
+
+ + ☆ Dancing with Images: Video Distillation via Static-Dynamic + Disentanglement + + +
+ Recently, dataset distillation has paved the way towards efficient machine +learning, especially for image datasets. However, the distillation for videos, +characterized by an exclusive temporal dimension, remains an underexplored +domain. In this work, we provide the first systematic study of video +distillation and introduce a taxonomy to categorize temporal compression. Our +investigation reveals that the temporal information is usually not well learned +during distillation, and the temporal dimension of synthetic data contributes +little. The observations motivate our unified framework of disentangling the +dynamic and static information in the videos. It first distills the videos into +still images as static memory and then compensates for the dynamic and motion +information with a learnable dynamic memory block. Our method achieves +state-of-the-art results on video datasets at different scales, with notably smaller +storage expenditure. Our code will be publicly available. +
+
+
+
+
+ + ☆ Temperature Balancing, Layer-wise Weight Analysis, and Neural Network + Training NeurIPS 2023 + + +
+ Regularization in modern machine learning is crucial, and it can take various +forms in algorithmic design: training set, model family, error function, +regularization terms, and optimizations. In particular, the learning rate, +which can be interpreted as a temperature-like parameter within the statistical +mechanics of learning, plays a crucial role in neural network training. Indeed, +many widely adopted training strategies basically just define the decay of the +learning rate over time. This process can be interpreted as decreasing a +temperature, using either a global learning rate (for the entire model) or a +learning rate that varies for each parameter. This paper proposes TempBalance, +a straightforward yet effective layer-wise learning rate method. TempBalance is +based on Heavy-Tailed Self-Regularization (HT-SR) Theory, an approach which +characterizes the implicit self-regularization of different layers in trained +models. We demonstrate the efficacy of using HT-SR-motivated metrics to guide +the scheduling and balancing of temperature across all network layers during +model training, resulting in improved performance during testing. We implement +TempBalance on CIFAR10, CIFAR100, SVHN, and TinyImageNet datasets using +ResNets, VGGs, and WideResNets with various depths and widths. Our results show +that TempBalance significantly outperforms ordinary SGD and carefully-tuned +spectral norm regularization. We also show that TempBalance outperforms a +number of state-of-the-art optimizers and learning rate schedulers. + +
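As a rough illustration of a layer-wise, spectrum-driven learning-rate scheme, the sketch below estimates a heavy-tail exponent for each weight matrix with a crude Hill estimator and rescales a base learning rate by each layer's exponent relative to the mean. The estimator, the `k_frac` tail fraction, and the alpha-to-learning-rate mapping are assumptions made for illustration; the paper's HT-SR fit and the actual TempBalance schedule may differ.

```python
import numpy as np

def hill_alpha(weight, k_frac=0.5):
    """Crude heavy-tail exponent of the eigenvalue spectrum of W W^T via a
    Hill estimator (a rough stand-in for an HT-SR power-law fit)."""
    eig = np.linalg.eigvalsh(weight @ weight.T)
    eig = np.sort(eig[eig > 1e-12])[::-1]          # positive eigenvalues, descending
    k = max(2, int(len(eig) * k_frac))
    tail = eig[:k]
    return 1.0 + k / np.sum(np.log(tail / tail[-1]))

def layerwise_lrs(weights, base_lr=0.1):
    """Scale each layer's learning rate by its exponent relative to the mean
    (one plausible layer-wise mapping; not necessarily the paper's)."""
    alphas = np.array([hill_alpha(w) for w in weights])
    return base_lr * alphas / alphas.mean()

rng = np.random.default_rng(0)
weights = [rng.normal(size=(64, 128)) for _ in range(4)]
print(layerwise_lrs(weights))
```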
+
+ comment: NeurIPS 2023 Spotlight, first two authors contributed equally +
+
+
+
+
+ + ☆ Impact of Data Augmentation on QCNNs + + +
+ In recent years, Classical Convolutional Neural Networks (CNNs) have been +applied to image recognition successfully. Quantum Convolutional Neural +Networks (QCNNs) are proposed as a novel generalization to CNNs by using +quantum mechanisms. The quantum mechanisms lead to an efficient training +process in QCNNs by reducing the size of the input from $N$ to $\log_2 N$. This paper +implements and compares both CNNs and QCNNs by testing losses and prediction +accuracy on three commonly used datasets. The datasets include the MNIST +hand-written digits, Fashion MNIST, and cat/dog face images. Additionally, data +augmentation (DA), a technique commonly used in CNNs to improve the performance +of classification by generating similar images based on original inputs, is +also implemented in QCNNs. Surprisingly, the results showed that data +augmentation did not improve QCNNs' performance. The reasons and logic behind +this result are discussed, hoping to expand our understanding of quantum +machine learning theory. +
+
+ comment: 14 pages, 9 figures +
+
+
+
+
+ + ☆ A Generalizable Deep Learning System for Cardiac MRI + + +
+ Cardiac MRI allows for a comprehensive assessment of myocardial structure, +function, and tissue characteristics. Here we describe a foundational vision +system for cardiac MRI, capable of representing the breadth of human +cardiovascular disease and health. Our deep learning model is trained via +self-supervised contrastive learning, by which visual concepts in cine-sequence +cardiac MRI scans are learned from the raw text of the accompanying radiology +reports. We train and evaluate our model on data from four large academic +clinical institutions in the United States. We additionally showcase the +performance of our models on the UK BioBank, and two additional publicly +available external datasets. We explore emergent zero-shot capabilities of our +system, and demonstrate remarkable performance across a range of tasks; +including the problem of left ventricular ejection fraction regression, and the +diagnosis of 35 different conditions such as cardiac amyloidosis and +hypertrophic cardiomyopathy. We show that our deep learning system is capable +of not only understanding the staggering complexity of human cardiovascular +disease, but can be directed towards clinical problems of interest yielding +impressive, clinical grade diagnostic accuracy with a fraction of the training +data typically required for such tasks. + +
+
+ comment: 21 page main manuscript, 4 figures. Supplementary Appendix and code + will be made available on publication +
+
+
+
+
+ + ☆ Transfer learning for predicting source terms of principal component + transport in chemically reactive flow + + +
+ The objective of this study is to evaluate whether the number of requisite +training samples can be reduced with the use of various transfer learning +models for predicting, for example, the chemical source terms of the +data-driven reduced-order model that represents the homogeneous ignition +process of a hydrogen/air mixture. Principal component analysis is applied to +reduce the dimensionality of the hydrogen/air mixture in composition space. +Artificial neural networks (ANNs) are used to tabulate the reaction rates of +principal components, and subsequently, a system of ordinary differential +equations is solved. As the number of training samples decreases at the target +task (i.e., for $T_0$ > 1000 K and various $\phi$), the reduced-order model fails to +predict the ignition evolution of a hydrogen/air mixture. Three transfer +learning strategies are then applied to the training of the ANN model with a +sparse dataset. The performance of the reduced-order model with a sparse +dataset is found to be remarkably enhanced if the training of the ANN model is +restricted by a regularization term that controls the degree of knowledge +transfer from source to target tasks. To this end, a novel transfer learning +method is introduced, parameter control via partial initialization and +regularization (PaPIR), whereby the amount of knowledge transferred is +systematically adjusted for the initialization and regularization of the ANN +model in the target task. It is found that an additional performance gain can +be achieved by changing the initialization scheme of the ANN model in the +target task when the task similarity between source and target tasks is +relatively low. +
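A minimal sketch of the "partial initialization plus regularization" idea, under the assumption that it amounts to copying part of a source network into the target network and penalizing deviation of the target weights from the source weights with a strength `lam`. The architecture, `lam`, and `papir_style_loss` are illustrative, not the authors' implementation.

```python
import torch
import torch.nn as nn

def make_mlp():
    return nn.Sequential(nn.Linear(8, 64), nn.Tanh(), nn.Linear(64, 4))

source = make_mlp()                       # assumed pretrained on the source task
target = make_mlp()

# Partial initialization: copy only the first layer from the source model.
target[0].load_state_dict(source[0].state_dict())

lam = 1e-2                                # controls the degree of knowledge transfer
opt = torch.optim.Adam(target.parameters(), lr=1e-3)
mse = nn.MSELoss()

def papir_style_loss(pred, y):
    """Task loss plus a penalty keeping target weights near the source weights."""
    penalty = sum(((pt - ps.detach()) ** 2).sum()
                  for pt, ps in zip(target.parameters(), source.parameters()))
    return mse(pred, y) + lam * penalty

x, y = torch.randn(32, 8), torch.randn(32, 4)   # sparse target-task data
for _ in range(10):
    opt.zero_grad()
    loss = papir_style_loss(target(x), y)
    loss.backward()
    opt.step()
print(float(loss))
```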
+
+ comment: 41 pages, 14 figures +
+
+
+
+
+ + ☆ Quantum Kernel t-Distributed Stochastic Neighbor Embedding + + +
+ Data visualization is important in understanding the characteristics of data +that are difficult to see directly. It is used to visualize loss landscapes and +optimization trajectories to analyze optimization performance. Popular +optimization analysis is performed by visualizing a loss landscape around the +reached local or global minimum using principal component analysis. However, +this visualization depends on the variational parameters of a quantum circuit +rather than quantum states, which makes it difficult to understand the +mechanism of optimization process through the property of quantum states. Here, +we propose a quantum data visualization method using quantum kernels, which +enables us to offer fast and highly accurate visualization of quantum states. +In our numerical experiments, we visualize hand-written digits dataset and +apply $k$-nearest neighbor algorithm to the low-dimensional data to +quantitatively evaluate our proposed method compared with a classical kernel +method. As a result, our proposed method achieves comparable accuracy to the +state-of-the-art classical kernel method, meaning that the proposed +visualization method based on quantum machine learning does not degrade the +separability of the input higher dimensional data. Furthermore, we visualize +the optimization trajectories of finding the ground states of transverse field +Ising model and successfully find the trajectory characteristics. Since quantum +states are higher dimensional objects that can only be seen via observables, +our visualization method, which inherits the similarity of quantum data, would +be useful in understanding the behavior of quantum circuits and algorithms. + +
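The visualization pipeline can be approximated classically: build a kernel matrix (an RBF kernel stands in here for the quantum kernel), convert it to a distance matrix, run t-SNE with a precomputed metric, and score a k-NN classifier on the embedding as the quantitative check. The kernel choice, `gamma`, and the 500-sample subset are assumptions for a quick sketch, not the paper's setup.

```python
import numpy as np
from sklearn.datasets import load_digits
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# A classical RBF kernel stands in for the quantum kernel in this sketch.
X, y = load_digits(return_X_y=True)
X, y = X[:500], y[:500]

K = rbf_kernel(X, gamma=1e-3)
diag = np.diag(K)
# Kernel-induced distance: d_ij = sqrt(k_ii + k_jj - 2 k_ij)
D = np.sqrt(np.maximum(diag[:, None] + diag[None, :] - 2.0 * K, 0.0))

emb = TSNE(n_components=2, metric="precomputed", init="random",
           random_state=0).fit_transform(D)

# Quantitative check, as in the paper: k-NN accuracy on the low-dim embedding.
acc = cross_val_score(KNeighborsClassifier(n_neighbors=5), emb, y, cv=5).mean()
print(f"kNN accuracy on kernel t-SNE embedding: {acc:.3f}")
```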
+
+ comment: 10 pages, 8 figures, 2 tables
+
+
+
+
+ + ☆ TRC: Trust Region Conditional Value at Risk for Safe Reinforcement + Learning ICRA 2022 + + +
+ As safety is of paramount importance in robotics, reinforcement learning that +reflects safety, called safe RL, has been studied extensively. In safe RL, we +aim to find a policy which maximizes the desired return while satisfying the +defined safety constraints. There are various types of constraints, among which +constraints on conditional value at risk (CVaR) effectively lower the +probability of failures caused by high costs since CVaR is a conditional +expectation obtained above a certain percentile. In this paper, we propose a +trust region-based safe RL method with CVaR constraints, called TRC. We first +derive the upper bound on CVaR and then approximate the upper bound in a +differentiable form in a trust region. Using this approximation, a subproblem +to get policy gradients is formulated, and policies are trained by iteratively +solving the subproblem. TRC is evaluated through safe navigation tasks in +simulations with various robots and a sim-to-real environment with a Jackal +robot from Clearpath. Compared to other safe RL methods, the performance is +improved by 1.93 times while the constraints are satisfied in all experiments. + +
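For readers unfamiliar with the constrained quantity, the snippet below computes one common empirical estimate of CVaR: the mean cost over the worst alpha-fraction of outcomes, i.e. the conditional expectation above the (1 - alpha) quantile. The exponential cost distribution and the safety budget `d` are illustrative assumptions.

```python
import numpy as np

def cvar(costs, alpha=0.1):
    """Empirical CVaR_alpha: mean cost over the worst alpha-fraction of outcomes."""
    costs = np.sort(np.asarray(costs))
    var = np.quantile(costs, 1.0 - alpha)   # Value-at-Risk threshold
    tail = costs[costs >= var]
    return tail.mean()

rng = np.random.default_rng(0)
episode_costs = rng.exponential(scale=1.0, size=10_000)

d = 3.5                                      # toy safety budget
print("CVaR_0.1 =", cvar(episode_costs, alpha=0.1))
print("constraint satisfied:", cvar(episode_costs, alpha=0.1) <= d)
```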
+
+ comment: RA-L and ICRA 2022 +
+
+
+
+
+ + ☆ Efficient Off-Policy Safe Reinforcement Learning Using Trust Region + Conditional Value at Risk IROS 2022 + + +
+ This paper aims to solve a safe reinforcement learning (RL) problem with risk +measure-based constraints. As risk measures, such as conditional value at risk +(CVaR), focus on the tail distribution of cost signals, constraining risk +measures can effectively prevent a failure in the worst case. An on-policy safe +RL method, called TRC, deals with a CVaR-constrained RL problem using a trust +region method and can generate policies with almost zero constraint violations +with high returns. However, to achieve outstanding performance in complex +environments and satisfy safety constraints quickly, RL methods are required to +be sample efficient. To this end, we propose an off-policy safe RL method with +CVaR constraints, called off-policy TRC. If off-policy data from replay buffers +is directly used to train TRC, the estimation error caused by the +distributional shift results in performance degradation. To resolve this issue, +we propose novel surrogate functions, in which the effect of the distributional +shift can be reduced, and introduce an adaptive trust-region constraint to +ensure a policy not to deviate far from replay buffers. The proposed method has +been evaluated in simulation and real-world environments and satisfied safety +constraints within a few steps while achieving high returns even in complex +robotic tasks. + +
+
+ comment: RA-L and IROS 2022 +
+
+
+
+
+ + ☆ Hypergraph Node Representation Learning with One-Stage Message Passing + + +
+ Hypergraphs as an expressive and general structure have attracted +considerable attention from various research domains. Most existing hypergraph +node representation learning techniques are based on graph neural networks, and +thus adopt the two-stage message passing paradigm (i.e. node -> hyperedge -> +node). This paradigm only focuses on local information propagation and does not +effectively take into account global information, resulting in less optimal +representations. Our theoretical analysis of representative two-stage message +passing methods shows that, mathematically, they model different ways of local +message passing through hyperedges, and can be unified into one-stage message +passing (i.e. node -> node). However, they still only model local information. +Motivated by this theoretical analysis, we propose a novel one-stage message +passing paradigm to model both global and local information propagation for +hypergraphs. We integrate this paradigm into HGraphormer, a Transformer-based +framework for hypergraph node representation learning. HGraphormer injects the +hypergraph structure information (local information) into Transformers (global +information) by combining the attention matrix and hypergraph Laplacian. +Extensive experiments demonstrate that HGraphormer outperforms recent +hypergraph learning methods on five representative benchmark datasets on the +semi-supervised hypernode classification task, setting new state-of-the-art +performance, with accuracy improvements between 2.52% and 6.70%. Our code and +datasets are available. + +
+
+ comment: 11 pages +
+
+
+
+
+ + ☆ Improving Normalization with the James-Stein Estimator + + +
+ Stein's paradox holds considerable sway in high-dimensional statistics, +highlighting that the sample mean, traditionally considered the de facto +estimator, might not be the most efficacious in higher dimensions. To address +this, the James-Stein estimator proposes an enhancement by steering the sample +means toward a more centralized mean vector. In this paper, first, we establish +that normalization layers in deep learning use inadmissible estimators for mean +and variance. Next, we introduce a novel method to employ the James-Stein +estimator to improve the estimation of mean and variance within normalization +layers. We evaluate our method on different computer vision tasks: image +classification, semantic segmentation, and 3D object classification. Through +these evaluations, it is evident that our improved normalization layers +consistently yield superior accuracy across all tasks without extra +computational burden. Moreover, recognizing that a plethora of shrinkage +estimators surpass the traditional estimator in performance, we study two other +prominent shrinkage estimators: Ridge and LASSO. Additionally, we provide +visual representations to intuitively demonstrate the impact of shrinkage on +the estimated layer statistics. Finally, we study the effect of regularization +and batch size on our modified batch normalization. The studies show that our +method is less sensitive to batch size and regularization, improving accuracy +under various setups. + +
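A minimal sketch of one way to plug a James-Stein-style estimator into a normalization layer: shrink the per-channel batch means toward their grand mean before normalizing. The shrinkage variant used here (Efron-Morris style, with a `c - 3` factor) and the variance handling are assumptions for illustration; the paper's estimator also treats the variance statistic and studies other shrinkage estimators.

```python
import torch

def js_shrunk_mean(x, eps=1e-5):
    """Shrink per-channel batch means toward their grand mean with a
    James-Stein-style factor (one simple variant, not the paper's exact one)."""
    n, c = x.shape                                # x: (N, C) activations
    mean = x.mean(dim=0)                          # per-channel sample means
    grand = mean.mean()
    sigma2 = x.var(dim=0, unbiased=True).mean() / n   # variance of a sample mean
    dist2 = ((mean - grand) ** 2).sum() + eps
    shrink = torch.clamp(1.0 - (c - 3) * sigma2 / dist2, min=0.0)
    return grand + shrink * (mean - grand)

def js_normalize(x):
    mu = js_shrunk_mean(x)
    var = x.var(dim=0, unbiased=False)
    return (x - mu) / torch.sqrt(var + 1e-5)

x = torch.randn(64, 128) * 2.0 + 0.5
z = js_normalize(x)
print(z.mean().item(), z.std().item())            # roughly centred, unit scale
```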
+
+
+
+
+ + ☆ Multiple Testing of Linear Forms for Noisy Matrix Completion + + +
+ Many important tasks of large-scale recommender systems can be naturally cast +as testing multiple linear forms for noisy matrix completion. These problems, +however, present unique challenges because of the subtle bias-and-variance +tradeoff and an intricate dependence among the estimated entries induced by +the low-rank structure. In this paper, we develop a general approach to +overcome these difficulties by introducing new statistics for individual tests +with sharp asymptotics both marginally and jointly, and utilizing them to +control the false discovery rate (FDR) via a data splitting and symmetric +aggregation scheme. We show that valid FDR control can be achieved with +guaranteed power under nearly optimal sample size requirements using the +proposed methodology. Extensive numerical simulations and real data examples +are also presented to further illustrate its practical merits. +
+
+
+
+
+ + ☆ Developmental Pretraining (DPT) for Image Classification Networks + + +
+ Against the backdrop of the increasing data requirements of Deep Neural Networks for +object recognition, which are growing more untenable by the day, we present +Developmental PreTraining (DPT) as a possible solution. DPT is a +curriculum-based pre-training approach designed to rival traditional +pre-training techniques that are data-hungry. These traditional approaches also +introduce unnecessary features that could be misleading when the network is +employed in a downstream classification task where the data is sufficiently +different from the pre-training data and is scarce. We design the curriculum +for DPT by drawing inspiration from human infant visual development. DPT +employs a phased approach where carefully-selected primitive and universal +features like edges and shapes are taught to the network participating in our +pre-training regime. A model that underwent the DPT regime is tested against +models with randomised weights to evaluate the viability of DPT. +
+
+ comment: 7 pages, 4 figures +
+
+
+
+
+ + ☆ Towards Aligned Canonical Correlation Analysis: Preliminary Formulation + and Proof-of-Concept Results + + +
+ Canonical Correlation Analysis (CCA) has been widely applied to jointly embed +multiple views of data in a maximally correlated latent space. However, the +alignment between various data perspectives, which is required by traditional +approaches, is unclear in many practical cases. In this work we propose a new +framework Aligned Canonical Correlation Analysis (ACCA), to address this +challenge by iteratively solving the alignment and multi-view embedding. + +
+
+ comment: 4 pages, 7 figures, MLG 2023 +
+
+
+
+
+ + ☆ Learning to forecast diagnostic parameters using pre-trained weather + embedding NeurIPS 2023 + + +
+ Data-driven weather prediction (DDWP) models are increasingly becoming +popular for weather forecasting. However, while operational weather forecasts +predict a wide variety of weather variables, DDWPs currently forecast a +specific set of key prognostic variables. Non-prognostic ("diagnostic") +variables are sometimes modeled separately as dependent variables of the +prognostic variables (cf. FourCastNet), or by including the diagnostic +variable as a target in the DDWP. However, the cost of training and deploying +bespoke models for each diagnostic variable can increase dramatically with more +diagnostic variables, and limit the operational use of such models. Likewise, +retraining an entire DDWP each time a new diagnostic variable is added is also +cost-prohibitive. We present a two-stage approach that allows new diagnostic +variables to be added to an end-to-end DDWP model without the expensive +retraining. In the first stage, we train an autoencoder that learns to embed +prognostic variables into a latent space. In the second stage, the autoencoder +is frozen and "downstream" models are trained to predict diagnostic variables +using only the latent representations of prognostic variables as input. Our +experiments indicate that models trained using the two-stage approach offer +accuracy comparable to training bespoke models, while leading to a significant +reduction in resource utilization during training and inference. This approach +allows for new "downstream" models to be developed as needed, without affecting +existing models and thus reducing the friction in operationalizing new models. +
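The two-stage recipe is easy to sketch: freeze a trained encoder over prognostic variables and fit only a small downstream head on the frozen latents for each new diagnostic variable. The toy dimensions, architecture, and synthetic target below are illustrative assumptions, not the paper's configuration.

```python
import torch
import torch.nn as nn

# Stage 1: autoencoder over prognostic variables (assumed already trained; frozen here).
encoder = nn.Sequential(nn.Linear(10, 32), nn.GELU(), nn.Linear(32, 8))
for p in encoder.parameters():
    p.requires_grad_(False)

# Stage 2: a lightweight "downstream" head maps frozen latents to a new
# diagnostic variable, so the encoder never needs retraining.
head = nn.Sequential(nn.Linear(8, 32), nn.GELU(), nn.Linear(32, 1))
opt = torch.optim.Adam(head.parameters(), lr=1e-3)

prognostic = torch.randn(256, 10)                            # toy prognostic fields
diagnostic = prognostic[:, :2].pow(2).sum(1, keepdim=True)   # toy diagnostic target

for _ in range(200):
    with torch.no_grad():
        z = encoder(prognostic)          # frozen latent representation
    loss = nn.functional.mse_loss(head(z), diagnostic)
    opt.zero_grad()
    loss.backward()
    opt.step()
print(float(loss))
```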
+
+ comment: Accepted as a spotlight paper at the NeurIPS 2023 workshop on + Tackling Climate Change with Machine Learning +
+
+
+
+
+ + ☆ Age-Based Scheduling for Mobile Edge Computing: A Deep Reinforcement + Learning Approach + + +
+ With the rapid development of Mobile Edge Computing (MEC), various real-time +applications have been deployed to benefit people's daily lives. The +performance of these applications relies heavily on the freshness of collected +environmental information, which can be quantified by its Age of Information +(AoI). In the traditional definition of AoI, it is assumed that the status +information can be actively sampled and directly used. However, for many +MEC-enabled applications, the desired status information is updated in an +event-driven manner and necessitates data processing. To better serve these +applications, we propose a new definition of AoI and, based on the redefined +AoI, we formulate an online AoI minimization problem for MEC systems. Notably, +the problem can be interpreted as a Markov Decision Process (MDP), thus +enabling its solution through Reinforcement Learning (RL) algorithms. +Nevertheless, the traditional RL algorithms are designed for MDPs with +completely unknown system dynamics and hence usually suffer long convergence +times. To accelerate the learning process, we introduce Post-Decision States +(PDSs) to exploit the partial knowledge of the system's dynamics. We also +combine PDSs with deep RL to further improve the algorithm's applicability, +scalability, and robustness. Numerical results demonstrate that our algorithm +outperforms the benchmarks under various scenarios. + +
+
+
+
+
+ + ☆ Text Attribute Control via Closed-Loop Disentanglement ACL 2023 + + +
+ Changing an attribute of a text without changing the content usually requires +first disentangling the text into irrelevant attributes and content +representations. After that, in the inference phase, the representation of one +attribute is tuned to a different value, expecting that the corresponding +attribute of the text can also be changed accordingly. The usual way of +disentanglement is to add some constraints on the latent space of an +encoder-decoder architecture, including adversarial-based constraints and +mutual-information-based constraints. However, the previous semi-supervised +processes of attribute change are usually not enough to guarantee the success +of attribute change and content preservation. In this paper, we propose a novel +approach to achieve a robust control of attributes while enhancing content +preservation. In this approach, we use a semi-supervised contrastive learning +method to encourage the disentanglement of attributes in latent spaces. +Unlike previous works, we re-disentangle the reconstructed sentence +and compare the re-disentangled latent space with the original latent space, +which forms a closed-loop disentanglement process. This also helps content +preservation. In addition, the contrastive learning method is also able to +replace the role of minimizing mutual information and adversarial training in +the disentanglement process, which reduces the computational cost. We +conducted experiments on three text datasets, including the Yelp Service review +dataset, the Amazon Product review dataset, and the GoEmotions dataset. The +experimental results show the effectiveness of our model. +
+
+ comment: accepted by TACL 2023 +
+
+
+
+
+ + ☆ Automating Continual Learning + + +
+ General-purpose learning systems should improve themselves in open-ended +fashion in ever-changing environments. Conventional learning algorithms for +neural networks, however, suffer from catastrophic forgetting (CF) -- +previously acquired skills are forgotten when a new task is learned. Instead of +hand-crafting new algorithms for avoiding CF, we propose Automated Continual +Learning (ACL) to train self-referential neural networks to meta-learn their +own in-context continual (meta-)learning algorithms. ACL encodes all desiderata +-- good performance on both old and new tasks -- into its meta-learning +objectives. Our experiments demonstrate that ACL effectively solves "in-context +catastrophic forgetting"; our ACL-learned algorithms outperform hand-crafted +ones, e.g., on the Split-MNIST benchmark in the replay-free setting, and +enables continual learning of diverse tasks consisting of multiple few-shot and +standard image classification datasets. + +
+
+
+
+
+ + ☆ Towards Clinical Prediction with Transparency: An Explainable AI + Approach to Survival Modelling in Residential Aged Care + + +
+ Background: Accurate survival time estimates aid end-of-life medical +decision-making. Objectives: Develop an interpretable survival model for +elderly residential aged care residents using advanced machine learning. +Setting: A major Australasian residential aged care provider. Participants: +Residents aged 65+ admitted for long-term care from July 2017 to August 2023. +Sample size: 11,944 residents across 40 facilities. Predictors: Factors include +age, gender, health status, co-morbidities, cognitive function, mood, +nutrition, mobility, smoking, sleep, skin integrity, and continence. Outcome: +Probability of survival post-admission, specifically calibrated for 6-month +survival estimates. Statistical Analysis: Tested CoxPH, EN, RR, Lasso, GB, XGB, +and RF models in 20 experiments with a 90/10 train/test split. Evaluated +accuracy using C-index, Harrell's C-index, dynamic AUROC, IBS, and calibrated +ROC. Chose XGB for its performance and calibrated it for 1, 3, 6, and 12-month +predictions using Platt scaling. Employed SHAP values to analyze predictor +impacts. Results: GB, XGB, and RF models showed the highest C-Index values +(0.714, 0.712, 0.712). The optimal XGB model demonstrated a 6-month survival +prediction AUROC of 0.746 (95% CI 0.744-0.749). Key mortality predictors +include age, male gender, mobility, health status, pressure ulcer risk, and +appetite. Conclusions: The study successfully applies machine learning to +create a survival model for aged care, aligning with clinical insights on +mortality risk factors and enhancing model interpretability and clinical +utility through explainable AI. + +
+
+
+
+
+ + ☆ Academic competitions + + +
+ Academic challenges comprise effective means for (i) advancing the state of +the art, (ii) putting specific topics and problems in the spotlight of a +scientific community, as well as (iii) closing the gap for underrepresented +communities in terms of accessing and participating in the shaping of research +fields. Competitions can be traced back for centuries and their achievements +have had great influence in our modern world. Recently, they (re)gained +popularity, with the overwhelming amounts of data that are being generated in +different domains, as well as the need to push the barriers of existing +methods, and available tools to handle such data. This chapter provides a +survey of academic challenges in the context of machine learning and related +fields. We review the most influential competitions in the last few years and +analyze challenges per area of knowledge. The aims of scientific challenges, +their goals, major achievements and expectations for the next few years are +reviewed. +
+
+
+
+
+ + ☆ Sample Efficient Reinforcement Learning from Human Feedback via Active + Exploration + + +
+ Preference-based feedback is important for many applications in reinforcement +learning where direct evaluation of a reward function is not feasible. A +notable recent example arises in reinforcement learning from human feedback +(RLHF) on large language models. For many applications of RLHF, the cost of +acquiring the human feedback can be substantial. In this work, we take +advantage of the fact that one can often choose contexts at which to obtain +human feedback in order to most efficiently identify a good policy, and +formalize this as an offline contextual dueling bandit problem. We give an +upper-confidence-bound style algorithm for this problem and prove a polynomial +worst-case regret bound. We then provide empirical confirmation in a synthetic +setting that our approach outperforms existing methods. After, we extend the +setting and methodology for practical use in RLHF training of large language +models. Here, our method is able to reach better performance with fewer samples +of human preferences than multiple baselines on three real-world datasets. + +
+
+
+
+
+ + ♻ ☆ Topological properties of basins of attraction and expressiveness of + width bounded neural networks + + +
+ In Radhakrishnan et al. [2020], the authors empirically show that +autoencoders trained with usual SGD methods shape out basins of attraction +around their training data. We consider network functions of width not +exceeding the input dimension and prove that in this situation basins of +attraction are bounded and their complement cannot have bounded components. Our +conditions in these results are met in several experiments of the latter work +and we thus address a question posed therein. We also show that under some more +restrictive conditions the basins of attraction are path-connected. The +tightness of the conditions in our results is demonstrated by means of several +examples. Finally, the arguments used to prove the above results allow us to +derive a root cause why scalar-valued neural network functions that fulfill our +bounded width condition are not dense in spaces of continuous functions. + +
+
+
+
+
+ + ♻ ☆ Persona-Coded Poly-Encoder: Persona-Guided Multi-Stream Conversational + Sentence Scoring ICTAI + + +
+ Recent advances in machine learning and deep learning have led to the +widespread use of Conversational AI in many practical applications. However, it +is still very challenging to leverage auxiliary information that can provide +conversational context or personalized tuning to improve the quality of +conversations. For example, there has only been limited research on using an +individuals persona information to improve conversation quality, and even +state-of-the-art conversational AI techniques are unable to effectively +leverage signals from heterogeneous sources of auxiliary data, such as +multi-modal interaction data, demographics, SDOH data, etc. In this paper, we +present a novel Persona-Coded Poly-Encoder method that leverages persona +information in a multi-stream encoding scheme to improve the quality of +response generation for conversations. To show the efficacy of the proposed +method, we evaluate our method on two different persona-based conversational +datasets, and compared against two state-of-the-art methods. Our experimental +results and analysis demonstrate that our method can improve conversation +quality over the baseline method Poly-Encoder by 3.32% and 2.94% in terms of +BLEU score and HR@1, respectively. More significantly, our method offers a path +to better utilization of multi-modal data in conversational tasks. Lastly, our +study outlines several challenges and future research directions for advancing +personalized conversational AI technology. + +
+
+ comment: The 35th IEEE International Conference on Tools with Artificial + Intelligence (ICTAI) +
+
+
+
+
+ + ♻ ☆ TpuGraphs: A Performance Prediction Dataset on Large Tensor + Computational Graphs + + +
+ Precise hardware performance models play a crucial role in code +optimizations. They can assist compilers in making heuristic decisions or aid +autotuners in identifying the optimal configuration for a given program. For +example, the autotuner for XLA, a machine learning compiler, discovered 10-20% +speedup on state-of-the-art models serving substantial production traffic at +Google. Although there exist a few datasets for program performance prediction, +they target small sub-programs such as basic blocks or kernels. This paper +introduces TpuGraphs, a performance prediction dataset on full tensor programs, +represented as computational graphs, running on Tensor Processing Units (TPUs). +Each graph in the dataset represents the main computation of a machine learning +workload, e.g., a training epoch or an inference step. Each data sample +contains a computational graph, a compilation configuration, and the execution +time of the graph when compiled with the configuration. The graphs in the +dataset are collected from open-source machine learning programs, featuring +popular model architectures, e.g., ResNet, EfficientNet, Mask R-CNN, and +Transformer. TpuGraphs provides 25x more graphs than the largest graph property +prediction dataset (with comparable graph sizes), and 770x larger graphs on +average compared to existing performance prediction datasets on machine +learning programs. This graph-level prediction task on large graphs introduces +new challenges in learning, ranging from scalability, training efficiency, to +model quality. + +
+
+
+
+
+ + ♻ ☆ Contrastive losses as generalized models of global epistasis + + +
+ Fitness functions map large combinatorial spaces of biological sequences to +properties of interest. Inferring these multimodal functions from experimental +data is a central task in modern protein engineering. Global epistasis models +are an effective and physically-grounded class of models for estimating fitness +functions from observed data. These models assume that a sparse latent function +is transformed by a monotonic nonlinearity to emit measurable fitness. Here we +demonstrate that minimizing contrastive loss functions, such as the +Bradley-Terry loss, is a simple and flexible technique for extracting the +sparse latent function implied by global epistasis. We argue by way of a +fitness-epistasis uncertainty principle that the nonlinearities in global +epistasis models can produce observed fitness functions that do not admit +sparse representations, and thus may be inefficient to learn from observations +when using a Mean Squared Error (MSE) loss (a common practice). We show that +contrastive losses are able to accurately estimate a ranking function from +limited data even in regimes where MSE is ineffective. We validate the +practical utility of this insight by showing contrastive loss functions result +in consistently improved performance on benchmark tasks. + +
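A minimal sketch of fitting a scalar score with the Bradley-Terry pairwise loss, -log sigmoid(f(x_i) - f(x_j)) over pairs where sequence i outranks sequence j. The toy data applies a monotone warping to a sparse latent fitness, loosely mimicking global epistasis; the network, pair-sampling scheme, and warping are assumptions made for illustration.

```python
import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(20, 64), nn.ReLU(), nn.Linear(64, 1))
opt = torch.optim.Adam(model.parameters(), lr=1e-3)

def bradley_terry_loss(scores_win, scores_lose):
    """-log sigmoid(f(winner) - f(loser)), averaged over pairs."""
    return nn.functional.softplus(-(scores_win - scores_lose)).mean()

# Toy data: a sparse latent fitness distorted by a monotone nonlinearity.
x = torch.randn(512, 20)
latent = x[:, 0] + 0.5 * x[:, 1]
observed = torch.tanh(2.0 * latent)              # monotone warping of fitness

for _ in range(300):
    i = torch.randint(0, 512, (256,))
    j = torch.randint(0, 512, (256,))
    winners = torch.where(observed[i] >= observed[j], i, j)
    losers = torch.where(observed[i] >= observed[j], j, i)
    loss = bradley_terry_loss(model(x[winners]).squeeze(-1),
                              model(x[losers]).squeeze(-1))
    opt.zero_grad()
    loss.backward()
    opt.step()

# Correlation with the latent fitness should be high despite the warping.
pred = model(x).squeeze(-1).detach()
print(torch.corrcoef(torch.stack([pred, latent]))[0, 1])
```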
+
+
+
+
+ + ♻ ☆ Learning Causally Disentangled Representations via the Principle of + Independent Causal Mechanisms NeurIPS 2023 + + +
+ Learning disentangled causal representations is a challenging problem that +has gained significant attention recently due to its implications for +extracting meaningful information for downstream tasks. In this work, we define +a new notion of causal disentanglement from the perspective of independent +causal mechanisms. We propose ICM-VAE, a framework for learning causally +disentangled representations supervised by causally related observed labels. We +model causal mechanisms using learnable flow-based diffeomorphic functions to +map noise variables to latent causal variables. Further, to promote the +disentanglement of causal factors, we propose a causal disentanglement prior +that utilizes the known causal structure to encourage learning a causally +factorized distribution in the latent space. Under relatively mild conditions, +we provide theoretical results showing the identifiability of causal factors +and mechanisms up to permutation and elementwise reparameterization. We +empirically demonstrate that our framework induces highly disentangled causal +factors, improves interventional robustness, and is compatible with +counterfactual generation. + +
+
+ comment: Accepted to the NeurIPS 2023 Workshop on Causal Representation + Learning +
+
+
+
+
+ + ♻ ☆ HyperAttention: Long-context Attention in Near-Linear Time + + +
+ We present an approximate attention mechanism named HyperAttention to address +the computational challenges posed by the growing complexity of long contexts +used in Large Language Models (LLMs). Recent work suggests that in the +worst-case scenario, quadratic time is necessary unless the entries of the +attention matrix are bounded or the matrix has low stable rank. We introduce +two parameters which measure: (1) the max column norm in the normalized +attention matrix, and (2) the ratio of row norms in the unnormalized attention +matrix after detecting and removing large entries. We use these fine-grained +parameters to capture the hardness of the problem. Despite previous lower +bounds, we are able to achieve a linear time sampling algorithm even when the +matrix has unbounded entries or a large stable rank, provided the above +parameters are small. HyperAttention features a modular design that easily +accommodates integration of other fast low-level implementations, particularly +FlashAttention. Empirically, employing Locality Sensitive Hashing (LSH) to +identify large entries, HyperAttention outperforms existing methods, giving +significant speed improvements compared to state-of-the-art solutions like +FlashAttention. We validate the empirical performance of HyperAttention on a +variety of different long-context length datasets. For example, HyperAttention +makes the inference time of ChatGLM2 50\% faster on 32k context length while +perplexity increases from 5.6 to 6.3. On larger context length, e.g., 131k, +with causal masking, HyperAttention offers 5-fold speedup on a single attention +layer. + +
+
+
+
+
+ + ♻ ☆ Upper and lower bounds for the Lipschitz constant of random neural + networks + + +
+ Empirical studies have widely demonstrated that neural networks are highly +sensitive to small, adversarial perturbations of the input. The worst-case +robustness against these so-called adversarial examples can be quantified by +the Lipschitz constant of the neural network. In this paper, we study upper and +lower bounds for the Lipschitz constant of random ReLU neural networks. +Specifically, we assume that the weights and biases follow a generalization of +the He initialization, where general symmetric distributions for the biases are +permitted. For shallow neural networks, we characterize the Lipschitz constant +up to an absolute numerical constant. For deep networks with fixed depth and +sufficiently large width, our established bounds differ by a factor that is +logarithmic in the width. + +
+
+
+
+
+ + ♻ ☆ Adaptive Deep Neural Network Inference Optimization with EENet + + +
+ Well-trained deep neural networks (DNNs) treat all test samples equally +during prediction. Adaptive DNN inference with early exiting leverages the +observation that some test examples can be easier to predict than others. This +paper presents EENet, a novel early-exiting scheduling framework for multi-exit +DNN models. Instead of having every sample go through all DNN layers during +prediction, EENet learns an early exit scheduler, which can intelligently +terminate inference early for predictions in which the model has +high confidence. As opposed to previous early-exiting solutions +with heuristics-based methods, our EENet framework optimizes an early-exiting +policy to maximize model accuracy while satisfying the given per-sample average +inference budget. Extensive experiments are conducted on four computer vision +datasets (CIFAR-10, CIFAR-100, ImageNet, Cityscapes) and two NLP datasets +(SST-2, AgNews). The results demonstrate that the adaptive inference by EENet +can outperform representative existing early exit techniques. We also +perform a detailed visualization analysis of the comparison results to +interpret the benefits of EENet. +
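A bare-bones illustration of multi-exit inference: auxiliary classifiers after each block, with prediction halting at the first exit whose confidence clears a threshold. EENet learns the scheduler; the fixed thresholds and toy architecture here are assumptions for illustration only.

```python
import torch
import torch.nn as nn

class MultiExitNet(nn.Module):
    """Toy backbone with an auxiliary classifier ("exit") after each block."""
    def __init__(self, dim=32, n_classes=10, n_blocks=3):
        super().__init__()
        self.blocks = nn.ModuleList(
            [nn.Sequential(nn.Linear(dim, dim), nn.ReLU()) for _ in range(n_blocks)])
        self.exits = nn.ModuleList(
            [nn.Linear(dim, n_classes) for _ in range(n_blocks)])

    def forward_early_exit(self, x, thresholds=(0.9, 0.8, 0.0)):
        for depth, (block, exit_head) in enumerate(zip(self.blocks, self.exits)):
            x = block(x)
            probs = exit_head(x).softmax(-1)
            conf, pred = probs.max(-1)
            if conf.item() >= thresholds[depth]:   # confident enough: stop here
                return pred.item(), depth
        return pred.item(), depth                  # last exit always fires

net = MultiExitNet().eval()
with torch.no_grad():
    label, used_depth = net.forward_early_exit(torch.randn(1, 32))
print(label, used_depth)
```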
+
+
+
+
+ + ♻ ☆ Contextualized Policy Recovery: Modeling and Interpreting Medical + Decisions with Adaptive Imitation Learning + + +
+ Interpretable policy learning seeks to estimate intelligible decision +policies from observed actions; however, existing models fall short by forcing +a tradeoff between accuracy and interpretability. This tradeoff limits +data-driven interpretations of human decision-making processes. For example, to audit +medical decisions for biases and suboptimal practices, we require models of +decision processes which provide concise descriptions of complex behaviors. +Fundamentally, existing approaches are burdened by this tradeoff because they +represent the underlying decision process as a universal policy, when in fact +human decisions are dynamic and can change drastically with contextual +information. Thus, we propose Contextualized Policy Recovery (CPR), which +re-frames the problem of modeling complex decision processes as a multi-task +learning problem in which complex decision policies are composed of +context-specific policies. CPR models each context-specific policy as a linear +observation-to-action mapping, and generates new decision models +$\textit{on-demand}$ as contexts are updated with new observations. CPR is +compatible with fully offline and partially observable decision environments, +and can be tailored to incorporate any recurrent black-box model or +interpretable decision model. We assess CPR through studies on simulated and +real data, achieving state-of-the-art performance on the canonical tasks of +predicting antibiotic prescription in intensive care units ($+22\%$ AUROC vs. +previous SOTA) and predicting MRI prescription for Alzheimer's patients +($+7.7\%$ AUROC vs. previous SOTA). With this improvement in predictive +performance, CPR closes the accuracy gap between interpretable and black-box +methods for policy learning, allowing high-resolution exploration and analysis +of context-specific decision models. +
+
+
+
+
+ + ♻ ☆ Learning Robust Precipitation Forecaster by Temporal Frame Interpolation + + +
+ Recent advances in deep learning have significantly elevated weather +prediction models. However, these models often falter in real-world scenarios +due to their sensitivity to spatial-temporal shifts. This issue is particularly +acute in weather forecasting, where models are prone to overfit to local and +temporal variations, especially when tasked with fine-grained predictions. In +this paper, we address these challenges by developing a robust precipitation +forecasting model that demonstrates resilience against such spatial-temporal +discrepancies. We introduce Temporal Frame Interpolation (TFI), a novel +technique that enhances the training dataset by generating synthetic samples +through interpolating adjacent frames from satellite imagery and ground radar +data, thus improving the model's robustness against frame noise. Moreover, we +incorporate a unique Multi-Level Dice (ML-Dice) loss function, leveraging the +ordinal nature of rainfall intensities to improve the model's performance. Our +approach has led to significant improvements in forecasting precision, +culminating in our model securing \textit{1st place} in the transfer learning +leaderboard of the \textit{Weather4cast'23} competition. This achievement not +only underscores the effectiveness of our methodologies but also establishes a +new standard for deep learning applications in weather forecasting. Our code +and weights have been public on \url{https://github.com/Secilia-Cxy/UNetTFI}. + +
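Temporal Frame Interpolation is straightforward to sketch: blend adjacent frames of a sequence to create synthetic training samples. The blending weight and array shapes below are assumptions; the paper's exact interpolation scheme may differ.

```python
import numpy as np

def temporal_frame_interpolation(sequence, alpha=0.5):
    """Blend each pair of adjacent frames to create an augmented sequence.

    sequence: array of shape (T, H, W); returns shape (T - 1, H, W).
    """
    return alpha * sequence[:-1] + (1.0 - alpha) * sequence[1:]

rng = np.random.default_rng(0)
radar = rng.random((8, 64, 64)).astype(np.float32)   # toy 8-frame radar sequence

augmented = temporal_frame_interpolation(radar, alpha=0.5)
print(augmented.shape)   # (7, 64, 64): extra synthetic samples for training
```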
+
+ comment: Previous version has text overlap with last year's paper + arXiv:2212.02968 since the competition's datasets does not change. We restate + the dataset description to avoid it. We also polish the overall writing +
+
+
+
+
+ + ♻ ☆ Bayesian CART models for insurance claims frequency + + +
+ Accuracy and interpretability of a (non-life) insurance pricing model are +essential qualities to ensure fair and transparent premiums for policy-holders, +that reflect their risk. In recent years, the classification and regression +trees (CARTs) and their ensembles have gained popularity in the actuarial +literature, since they offer good prediction performance and are relatively +easily interpretable. In this paper, we introduce Bayesian CART models for +insurance pricing, with a particular focus on claims frequency modelling. +Additionally to the common Poisson and negative binomial (NB) distributions +used for claims frequency, we implement Bayesian CART for the zero-inflated +Poisson (ZIP) distribution to address the difficulty arising from the +imbalanced insurance claims data. To this end, we introduce a general MCMC +algorithm using data augmentation methods for posterior tree exploration. We +also introduce the deviance information criterion (DIC) for the tree model +selection. The proposed models are able to identify trees which can better +classify the policy-holders into risk groups. Some simulations and real +insurance data will be discussed to illustrate the applicability of these +models. + +
+
+ comment: 46 pages +
+
+
+
+
+ + ♻ ☆ Uncertainty Estimation and Out-of-Distribution Detection for Deep + Learning-Based Image Reconstruction using the Local Lipschitz + + +
+ Accurate image reconstruction is at the heart of diagnostics in medical +imaging. Supervised deep learning-based approaches have been investigated for +solving inverse problems including image reconstruction. However, these trained +models encounter unseen data distributions that are widely shifted from +training data during deployment. Therefore, it is essential to assess whether a +given input falls within the training data distribution for diagnostic +purposes. Uncertainty estimation approaches exist but focus on providing an +uncertainty map to radiologists, rather than assessing the training +distribution fit. In this work, we propose a method based on the local +Lipschitz-based metric to distinguish out-of-distribution images from +in-distribution with an area under the curve of 99.94%. Empirically, we +demonstrate a very strong relationship between the local Lipschitz value and +mean absolute error (MAE), supported by a high Spearman's rank correlation +coefficient of 0.8475, which determines the uncertainty estimation threshold +for optimal model performance. Through the identification of false positives, +the local Lipschitz and MAE relationship was used to guide data augmentation +and reduce model uncertainty. Our study was validated using the AUTOMAP +architecture for sensor-to-image Magnetic Resonance Imaging (MRI) +reconstruction. We compare our proposed approach with baseline methods: +Monte-Carlo dropout and deep ensembles, and further analysis included MRI +denoising and Computed Tomography (CT) sparse-to-full view reconstruction using +UNET architectures. We show that our approach is applicable to various +architectures and learned functions, especially in the realm of medical image +reconstruction, where preserving the diagnostic accuracy of reconstructed +images remains paramount. + +
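One simple way to probe a local Lipschitz value, in the spirit described above, is to perturb the input with small random directions and take the largest output-change to input-change ratio; larger values flag inputs far from the training distribution. The probe count, radius, and toy network below are illustrative assumptions, not the paper's exact metric or architecture.

```python
import torch

def local_lipschitz(model, x, n_probes=32, radius=1e-2):
    """Largest ||f(x + d) - f(x)|| / ||d|| over random small perturbations d."""
    with torch.no_grad():
        fx = model(x)
        ratios = []
        for _ in range(n_probes):
            d = torch.randn_like(x)
            d = radius * d / d.norm()
            ratios.append(((model(x + d) - fx).norm() / d.norm()).item())
    return max(ratios)

model = torch.nn.Sequential(torch.nn.Linear(128, 256), torch.nn.ReLU(),
                            torch.nn.Linear(256, 128))
x_in = torch.randn(1, 128)            # nominally "in-distribution" input
x_shift = 10.0 * torch.randn(1, 128)  # crudely shifted input
print(local_lipschitz(model, x_in), local_lipschitz(model, x_shift))
```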
+
+
+
+
+ + ♻ ☆ On the Trade-off of Intra-/Inter-class Diversity for Supervised + Pre-training NeurIPS 2023 + + +
+ Pre-training datasets are critical for building state-of-the-art machine +learning models, motivating rigorous study on their impact on downstream tasks. +In this work, we study the impact of the trade-off between the intra-class +diversity (the number of samples per class) and the inter-class diversity (the +number of classes) of a supervised pre-training dataset. Empirically, we found +that with the size of the pre-training dataset fixed, the best downstream +performance comes with a balance on the intra-/inter-class diversity. To +understand the underlying mechanism, we show theoretically that the downstream +performance depends monotonically on both types of diversity. Notably, our +theory reveals that the optimal class-to-sample ratio (#classes / #samples per +class) is invariant to the size of the pre-training dataset, which motivates an +application of predicting the optimal number of pre-training classes. We +demonstrate the effectiveness of this application by an improvement of around 2 +points on the downstream tasks when using ImageNet as the pre-training dataset. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Adversarial Attacks and Defenses on 3D Point Cloud Classification: A + Survey + + +
+ As a dominant AI technique, deep learning has successfully solved a wide +range of tasks in 2D vision. Recently, deep learning on 3D point clouds has +become increasingly popular for addressing various tasks in this field. Despite +remarkable achievements, deep learning algorithms are vulnerable to adversarial +attacks. These attacks are imperceptible to the human eye but can easily fool +deep neural networks in the testing and deployment stage. To encourage future +research, this survey summarizes the current progress on adversarial attack and +defense techniques on point cloud classification. This paper first introduces +the principles and characteristics of adversarial attacks and summarizes and +analyzes adversarial example generation methods in recent years. Additionally, +it provides an overview of defense strategies, organized into data-focused and +model-focused methods. Finally, it presents several current challenges and +potential future research directions in this domain. +
+
+
+
+
+ + ♻ ☆ MLLMs-Augmented Visual-Language Representation Learning + + +
+ Visual-language pre-training (VLP) has achieved remarkable success in +multi-modal tasks, largely attributed to the availability of large-scale +image-text datasets. In this work, we demonstrate that multi-modal large +language models (MLLMs) can enhance visual-language representation learning by +improving data quality. Our approach is simple, utilizing MLLMs to extend +multiple captions for each image. To prevent the bias introduced by MLLMs' +hallucinations and intrinsic caption styles, we propose "text shearing" to +maintain the same length for extended captions as that of the original +captions. In image-text retrieval, our method consistently obtains 5.6 ~ 35.0% +and 16.8 ~ 46.1% improvement on R@1 under the fine-tuning and zero-shot +settings, respectively. Notably, we obtain zero-shot results that are +comparable to fine-tuning on target datasets, which encourages more exploration +of the versatile use of MLLMs. + +
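The "text shearing" step can be sketched as truncating each MLLM-extended caption to the original caption's length. Word-level truncation is an assumption here; the paper may measure length in tokens.

```python
def text_shear(original_caption, extended_captions):
    """Truncate MLLM-extended captions to the original caption's length
    (word-level here; the paper may operate on tokens instead)."""
    max_len = len(original_caption.split())
    return [" ".join(c.split()[:max_len]) for c in extended_captions]

original = "a brown dog running on the beach"
extended = [
    "a brown dog with a red collar sprinting joyfully along a sunny beach shoreline",
    "an energetic brown dog kicks up sand while running near gentle ocean waves",
]
print(text_shear(original, extended))
```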
+
+
+
+
+ + ♻ ☆ Dissecting the Runtime Performance of the Training, Fine-tuning, and + Inference of Large Language Models + + +
+ Large Language Models (LLMs) have seen great advances in both academia and +industry, and their popularity has resulted in numerous open-source frameworks and +techniques for accelerating LLM pre-training, fine-tuning, and inference. +Training and deploying LLMs are expensive as they require considerable computing +resources and memory, hence many efficient approaches have been developed for +improving system pipelines as well as operators. However, the runtime +performance can vary significantly across hardware and software stacks, which +makes it difficult to choose the best configuration. In this work, we aim to +benchmark the performance from both macro and micro perspectives. First, we +benchmark the end-to-end performance of pre-training, fine-tuning, and serving +LLMs of different sizes, i.e., 7, 13, and 70 billion parameters (7B, 13B, and +70B), on three 8-GPU platforms with and without individual optimization +techniques, including ZeRO, quantization, recomputation, and FlashAttention. Then, +we dive deeper to provide a detailed runtime analysis of the sub-modules, +including computing and communication operators in LLMs. For end users, our +benchmark and findings help better understand different optimization +techniques, training and inference frameworks, together with hardware platforms +in choosing configurations for deploying LLMs. For researchers, our in-depth +module-wise analyses discover potential opportunities for future work to +further optimize the runtime performance of LLMs. +
+
+
+
+
+ + ♻ ☆ Does a Neural Network Really Encode Symbolic Concepts? + + +
+ Recently, a series of studies have tried to extract interactions between +input variables modeled by a DNN and define such interactions as concepts +encoded by the DNN. However, strictly speaking, there still lacks a solid +guarantee whether such interactions indeed represent meaningful concepts. +Therefore, in this paper, we examine the trustworthiness of interaction +concepts from four perspectives. Extensive empirical studies have verified that +a well-trained DNN usually encodes sparse, transferable, and discriminative +concepts, which is partially aligned with human intuition. + +
+
+
+
+
+ + ♻ ☆ Classification Utility, Fairness, and Compactness via Tunable + Information Bottleneck and Rényi Measures + + +
+ Designing machine learning algorithms that are accurate yet fair, not +discriminating based on any sensitive attribute, is of paramount importance for +society to accept AI for critical applications. In this article, we propose a +novel fair representation learning method termed the R\'enyi Fair Information +Bottleneck Method (RFIB) which incorporates constraints for utility, fairness, +and compactness (compression) of representation, and apply it to image and +tabular data classification. A key attribute of our approach is that we +consider - in contrast to most prior work - both demographic parity and +equalized odds as fairness constraints, allowing for a more nuanced +satisfaction of both criteria. Leveraging a variational approach, we show that +our objectives yield a loss function involving classical Information Bottleneck +(IB) measures and establish an upper bound in terms of two R\'enyi measures of +order $\alpha$ on the mutual information IB term measuring compactness between +the input and its encoded embedding. We study the influence of the $\alpha$ +parameter as well as two other tunable IB parameters on achieving +utility/fairness trade-off goals, and show that the $\alpha$ parameter gives an +additional degree of freedom that can be used to control the compactness of the +representation. Experimenting on three different image datasets (EyePACS, +CelebA, and FairFace) and two tabular datasets (Adult and COMPAS), using both +binary and categorical sensitive attributes, we show that on various utility, +fairness, and compound utility/fairness metrics RFIB outperforms current +state-of-the-art approaches. + +
+
+ comment: To appear in the IEEE Transactions on Information Forensics & + Security +
+
+
+
+
+ + ♻ ☆ RAFT: Reward rAnked FineTuning for Generative Foundation Model Alignment + + +
+ Generative foundation models are susceptible to implicit biases that can +arise from extensive unsupervised training data. Such biases can produce +suboptimal samples, skewed outcomes, and unfairness, with potentially serious +consequences. Consequently, aligning these models with human ethics and +preferences is an essential step toward ensuring their responsible and +effective deployment in real-world applications. Prior research has primarily +employed Reinforcement Learning from Human Feedback (RLHF) to address this +problem, where generative models are fine-tuned with RL algorithms guided by a +human-feedback-informed reward model. However, the inefficiencies and +instabilities associated with RL algorithms frequently present substantial +obstacles to the successful alignment, necessitating the development of a more +robust and streamlined approach. To this end, we introduce a new framework, +Reward rAnked FineTuning (RAFT), designed to align generative models +effectively. Utilizing a reward model and a sufficient number of samples, our +approach selects the high-quality samples, discarding those that exhibit +undesired behavior, and subsequently enhancing the model by fine-tuning on +these filtered samples. Our studies show that RAFT can effectively improve the +model performance in both reward learning and other automated metrics in both +large language models and diffusion models. + +
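The RAFT loop reduces to: sample several responses per prompt, rank them with the reward model, keep the best, and fine-tune on the filtered set. In the sketch below, `generate`, `reward_model`, and `fine_tune` are hypothetical placeholders, not the authors' API.

```python
import random

def generate(prompt, k):
    """Placeholder for sampling k responses from the current policy model."""
    return [f"{prompt} -> candidate {i}" for i in range(k)]

def reward_model(prompt, response):
    """Placeholder reward model scoring a (prompt, response) pair."""
    return random.random()

def fine_tune(model, dataset):
    """Placeholder supervised fine-tuning step on the filtered samples."""
    print(f"fine-tuning on {len(dataset)} reward-ranked samples")
    return model

def raft_iteration(model, prompts, k=4, keep_top=1):
    filtered = []
    for prompt in prompts:
        candidates = generate(prompt, k)
        scored = sorted(candidates, key=lambda r: reward_model(prompt, r),
                        reverse=True)
        filtered.extend((prompt, r) for r in scored[:keep_top])  # best-of-k filtering
    return fine_tune(model, filtered)

model = object()   # stands in for a generative model
model = raft_iteration(model, ["prompt A", "prompt B"], k=4)
```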
+
+ comment: 29 pages, 12 figures, Published in Transactions on Machine Learning + Research (TMLR) +
+
+
+
+
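A minimal, self-contained sketch of the reward-ranked fine-tuning loop described above, with a toy Gaussian generator and a synthetic reward standing in for the foundation model and the learned reward model; all names and the toy objective are illustrative assumptions, not the authors' implementation:

```python
# Toy sketch of Reward rAnked FineTuning (RAFT): sample, score with a reward
# model, keep the best samples, then fine-tune the generator on them.
# The "generator" is a 1-D Gaussian and "fine-tuning" is a maximum-likelihood
# update of its mean -- stand-ins for an LLM/diffusion model and SFT.
import numpy as np

rng = np.random.default_rng(0)

def reward_model(samples, target=3.0):
    # Stand-in reward: prefers samples close to a target unknown to the generator.
    return -np.abs(samples - target)

mu, sigma = 0.0, 1.0          # generator parameters
n_samples, keep_frac = 256, 0.1

for step in range(20):
    # 1) Sample a batch from the current generator.
    batch = rng.normal(mu, sigma, size=n_samples)
    # 2) Rank by reward and keep only the top fraction (filter undesired behavior).
    k = int(keep_frac * n_samples)
    top = batch[np.argsort(reward_model(batch))[-k:]]
    # 3) "Fine-tune" the generator on the filtered samples (MLE of the mean here).
    mu = top.mean()
    print(f"step {step:2d}  mean reward of kept samples = {reward_model(top).mean():.3f}")
```

The same select-then-fine-tune pattern carries over when the generator is a language or diffusion model and the update step is ordinary supervised training on the retained samples.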
+ + ♻ ☆ Learning Delays in Spiking Neural Networks using Dilated Convolutions + with Learnable Spacings + + +
+ Spiking Neural Networks (SNNs) are a promising research direction for +building power-efficient information processing systems, especially for +temporal tasks such as speech recognition. In SNNs, delays refer to the time +needed for one spike to travel from one neuron to another. These delays matter +because they influence the spike arrival times, and it is well-known that +spiking neurons respond more strongly to coincident input spikes. More +formally, it has been shown theoretically that plastic delays greatly increase +the expressivity in SNNs. Yet, efficient algorithms to learn these delays have +been lacking. Here, we propose a new discrete-time algorithm that addresses +this issue in deep feedforward SNNs using backpropagation, in an offline +manner. To simulate delays between consecutive layers, we use 1D convolutions +across time. The kernels contain only a few non-zero weights - one per synapse +- whose positions correspond to the delays. These positions are learned +together with the weights using the recently proposed Dilated Convolution with +Learnable Spacings (DCLS). We evaluated our method on three datasets: the +Spiking Heidelberg Dataset (SHD), the Spiking Speech Commands (SSC) and its +non-spiking version Google Speech Commands v0.02 (GSC) benchmarks, which +require detecting temporal patterns. We used feedforward SNNs with two or three +hidden fully connected layers, and vanilla leaky integrate-and-fire neurons. We +showed that fixed random delays help and that learning them helps even more. +Furthermore, our method outperformed the state-of-the-art in the three datasets +without using recurrent connections and with substantially fewer parameters. +Our work demonstrates the potential of delay learning in developing accurate +and precise models for temporal data processing. Our code is based on PyTorch / +SpikingJelly and available at: https://github.com/Thvnvtos/SNN-delays + +
+
+
+
+
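A small numpy sketch of the delay representation described above: a synaptic delay expressed as a 1-D convolution over time whose kernel has a single non-zero tap at the delay position. The interpolation trick that DCLS uses to make the tap position learnable is omitted, and the delay is fixed here for illustration:

```python
# Sketch: a per-synapse delay expressed as a 1-D convolution over time whose
# kernel contains a single non-zero weight at the delay position. DCLS makes
# that position (and the weight) learnable; here the delay is fixed.
import numpy as np

def delayed_synapse(spike_train, weight, delay, max_delay):
    # Kernel of length max_delay+1 with one non-zero tap at index `delay`.
    kernel = np.zeros(max_delay + 1)
    kernel[delay] = weight
    # Causal convolution: output at t depends on input at t - delay.
    padded = np.concatenate([np.zeros(max_delay), spike_train])
    return np.array([padded[t:t + max_delay + 1][::-1] @ kernel
                     for t in range(len(spike_train))])

spikes = np.array([0, 1, 0, 0, 1, 0, 0, 0, 1, 0], dtype=float)
out = delayed_synapse(spikes, weight=0.8, delay=3, max_delay=5)
print(out)   # spikes re-appear 3 steps later, scaled by the synaptic weight
```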
+ + ♻ ☆ Decentralized policy learning with partial observation and mechanical + constraints for multiperson modeling + + +
+ Extracting the rules of real-world multi-agent behaviors is a current +challenge in various scientific and engineering fields. Biological agents +independently have limited observation and mechanical constraints; however, +most of the conventional data-driven models ignore such assumptions, resulting +in lack of biological plausibility and model interpretability for behavioral +analyses. Here we propose sequential generative models with partial observation +and mechanical constraints in a decentralized manner, which can model agents' +cognition and body dynamics, and predict biologically plausible behaviors. We +formulate this as a decentralized multi-agent imitation-learning problem, +leveraging binary partial observation and decentralized policy models based on +hierarchical variational recurrent neural networks with physical and +biomechanical penalties. Using real-world basketball and soccer datasets, we +show the effectiveness of our method in terms of the constraint violations, +long-term trajectory prediction, and partial observation. Our approach can be +used as a multi-agent simulator to generate realistic trajectories using +real-world data. + +
+
+ comment: 17 pages with 7 figures and 4 tables, accepted in Neural Networks +
+
+
+
+
+ + ♻ ☆ Use Perturbations when Learning from Explanations NeurIPS 2023 + + +
+ Machine learning from explanations (MLX) is an approach to learning that uses +human-provided explanations of relevant or irrelevant features for each input +to ensure that model predictions are right for the right reasons. Existing MLX +approaches rely on local model interpretation methods and require strong model +smoothing to align model and human explanations, leading to sub-optimal +performance. We recast MLX as a robustness problem, where human explanations +specify a lower dimensional manifold from which perturbations can be drawn, and +show both theoretically and empirically how this approach alleviates the need +for strong model smoothing. We consider various approaches to achieving +robustness, leading to improved performance over prior MLX methods. Finally, we +show how to combine robustness with an earlier MLX method, yielding +state-of-the-art results on both synthetic and real-world benchmarks. + +
+
+ comment: NeurIPS 2023; https://github.com/vihari/robust_mlx +
+
+
+
+
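A toy sketch of the general perturbation recipe, assuming a human-provided mask of irrelevant features and simple noise augmentation in place of the paper's exact robustness objective:

```python
# Toy sketch of learning from explanations via perturbations: a human-provided
# mask marks irrelevant input features, perturbations are drawn only along
# those features, and the model is trained on the augmented data so it cannot
# rely on the marked features. A simplified stand-in for the paper's method.
import numpy as np
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
n, d = 500, 5
X = rng.normal(size=(n, d))
y = (X[:, 0] > 0).astype(int)            # only feature 0 is deemed relevant
X[:, 1] = y + 0.1 * rng.normal(size=n)   # shortcut feature correlated with y

irrelevant = np.array([False, True, True, True, True])   # human explanation

def perturb(X, mask, scale=3.0, copies=5):
    reps = np.repeat(X, copies, axis=0)
    noise = scale * rng.normal(size=reps.shape) * mask    # perturb masked dims only
    return reps + noise

X_aug = np.vstack([X, perturb(X, irrelevant)])
y_aug = np.concatenate([y, np.repeat(y, 5)])

plain = LogisticRegression(max_iter=1000).fit(X, y)
robust = LogisticRegression(max_iter=1000).fit(X_aug, y_aug)
print("weight on shortcut feature:", plain.coef_[0, 1], "->", robust.coef_[0, 1])
```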
+ + ♻ ☆ Bandwidth Selection for Gaussian Kernel Ridge Regression via Jacobian + Control + + +
+ Most machine learning methods require tuning of hyper-parameters. For kernel +ridge regression with the Gaussian kernel, the hyper-parameter is the +bandwidth. The bandwidth specifies the length scale of the kernel and has to be +carefully selected to obtain a model with good generalization. The default +methods for bandwidth selection, cross-validation and marginal likelihood +maximization, often yield good results, albeit at high computational costs. +Inspired by Jacobian regularization, we formulate an approximate expression for +how the derivatives of the functions inferred by kernel ridge regression with +the Gaussian kernel depend on the kernel bandwidth. We use this expression to +propose a closed-form, computationally feather-light, bandwidth selection +heuristic, based on controlling the Jacobian. In addition, the Jacobian +expression illuminates how the bandwidth selection is a trade-off between the +smoothness of the inferred function and the conditioning of the training data +kernel matrix. We show on real and synthetic data that compared to +cross-validation and marginal likelihood maximization, our method is on par in +terms of model performance, but up to six orders of magnitude faster. + +
+
+
+
+
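For context, a minimal sketch of the problem setting: Gaussian kernel ridge regression with the bandwidth chosen by cross-validated grid search, the costly default that the closed-form Jacobian heuristic is designed to replace (the heuristic itself is not reproduced here):

```python
# Gaussian kernel ridge regression and the standard (costly) bandwidth
# selection by cross-validated grid search -- the baseline that the paper's
# closed-form Jacobian-based heuristic aims to replace.
import numpy as np
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import GridSearchCV

rng = np.random.default_rng(0)
X = rng.uniform(-3, 3, size=(200, 1))
y = np.sin(X[:, 0]) + 0.1 * rng.normal(size=200)

# sklearn's RBF kernel uses gamma = 1 / (2 * bandwidth**2)
bandwidths = np.logspace(-2, 1, 20)
grid = {"gamma": 1.0 / (2.0 * bandwidths ** 2)}
search = GridSearchCV(KernelRidge(kernel="rbf", alpha=1e-2), grid, cv=5)
search.fit(X, y)
best_gamma = search.best_params_["gamma"]
print("selected bandwidth:", np.sqrt(1.0 / (2.0 * best_gamma)))
```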
+ + ♻ ☆ A Definition of Continual Reinforcement Learning NeurIPS 2023 + + +
+ In a standard view of the reinforcement learning problem, an agent's goal is +to efficiently identify a policy that maximizes long-term reward. However, this +perspective is based on a restricted view of learning as finding a solution, +rather than treating learning as endless adaptation. In contrast, continual +reinforcement learning refers to the setting in which the best agents never +stop learning. Despite the importance of continual reinforcement learning, the +community lacks a simple definition of the problem that highlights its +commitments and makes its primary concepts precise and clear. To this end, this +paper is dedicated to carefully defining the continual reinforcement learning +problem. We formalize the notion of agents that "never stop learning" through a +new mathematical language for analyzing and cataloging agents. Using this new +language, we define a continual learning agent as one that can be understood as +carrying out an implicit search process indefinitely, and continual +reinforcement learning as the setting in which the best agents are all +continual learning agents. We provide two motivating examples, illustrating +that traditional views of multi-task reinforcement learning and continual +supervised learning are special cases of our definition. Collectively, these +definitions and perspectives formalize many intuitive concepts at the heart of +learning, and open new research pathways surrounding continual learning agents. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Action valuation of on- and off-ball soccer players based on multi-agent + deep reinforcement learning + + +
+ Analysis of invasive sports such as soccer is challenging because the game +situation changes continuously in time and space, and multiple agents +individually recognize the game situation and make decisions. Previous studies +using deep reinforcement learning have often considered teams as a single agent +and valued the teams and players who hold the ball in each discrete event. It +has therefore been challenging to value the actions of multiple players, including players +far from the ball, in a spatiotemporally continuous state space. In this paper, +we propose a method of valuing possible actions for on- and off-ball soccer +players in a single holistic framework based on multi-agent deep reinforcement +learning. We consider a discrete action space in a continuous state space that +mimics that of Google Research Football and leverages supervised learning for +actions in reinforcement learning. In the experiment, we analyzed the +relationships with conventional indicators, season goals, and game ratings by +experts, and showed the effectiveness of the proposed method. Our approach can +assess how multiple players move continuously throughout the game, which is +difficult to discretize or label but vital for teamwork, scouting, and +fan engagement. + +
+
+ comment: 12 pages, 4 figures, published in IEEE Access. The latest version is + at https://ieeexplore.ieee.org/document/10328596 +
+
+
+
+
+ + ♻ ☆ Intrinsically motivated graph exploration using network theories of + human curiosity + + +
+ Intrinsically motivated exploration has proven useful for reinforcement +learning, even without additional extrinsic rewards. When the environment is +naturally represented as a graph, how to guide exploration best remains an open +question. In this work, we propose a novel approach for exploring +graph-structured data motivated by two theories of human curiosity: the +information gap theory and the compression progress theory. The theories view +curiosity as an intrinsic motivation to optimize for topological features of +subgraphs induced by nodes visited in the environment. We use these proposed +features as rewards for graph neural-network-based reinforcement learning. On +multiple classes of synthetically generated graphs, we find that trained agents +generalize to longer exploratory walks and larger environments than are seen +during training. Our method computes more efficiently than the greedy +evaluation of the relevant topological properties. The proposed intrinsic +motivations bear particular relevance for recommender systems. We demonstrate +that next-node recommendations considering curiosity are more predictive of +human choices than PageRank centrality in several real-world graph +environments. + +
+
+ comment: 15 pages, 5 figures in main text, and 18 pages, 9 figures in + supplement +
+
+
+
+
+ + ♻ ☆ Transfer Learning Enhanced Full Waveform Inversion + + +
+ We propose a way to favorably employ neural networks in the field of +non-destructive testing using Full Waveform Inversion (FWI). The presented +methodology discretizes the unknown material distribution in the domain with a +neural network within an adjoint optimization. To further increase efficiency +of the FWI, pretrained neural networks are used to provide a good starting +point for the inversion. This reduces the number of iterations in the Full +Waveform Inversion for specific, yet generalizable settings. + +
+
+ comment: 7 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Timewarp: Transferable Acceleration of Molecular Dynamics by Learning + Time-Coarsened Dynamics + + +
+ Molecular dynamics (MD) simulation is a widely used technique to simulate +molecular systems, most commonly at the all-atom resolution where equations of +motion are integrated with timesteps on the order of femtoseconds +($1\textrm{fs}=10^{-15}\textrm{s}$). MD is often used to compute equilibrium +properties, which requires sampling from an equilibrium distribution such as +the Boltzmann distribution. However, many important processes, such as binding +and folding, occur over timescales of milliseconds or beyond, and cannot be +efficiently sampled with conventional MD. Furthermore, new MD simulations need +to be performed for each molecular system studied. We present Timewarp, an +enhanced sampling method which uses a normalising flow as a proposal +distribution in a Markov chain Monte Carlo method targeting the Boltzmann +distribution. The flow is trained offline on MD trajectories and learns to make +large steps in time, simulating the molecular dynamics of $10^{5} - +10^{6}\:\textrm{fs}$. Crucially, Timewarp is transferable between molecular +systems: once trained, we show that it generalises to unseen small peptides +(2-4 amino acids) at all-atom resolution, exploring their metastable states and +providing wall-clock acceleration of sampling compared to standard MD. Our +method constitutes an important step towards general, transferable algorithms +for accelerating MD. + +
+
+
+
+
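A minimal 1-D illustration of the sampling scheme described above: an independence proposal (a Gaussian mixture standing in for the trained normalising flow) targets a Boltzmann distribution and is corrected by a Metropolis-Hastings accept/reject step, so the chain remains exact even when the proposal is imperfect; molecular details and the conditional flow are not modeled:

```python
# Minimal 1-D illustration of MCMC with a learned proposal targeting a
# Boltzmann distribution. A Gaussian-mixture independence proposal stands in
# for the trained normalising flow; Metropolis-Hastings keeps the chain exact
# with respect to exp(-U(x)) even when the proposal is imperfect.
import numpy as np

def U(x):                      # double-well potential (two metastable states)
    return (x**2 - 1.0)**2 / 0.1

def log_boltzmann(x):
    return -U(x)

def proposal_sample(rng):      # stand-in for the flow: mixture at the two wells
    centre = rng.choice([-1.0, 1.0])
    return rng.normal(centre, 0.3)

def proposal_logpdf(x):
    comp = lambda c: np.exp(-0.5 * ((x - c) / 0.3)**2) / (0.3 * np.sqrt(2 * np.pi))
    return np.log(0.5 * comp(-1.0) + 0.5 * comp(1.0))

rng = np.random.default_rng(0)
x, chain, accepted = 1.0, [], 0
for _ in range(20000):
    x_new = proposal_sample(rng)
    log_alpha = (log_boltzmann(x_new) - log_boltzmann(x)
                 + proposal_logpdf(x) - proposal_logpdf(x_new))
    if np.log(rng.uniform()) < log_alpha:
        x, accepted = x_new, accepted + 1
    chain.append(x)

chain = np.array(chain)
print("acceptance rate:", accepted / len(chain))
print("fraction of time in each well:", np.mean(chain < 0), np.mean(chain > 0))
```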
+ + ♻ ☆ A Unified Approach to Interpreting and Boosting Adversarial + Transferability + + +
+ In this paper, we use the interaction inside adversarial perturbations to +explain and boost the adversarial transferability. We discover and prove the +negative correlation between the adversarial transferability and the +interaction inside adversarial perturbations. The negative correlation is +further verified through different DNNs with various inputs. Moreover, this +negative correlation can be regarded as a unified perspective to understand +current transferability-boosting methods. To this end, we prove that some +classic methods of enhancing the transferability essentially decrease +interactions inside adversarial perturbations. Based on this, we propose to +directly penalize interactions during the attacking process, which +significantly improves the adversarial transferability. + +
+
+
+
+
+ + ♻ ☆ Interpreting and Disentangling Feature Components of Various Complexity + from DNNs + + +
+ This paper aims to define, quantify, and analyze the feature complexity that +is learned by a DNN. We propose a generic definition for the feature +complexity. Given the feature of a certain layer in the DNN, our method +disentangles feature components of different complexity orders from the +feature. We further design a set of metrics to evaluate the reliability, the +effectiveness, and the significance of over-fitting of these feature +components. Furthermore, we successfully discover a close relationship between +the feature complexity and the performance of DNNs. As a generic mathematical +tool, the feature complexity and the proposed metrics can also be used to +analyze the success of network compression and knowledge distillation. + +
+
+
+
+
+ + ♻ ☆ Defects of Convolutional Decoder Networks in Frequency Representation + + +
+ In this paper, we prove the representation defects of a cascaded +convolutional decoder network, considering the capacity of representing +different frequency components of an input sample. We conduct the discrete +Fourier transform on each channel of the feature map in an intermediate layer +of the decoder network. Then, we extend the 2D circular convolution theorem to +represent the forward and backward propagations through convolutional layers in +the frequency domain. Based on this, we prove three defects in representing +feature spectrums. First, we prove that the convolution operation, the +zero-padding operation, and a set of other settings all make a convolutional +decoder network more likely to weaken high-frequency components. Second, we +prove that the upsampling operation generates a feature spectrum, in which +strong signals repetitively appear at certain frequencies. Third, we prove that +if the frequency components in the input sample and frequency components in the +target output for regression have a small shift, then the decoder usually +cannot be effectively learned. + +
+
+
+
+
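An illustrative numpy sketch of the kind of spectral analysis described above: nearest-neighbour upsampling followed by a smoothing convolution, with the channel's 2-D DFT inspected before and after; the paper's formal proofs are not reproduced:

```python
# Inspecting feature spectra of decoder-style operations: nearest-neighbour
# upsampling followed by a small smoothing (circular) convolution, with the
# channel's 2-D DFT examined before and after. Upsampling introduces repeated
# spectral peaks and the convolution attenuates high frequencies -- the two
# effects discussed above. Illustrative only, not the paper's proofs.
import numpy as np

rng = np.random.default_rng(0)
feat = rng.normal(size=(16, 16))                      # one channel of a feature map

up = np.kron(feat, np.ones((2, 2)))                   # 2x nearest-neighbour upsampling

kernel = np.ones((3, 3)) / 9.0                        # smoothing (circular) convolution
pad = np.zeros_like(up)
pad[:3, :3] = kernel
smoothed = np.real(np.fft.ifft2(np.fft.fft2(up) * np.fft.fft2(pad)))

def high_freq_energy(x):
    spec = np.abs(np.fft.fftshift(np.fft.fft2(x)))
    h, w = spec.shape
    low = spec[h//4:3*h//4, w//4:3*w//4].sum()        # central (low-frequency) block
    return 1.0 - low / spec.sum()                     # share of energy at high freqs

print("high-frequency energy share:")
print("  input      ", round(high_freq_energy(feat), 3))
print("  upsampled  ", round(high_freq_energy(up), 3))
print("  + smoothing", round(high_freq_energy(smoothed), 3))
```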
+ + ♻ ☆ Bayesian Neural Networks Avoid Encoding Complex and + Perturbation-Sensitive Concepts + + +
+ In this paper, we focus on mean-field variational Bayesian Neural Networks +(BNNs) and explore the representation capacity of such BNNs by investigating +which types of concepts are less likely to be encoded by the BNN. It has been +observed and studied that a relatively small set of interactive concepts +usually emerge in the knowledge representation of a sufficiently-trained neural +network, and such concepts can faithfully explain the network output. Based on +this, our study proves that compared to standard deep neural networks (DNNs), +it is less likely for BNNs to encode complex concepts. Experiments verify our +theoretical proofs. Note that the tendency to encode less complex concepts does +not necessarily imply weak representation power, considering that complex +concepts exhibit low generalization power and high adversarial vulnerability. +The code is available at https://github.com/sjtu-xai-lab/BNN-concepts. + +
+
+
+
+
+ + ♻ ☆ HarsanyiNet: Computing Accurate Shapley Values in a Single Forward + Propagation + + +
+ The Shapley value is widely regarded as a trustworthy attribution metric. +However, when people use Shapley values to explain the attribution of input +variables of a deep neural network (DNN), it usually requires a very high +computational cost to approximate relatively accurate Shapley values in +real-world applications. Therefore, we propose a novel network architecture, +the HarsanyiNet, which makes inferences on the input sample and simultaneously +computes the exact Shapley values of the input variables in a single forward +propagation. The HarsanyiNet is designed on the theoretical foundation that the +Shapley value can be reformulated as the redistribution of Harsanyi +interactions encoded by the network. + +
+
+
+
+
+ + ♻ ☆ On the Identifiability of Switching Dynamical Systems + + +
+ In the realm of interpretability and out-of-distribution generalisation, the +identifiability of latent variable models has emerged as a captivating field of +inquiry. In this work, we delve into the identifiability of Switching Dynamical +Systems, taking an initial stride toward extending identifiability analysis to +sequential latent variable models. We first prove the identifiability of Markov +Switching Models, which commonly serve as the prior distribution for the +continuous latent variables in Switching Dynamical Systems. We present +identification conditions for first-order Markov dependency structures, whose +transition distribution is parametrised via non-linear Gaussians. We then +establish the identifiability of the latent variables and non-linear mappings +in Switching Dynamical Systems up to affine transformations, by leveraging +identifiability analysis techniques from identifiable deep latent variable +models. We finally develop estimation algorithms for identifiable Switching +Dynamical Systems. Throughout empirical studies, we demonstrate the +practicality of identifiable Switching Dynamical Systems for segmenting +high-dimensional time series such as videos, and showcase the use of +identifiable Markov Switching Models for regime-dependent causal discovery in +climate data. + +
+
+
+
+
+ + ♻ ☆ RaLiBEV: Radar and LiDAR BEV Fusion Learning for Anchor Box Free Object + Detection Systems + + +
+ In autonomous driving, LiDAR and radar play important roles in the perception +of the surrounding environment. LiDAR provides accurate 3D spatial sensing +information but cannot work in adverse weather like fog. On the other hand, the +radar signal can be diffracted when encountering raindrops or mist particles +thanks to its wavelength, but it suffers from large noise. Recent +state-of-the-art works reveal that fusion of radar and LiDAR can lead to robust +detection in adverse weather. The existing works adopt convolutional neural +network architecture to extract features from each sensor data, then align and +aggregate the two branch features to predict object detection results. However, +these methods have low accuracy of bounding box estimations due to a simple +design of label assignment and fusion strategies. In this paper, we propose a +bird's-eye view fusion learning-based anchor box-free object detection system, +which fuses the feature derived from the radar range-azimuth heatmap and the +LiDAR point cloud to estimate possible objects. Different label assignment +strategies have been designed to facilitate the consistency between the +classification of foreground or background anchor points and the corresponding +bounding box regressions. Furthermore, the performance of the proposed object +detector is further enhanced by employing a novel interactive transformer +module. The superior performance of the methods proposed in this paper has been +demonstrated using the recently published Oxford Radar RobotCar dataset. Our +system's average precision significantly outperforms the state-of-the-art +method by 13.1% and 19.0% at IoU of 0.8 under 'Clear+Foggy' training conditions +for 'Clear' and 'Foggy' testing, respectively. + +
+
+ comment: 14 pages, 5 figures +
+
+
+
+
+ + ♻ ☆ Diffusion Models for Wireless Communications + + +
+ Innovative foundation models, such as GPT-4 and stable diffusion models, have +made a paradigm shift in the realm of artificial intelligence (AI) towards +generative AI-based systems. AI and machine learning (AI/ML) algorithms are +envisioned to be pervasively incorporated into future wireless +communication systems. In this article, we outline the applications of +diffusion models, a new family of probabilistic generative models that have +showcased state-of-the-art performance, in wireless communication systems. +The key idea is to decompose the data generation process into +"denoising" steps, gradually generating samples out of noise. Based on two case +studies presented, we show how diffusion models can be employed for the +development of resilient AI-native communication systems. Specifically, we +propose denoising diffusion probabilistic models (DDPM) for a wireless +communication scheme with non-ideal transceivers, where 30% improvement is +achieved in terms of bit error rate. In the other example, DDPM is employed at +the transmitter to shape the constellation symbols, highlighting a robust +out-of-distribution performance. + +
+
+
+
+
+ + ♻ ☆ Optimal Stopping via Randomized Neural Networks + + +
+ This paper presents the benefits of using randomized neural networks instead +of standard basis functions or deep neural networks to approximate the +solutions of optimal stopping problems. The key idea is to use neural networks, +where the parameters of the hidden layers are generated randomly and only the +last layer is trained, in order to approximate the continuation value. Our +approaches are applicable to high dimensional problems where the existing +approaches become increasingly impractical. In addition, since our approaches +can be optimized using simple linear regression, they are easy to implement and +theoretical guarantees can be provided. We test our approaches for American +option pricing on Black--Scholes, Heston and rough Heston models and for +optimally stopping a fractional Brownian motion. In all cases, our algorithms +outperform the state-of-the-art and other relevant machine learning approaches +in terms of computation time while achieving comparable results. Moreover, we +show that they can also be used to efficiently compute Greeks of American +options. + +
+
+
+
+
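A sketch of the core building block in a toy Bermudan-put setting: a randomized network whose hidden weights are drawn at random and frozen, with only the last layer fitted by linear least squares to regress a continuation value; the full backward-induction pricer described above is omitted:

```python
# Core building block: a randomized neural network whose hidden weights are
# random and frozen, with only the last layer fitted by linear least squares.
# Here it regresses a toy continuation value at one exercise date (discounted
# next-step put payoff under geometric Brownian motion); the full
# backward-induction optimal-stopping algorithm is not reproduced.
import numpy as np

rng = np.random.default_rng(0)
n_paths, strike, r, sigma, dt = 10000, 1.0, 0.05, 0.2, 0.5

S_t  = rng.lognormal(mean=-0.02, sigma=0.2, size=n_paths)            # prices today
S_t1 = S_t * np.exp((r - 0.5 * sigma**2) * dt
                    + sigma * np.sqrt(dt) * rng.normal(size=n_paths))
target = np.exp(-r * dt) * np.maximum(strike - S_t1, 0.0)            # discounted payoff

# Randomized network: random frozen hidden layer + trained linear readout.
width = 200
W = rng.normal(size=(1, width))
b = rng.normal(size=width)
H = np.tanh(S_t[:, None] @ W + b)                                    # hidden features
H = np.hstack([H, np.ones((n_paths, 1))])                            # readout bias
beta, *_ = np.linalg.lstsq(H, target, rcond=None)                    # linear regression

continuation = H @ beta
exercise_now = np.maximum(strike - S_t, 0.0) > continuation          # stopping rule
print("fraction of paths exercised now:", exercise_now.mean())
```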
+ + ♻ ☆ Duet: efficient and scalable hybriD neUral rElation undersTanding + + +
+ Learned cardinality estimation methods have achieved high precision compared +to traditional methods. Among learned methods, query-driven approaches have +faced the workload drift problem for a long time. Although both data-driven and +hybrid methods are proposed to avoid this problem, most of them suffer from +high training and estimation costs, limited scalability, instability, and +long-tail distribution problems on high-dimensional tables, which seriously +affects the practical application of learned cardinality estimators. In this +paper, we prove that most of these problems are directly caused by the widely +used progressive sampling. We solve this problem by introducing predicate +information into the autoregressive model and propose Duet, a stable, +efficient, and scalable hybrid method to estimate cardinality directly without +sampling or any non-differentiable process, which can not only reduce the +inference complexity from $O(n)$ to $O(1)$ compared to Naru and UAE but also +achieve higher accuracy on high cardinality and high-dimensional tables. +Experimental results show that Duet can achieve all the design goals above and +be much more practical. Besides, Duet even has a lower inference cost on CPU +than that of most learned methods on GPU. + +
+
+
+
+
+ + ♻ ☆ Symplectic Structure-Aware Hamiltonian (Graph) Embeddings + + +
+ In traditional Graph Neural Networks (GNNs), the assumption of a fixed +embedding manifold often limits their adaptability to diverse graph geometries. +Recently, Hamiltonian system-inspired GNNs have been proposed to address the +dynamic nature of such embeddings by incorporating physical laws into node +feature updates. We present Symplectic Structure-Aware Hamiltonian GNN +(SAH-GNN), a novel approach that generalizes Hamiltonian dynamics for more +flexible node feature updates. Unlike existing Hamiltonian approaches, SAH-GNN +employs Riemannian optimization on the symplectic Stiefel manifold to +adaptively learn the underlying symplectic structure, circumventing the +limitations of existing Hamiltonian GNNs that rely on a pre-defined form of +standard symplectic structure. This innovation allows SAH-GNN to automatically +adapt to various graph datasets without extensive hyperparameter tuning. +Moreover, it conserves energy during training meaning the implicit Hamiltonian +system is physically meaningful. Finally, we empirically validate SAH-GNN's +superiority and adaptability in node classification tasks across multiple types +of graph datasets. + +
+
+ comment: 5 pages main content with 5 pages appendix +
+
+
+
+
+ + ♻ ☆ VMAF Re-implementation on PyTorch: Some Experimental Results + + +
+ Based on the standard VMAF implementation, we propose an implementation of +VMAF using the PyTorch framework. For this implementation, comparisons with the +standard (libvmaf) show a discrepancy of $\lesssim 10^{-2}$ in VMAF units. We +investigate gradient computation when using VMAF as an objective function and +demonstrate that training using this function does not result in ill-behaved +gradients. The implementation is then used to train a preprocessing filter. It +is demonstrated that its performance is superior to the unsharp masking filter. +The resulting filter is also easy to implement and can be applied in +video processing tasks to improve video compression. This is confirmed by +the results of numerical experiments. + +
+
+ comment: 6 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ Generating high-quality 3DMPCs by adaptive data acquisition and + NeREF-based radiometric calibration with UGV plant phenotyping system + + +
+ Fusion of 3D and MS imaging data has great potential for high-throughput +plant phenotyping of structural and biochemical as well as physiological traits +simultaneously, which is important for decision support in agriculture and for +crop breeders in selecting the best genotypes. However, the lack of 3D data +integrity for various plant canopy structures and the low quality of MS images +caused by complex illumination effects pose a great challenge, especially +at the proximal imaging scale. Therefore, this study proposed a novel approach +for adaptive data acquisition and radiometric calibration to generate +high-quality 3DMPCs of plants. An efficient NBV planning method based on a UGV +plant phenotyping system with a multi-sensor-equipped robotic arm was proposed +to achieve adaptive data acquisition. The NeREF was employed to predict the DN +values of the hemispherical reference for radiometric calibration. For NBV +planning, the average total time for a single plant at a joint speed of 1.55 +rad/s was about 62.8 s, with an average reduction of 18.0% compared to the +unplanned acquisition. The integrity of the whole-plant data was improved by an average of +23.6% compared to the fixed viewpoints alone. Compared with the ASD +measurements, the RMSE of the reflectance spectra obtained from 3DMPCs at +different regions of interest was 0.08, with an average decrease of 58.93% +compared to the results obtained from single-frame MS images without 3D +radiometric calibration. The 3D-calibrated plant 3DMPCs improved the predictive +accuracy of PLSR for chlorophyll content, with an average increase of 0.07 in +R2 and an average decrease of 21.25% in RMSE. Our approach introduced a fresh +perspective on generating high-quality 3DMPCs of plants under natural light +conditions, enabling more precise analysis of plant morphological and +physiological parameters. + +
+
+
+
+
+ + ♻ ☆ Decision Tree Psychological Risk Assessment in Currency Trading + + +
+ This research paper focuses on the integration of Artificial Intelligence +(AI) into the currency trading landscape, positing the development of +personalized AI models, essentially functioning as intelligent personal +assistants tailored to the idiosyncrasies of individual traders. The paper +posits that AI models are capable of identifying nuanced patterns within the +trader's historical data, facilitating a more accurate and insightful +assessment of psychological risk dynamics in currency trading. The PRI is a +dynamic metric that experiences fluctuations in response to market conditions +that foster psychological fragility among traders. By employing sophisticated +techniques, a classifying decision tree is crafted, enabling clearer +decision-making boundaries within the tree structure. By incorporating the +user's chronological trade entries, the model becomes adept at identifying +critical junctures when psychological risks are heightened. The real-time +nature of the calculations enhances the model's utility as a proactive tool, +offering timely alerts to traders about impending moments of psychological +risks. The implications of this research extend beyond the confines of currency +trading, reaching into the realms of other industries where the judicious +application of personalized modeling emerges as an efficient and strategic +approach. This paper positions itself at the intersection of cutting-edge +technology and the intricate nuances of human psychology, offering a +transformative paradigm for decision making support in dynamic and +high-pressure environments. + +
+
+ comment: 8 pages, 3 figures, 7 listings +
+
+
+
+
+ + ♻ ☆ ChebNet: Efficient and Stable Constructions of Deep Neural Networks with + Rectified Power Units via Chebyshev Approximations + + +
+ In a previous study [B. Li, S. Tang and H. Yu, Commun. Comput. Phy. +27(2):379-411, 2020], it is shown that deep neural networks built with +rectified power units (RePU) as activation functions can give better +approximations for sufficiently smooth functions than those built with rectified +linear units, by converting polynomial approximations using power series into +deep neural networks with optimal complexity and no approximation error. +However, in practice, power series approximations are not easy to obtain due to +the associated stability issue. In this paper, we propose a new and more stable +way to construct RePU deep neural networks based on Chebyshev polynomial +approximations. By using a hierarchical structure of Chebyshev polynomial +approximation in frequency domain, we obtain efficient and stable deep neural +network construction, which we call ChebNet. The approximation of smooth +functions by ChebNets is no worse than the approximation by deep RePU nets +using power series. At the same time, ChebNets are much more stable. Numerical +results show that the constructed ChebNets can be further fine-tuned to obtain +much better results than those obtained by tuning deep RePU nets constructed by +the power series approach. As spectral accuracy is hard to obtain by direct +training of deep neural networks, ChebNets provide a practical way to obtain +spectral accuracy and are expected to be useful in real applications that +require efficient approximations of smooth functions. + +
+
+ comment: 6 figures, 3 tables, to appear on Communications in Mathematics and + Statistics +
+
+
+
+
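A small numpy illustration of why the Chebyshev basis is numerically preferable to a raw power series at high degree, via the conditioning of the two design matrices; the conversion of the approximation into a RePU network is not shown:

```python
# Why the Chebyshev basis is preferable to a raw power series at high degree:
# the least-squares design matrix in the monomial basis is exponentially
# ill-conditioned, while the Chebyshev one stays well conditioned, so
# high-degree fits of smooth functions remain numerically stable.
import numpy as np
from numpy.polynomial import chebyshev as C

deg, m = 40, 400
k = np.arange(m)
x = np.cos(np.pi * (k + 0.5) / m)                 # Chebyshev sampling points on [-1, 1]
f = np.exp(np.sin(3 * x))                         # a smooth target function

V_cheb = C.chebvander(x, deg)                     # design matrix, Chebyshev basis
V_mono = np.vander(x, deg + 1, increasing=True)   # design matrix, monomial basis
print("condition numbers:  Chebyshev", f"{np.linalg.cond(V_cheb):.1e}",
      " monomial", f"{np.linalg.cond(V_mono):.1e}")

coef = C.chebfit(x, f, deg)                       # stable high-degree Chebyshev fit
print("max fit error (Chebyshev):", np.max(np.abs(C.chebval(x, coef) - f)))
```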
+ + ♻ ☆ Phylo2Vec: a vector representation for binary trees + + +
+ Binary phylogenetic trees inferred from biological data are central to +understanding the shared evolutionary history of organisms. Inferring the +placement of latent nodes in a tree by any optimality criterion (e.g., maximum +likelihood) is an NP-hard problem, propelling the development of myriad +heuristic approaches. Yet, these heuristics often lack a systematic means of +uniformly sampling random trees or effectively exploring a tree space that +grows factorially, which are crucial to optimisation problems such as machine +learning. Accordingly, we present Phylo2Vec, a new parsimonious representation +of a phylogenetic tree. Phylo2Vec maps any binary tree with $n$ leaves to an +integer vector of length $n$. We prove that Phylo2Vec is both well-defined and +bijective to the space of phylogenetic trees. The advantages of Phylo2Vec are +twofold: i) easy uniform sampling of binary trees and ii) systematic ability to +traverse tree space in very large or small jumps. As a proof of concept, we use +Phylo2Vec for maximum likelihood inference on five real-world datasets and show +that a simple hill climbing-based optimisation efficiently traverses the +vastness of tree space from a random to an optimal tree. + +
+
+ comment: 35 pages, 8 figures, 2 tables, 2 supplementary figures +
+
+
+
+
+ + ♻ ☆ Harnessing machine learning for accurate treatment of overlapping + opacity species in GCMs + + +
+ To understand high precision observations of exoplanets and brown dwarfs, we +need detailed and complex general circulation models (GCMs) that incorporate +hydrodynamics, chemistry, and radiation. In this study, we specifically examine +the coupling between chemistry and radiation in GCMs and compare different +methods for mixing opacities of different chemical species in the correlated-k +assumption, when equilibrium chemistry cannot be assumed. We propose a fast +machine learning method based on DeepSets (DS), which effectively combines +individual correlated-k opacities (k-tables). We evaluate the DS method +alongside other published methods like adaptive equivalent extinction (AEE) and +random overlap with rebinning and resorting (RORR). We integrate these mixing +methods into our GCM (expeRT/MITgcm) and assess their accuracy and performance +for the example of the hot Jupiter HD~209458 b. Our findings indicate that the +DS method is both accurate and efficient for GCM usage, whereas RORR is too +slow. Additionally, we observe that the accuracy of AEE depends on its specific +implementation and may introduce numerical issues in achieving radiative +transfer solution convergence. We then apply the DS mixing method in a +simplified chemical disequilibrium situation, where we model the rainout of TiO +and VO, and confirm that the rainout of TiO and VO would hinder the formation +of a stratosphere. To further expedite the development of consistent +disequilibrium chemistry calculations in GCMs, we provide documentation and +code for coupling the DS mixing method with correlated-k radiative transfer +solvers. The DS method has been extensively tested to be accurate enough for +GCMs, however, other methods might be needed for accelerating atmospheric +retrievals. + +
+
+ comment: Recommended for publication in A&A, following minor revisions, + version sent to referee +
+
+
+
+
+ + ♻ ☆ Understanding Forward Process of Convolutional Neural Network + + +
+ This paper reveals the selective rotation in CNNs' forward processing. It +elucidates the activation function as a discerning mechanism that unifies and +quantizes the rotational aspects of the input data. Experiments show how this +methodology reflects the process by which the network distinguishes inputs based on +statistical indicators, which can be comprehended or analyzed by applying +structured mathematical tools. Our findings also unveil the consistency between +artificial neural networks and the human brain in their data processing +patterns. + +
+
+ comment: something wrong in this paper +
+
+
+
+
+ + ♻ ☆ Denoising Heat-inspired Diffusion with Insulators for Collision Free + Motion Planning + + +
+ Diffusion models have risen as a powerful tool in robotics due to their +flexibility and multi-modality. While some of these methods effectively address +complex problems, they often depend heavily on inference-time obstacle +detection and require additional equipment. Addressing these challenges, we +present a method that, during inference time, simultaneously generates only +reachable goals and plans motions that avoid obstacles, all from a single +visual input. Central to our approach is the novel use of a collision-avoiding +diffusion kernel for training. Through evaluations against behavior-cloning and +classical diffusion models, our framework has proven its robustness. It is +particularly effective in multi-modal environments, navigating toward goals and +avoiding unreachable ones blocked by obstacles, while ensuring collision +avoidance. + +
+
+ comment: 9 pages, 6 figures +
+
+
+
+
+ + ♻ ☆ Automatic Diagnosis of Myocarditis Disease in Cardiac MRI Modality using + Deep Transformers and Explainable Artificial Intelligence + + +
+ Myocarditis is a significant cardiovascular disease (CVD) that poses a threat +to the health of many individuals by causing damage to the myocardium. The +occurrence of microbes and viruses, including the likes of HIV, plays a crucial +role in the development of myocarditis disease (MCD). The images produced +during cardiac magnetic resonance imaging (CMRI) scans have low contrast, which +can make it challenging to diagnose cardiovascular diseases. On the other hand, +checking numerous CMRI slices for each CVD patient can be a challenging task +for medical doctors. To overcome the existing challenges, researchers have +suggested the use of artificial intelligence (AI)-based computer-aided +diagnosis systems (CADS). The presented paper outlines a CADS for the detection +of MCD from CMR images, utilizing deep learning (DL) methods. The proposed CADS +consists of several steps, including dataset, preprocessing, feature +extraction, classification, and post-processing. First, the Z-Alizadeh dataset +was selected for the experiments. Subsequently, the CMR images underwent +various preprocessing steps, including denoising, resizing, as well as data +augmentation (DA) via CutMix and MixUp techniques. Next, the most +recent pre-trained deep models and transformer models are used for feature extraction +and classification on the CMR images. The findings of our study reveal that +transformer models exhibit superior performance in detecting MCD as opposed to +pre-trained architectures. In terms of DL architectures, the Turbulence Neural +Transformer (TNT) model exhibited impressive accuracy, reaching 99.73% +utilizing a 10-fold cross-validation approach. Additionally, to pinpoint areas +of suspicion for MCD in CMRI images, the Explainable-based Grad Cam method was +employed. + +
+
+
+
+
+ + ♻ ☆ Constructing Custom Thermodynamics Using Deep Learning + + +
+ One of the most exciting applications of artificial intelligence (AI) is +automated scientific discovery based on previously amassed data, coupled with +restrictions provided by known physical principles, including symmetries and +conservation laws. Such automated hypothesis creation and verification can +assist scientists in studying complex phenomena, where traditional physical +intuition may fail. Here we develop a platform based on a generalized Onsager +principle to learn macroscopic dynamical descriptions of arbitrary stochastic +dissipative systems directly from observations of their microscopic +trajectories. Our method simultaneously constructs reduced thermodynamic +coordinates and interprets the dynamics on these coordinates. We demonstrate +its effectiveness by studying theoretically and validating experimentally the +stretching of long polymer chains in an externally applied field. Specifically, +we learn three interpretable thermodynamic coordinates and build a dynamical +landscape of polymer stretching, including the identification of stable and +transition states and the control of the stretching rate. Our general +methodology can be used to address a wide range of scientific and technological +applications. + +
+
+
+
+
+ + ♻ ☆ Stability-Informed Initialization of Neural Ordinary Differential + Equations + + +
+ This paper addresses the training of Neural Ordinary Differential Equations +(neural ODEs), and in particular explores the interplay between numerical +integration techniques, stability regions, step size, and initialization +techniques. It is shown how the choice of integration technique implicitly +regularizes the learned model, and how the solver's corresponding stability +region affects training and prediction performance. From this analysis, a +stability-informed parameter initialization technique is introduced. The +effectiveness of the initialization method is displayed across several learning +benchmarks and industrial applications. + +
+
+
+
+
+ + ♻ ☆ GeoPhy: Differentiable Phylogenetic Inference via Geometric Gradients of + Tree Topologies NeurIPS 2023 + + +
+ Phylogenetic inference, grounded in molecular evolution models, is essential +for understanding the evolutionary relationships in biological data. Accounting +for the uncertainty of phylogenetic tree variables, which include tree +topologies and evolutionary distances on branches, is crucial for accurately +inferring species relationships from molecular data and tasks requiring +variable marginalization. Variational Bayesian methods are key to developing +scalable, practical models; however, it remains challenging to conduct +phylogenetic inference without restricting the combinatorially vast number of +possible tree topologies. In this work, we introduce a novel, fully +differentiable formulation of phylogenetic inference that leverages a unique +representation of topological distributions in continuous geometric spaces. +Through practical considerations on design spaces and control variates for +gradient estimations, our approach, GeoPhy, enables variational inference +without limiting the topological candidates. In experiments using real +benchmark datasets, GeoPhy significantly outperformed other approximate +Bayesian methods that considered whole topologies. + +
+
+ comment: NeurIPS 2023 camera ready +
+
+
+
+
+ + ♻ ☆ Hessian-Aware Bayesian Optimization for Decision Making Systems + + +
+ Many approaches for optimizing decision making systems rely on gradient based +methods requiring informative feedback from the environment. However, in the +case where such feedback is sparse or uninformative, such approaches may result +in poor performance. Derivative-free approaches such as Bayesian Optimization +mitigate the dependency on the quality of gradient feedback, but are known to +scale poorly in the high-dimension setting of complex decision making systems. +This problem is exacerbated if the system requires interactions between several +actors cooperating to accomplish a shared goal. To address the dimensionality +challenge, we propose a compact multi-layered architecture modeling the +dynamics of actor interactions through the concept of role. We introduce +Hessian-aware Bayesian Optimization to efficiently optimize the multi-layered +architecture parameterized by a large number of parameters, and give the first +improved regret bound in additive high-dimensional Bayesian Optimization since +Mutny & Krause (2018). Our approach shows strong empirical results under +malformed or sparse reward. + +
+
+ comment: Fixed a typo +
+
+
+
+
+ + ♻ ☆ QuantEase: Optimization-based Quantization for Language Models + + +
+ With the rising popularity of Large Language Models (LLMs), there has been an +increasing interest in compression techniques that enable their efficient +deployment. This study focuses on the Post-Training Quantization (PTQ) of LLMs. +Drawing from recent advances, our work introduces QuantEase, a layer-wise +quantization framework where individual layers undergo separate quantization. +The problem is framed as a discrete-structured non-convex optimization, +prompting the development of algorithms rooted in Coordinate Descent (CD) +techniques. These CD-based methods provide high-quality solutions to the +complex non-convex layer-wise quantization problems. Notably, our CD-based +approach features straightforward updates, relying solely on matrix and vector +operations, circumventing the need for matrix inversion or decomposition. We +also explore an outlier-aware variant of our approach, allowing for retaining +significant weights (outliers) with complete precision. Our proposal attains +state-of-the-art performance in terms of perplexity and zero-shot accuracy in +empirical evaluations across various LLMs and datasets, with relative +improvements up to 15% over methods such as GPTQ. Leveraging careful linear +algebra optimizations, QuantEase can quantize models like Falcon-180B on a +single NVIDIA A100 GPU in $\sim$3 hours. Particularly noteworthy is our +outlier-aware algorithm's capability to achieve near or sub-3-bit quantization +of LLMs with an acceptable drop in accuracy, obviating the need for non-uniform +quantization or grouping techniques, improving upon methods such as SpQR by up +to two times in terms of perplexity. + +
+
+
+
+
+ + ♻ ☆ iTransformer: Inverted Transformers Are Effective for Time Series + Forecasting + + +
+ The recent boom of linear forecasting models questions the ongoing passion +for architectural modifications of Transformer-based forecasters. These +forecasters leverage Transformers to model the global dependencies over +temporal tokens of time series, with each token formed by multiple variates of +the same timestamp. However, Transformers are challenged in forecasting series +with larger lookback windows due to performance degradation and computation +explosion. Besides, the embedding for each temporal token fuses multiple +variates that represent potential delayed events and distinct physical +measurements, which may fail in learning variate-centric representations and +result in meaningless attention maps. In this work, we reflect on the competent +duties of Transformer components and repurpose the Transformer architecture +without any modification to the basic components. We propose iTransformer, which +simply applies the attention and feed-forward network on the inverted +dimensions. Specifically, the time points of individual series are embedded +into variate tokens which are utilized by the attention mechanism to capture +multivariate correlations; meanwhile, the feed-forward network is applied for +each variate token to learn nonlinear representations. The iTransformer model +achieves state-of-the-art results on challenging real-world datasets, which further +empowers the Transformer family with improved performance, generalization +ability across different variates, and better utilization of arbitrary lookback +windows, making it a strong alternative as the fundamental backbone of time +series forecasting. + +
+
+
+
+
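A compact PyTorch sketch of the inverted design described above: each variate's whole lookback window is embedded as one token, self-attention mixes information across variates, and a shared projection maps each token to its forecast; layer counts, norms, and sizes are simplified assumptions, not the authors' code:

```python
# Compact sketch of the inverted Transformer design: each variate's whole
# lookback window becomes one token, attention mixes information across
# variates, and a shared projection maps each variate token to its forecast.
import torch
import torch.nn as nn

class InvertedTransformerBlock(nn.Module):
    def __init__(self, lookback, horizon, d_model=64, n_heads=4):
        super().__init__()
        self.embed = nn.Linear(lookback, d_model)            # series -> variate token
        self.attn = nn.MultiheadAttention(d_model, n_heads, batch_first=True)
        self.norm1 = nn.LayerNorm(d_model)
        self.ffn = nn.Sequential(nn.Linear(d_model, 4 * d_model), nn.GELU(),
                                 nn.Linear(4 * d_model, d_model))
        self.norm2 = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, horizon)               # token -> forecast

    def forward(self, x):                  # x: (batch, lookback, n_variates)
        tokens = self.embed(x.transpose(1, 2))                # (batch, n_variates, d_model)
        a, _ = self.attn(tokens, tokens, tokens)              # attention across variates
        tokens = self.norm1(tokens + a)
        tokens = self.norm2(tokens + self.ffn(tokens))        # per-token feed-forward
        return self.head(tokens).transpose(1, 2)              # (batch, horizon, n_variates)

model = InvertedTransformerBlock(lookback=96, horizon=24)
y = model(torch.randn(8, 96, 7))
print(y.shape)   # torch.Size([8, 24, 7])
```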
+ + ♻ ☆ Token-Level Adaptation of LoRA Adapters for Downstream Task + Generalization + + +
+ This paper introduces a method for adapting LoRA adapters in smaller-sized +language models to arbitrary downstream tasks. Unlike standard +mixture-of-expert architectures, our method employs a gradient-free routing +function to choose a weighted combination of experts without increasing the +compute requirements for training or inference. The results show that +token-level adaptation of LoRA adapters outperforms the base Llama-2-7b model +across mathematical (GSM8K), scientific (ARC-Challenge), reading comprehension +(SQuAD), and coding (CodeAlpaca-20k) tasks. Further evaluations also show that +the average performance of token-level adaptation outperforms individual models +fine-tuned for each of the tasks with the best performance observed in +adaptation of every-other token during inference. The code for this study is +made available through a public repository. + +
+
+
+
+
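A sketch of token-level mixing of LoRA experts; the similarity-based routing function and the shapes below are illustrative assumptions rather than the paper's exact router:

```python
# Sketch of token-level mixing of LoRA experts: each expert contributes a
# low-rank update B_e @ A_e, and a gradient-free routing function assigns each
# token a weight over experts, so the effective update differs per token.
import torch

torch.manual_seed(0)
d, r, n_experts, seq_len = 32, 4, 3, 10

W0 = torch.randn(d, d)                                   # frozen base weight
A = torch.randn(n_experts, r, d) * 0.02                  # LoRA "A" matrices
B = torch.randn(n_experts, d, r) * 0.02                  # LoRA "B" matrices
expert_keys = torch.randn(n_experts, d)                  # one key vector per expert

def route(h):
    # Gradient-free routing: softmax over cosine similarity to expert keys
    # (an assumed stand-in for the paper's routing function).
    sims = torch.nn.functional.cosine_similarity(
        h.unsqueeze(1), expert_keys.unsqueeze(0), dim=-1)    # (seq_len, n_experts)
    return torch.softmax(sims / 0.1, dim=-1)

def token_level_lora(h):
    weights = route(h)                                    # (seq_len, n_experts)
    delta = torch.einsum("edr,erk->edk", B, A)            # per-expert low-rank updates
    per_token_delta = torch.einsum("te,edk->tdk", weights, delta)
    base = h @ W0.T
    adapted = torch.einsum("tdk,tk->td", per_token_delta, h)
    return base + adapted

h = torch.randn(seq_len, d)
print(token_level_lora(h).shape)    # torch.Size([10, 32])
```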
+ + ♻ ☆ Bayesian Learning with Information Gain Provably Bounds Risk for a + Robust Adversarial Defense ICML 2022 + + +
+ We present a new algorithm to learn a deep neural network model robust +against adversarial attacks. Previous algorithms demonstrate that an adversarially +trained Bayesian Neural Network (BNN) provides improved robustness. We +recognize that the adversarial learning approach for approximating the multi-modal +posterior distribution of a Bayesian model can lead to mode collapse; +consequently, the model's achievements in robustness and performance are +sub-optimal. Instead, we first propose preventing mode collapse to better +approximate the multi-modal posterior distribution. Second, based on the +intuition that a robust model should ignore perturbations and only consider the +informative content of the input, we conceptualize and formulate an information +gain objective to measure and force the information learned from both benign +and adversarial training instances to be similar. Importantly, we prove and +demonstrate that minimizing the information gain objective allows the +adversarial risk to approach the conventional empirical risk. We believe our +efforts provide a step toward a basis for a principled method of adversarially +training BNNs. Our model demonstrates significantly improved robustness--up to +20%--compared with adversarial training and Adv-BNN under PGD attacks with +0.035 distortion on both CIFAR-10 and STL-10 datasets. + +
+
+ comment: Published at ICML 2022. Code is available at + https://github.com/baogiadoan/IG-BNN +
+
+
+
+
+ + ♻ ☆ Beta Diffusion NeurIPS 2023 + + +
+ We introduce beta diffusion, a novel generative modeling method that +integrates demasking and denoising to generate data within bounded ranges. +Using scaled and shifted beta distributions, beta diffusion utilizes +multiplicative transitions over time to create both forward and reverse +diffusion processes, maintaining beta distributions in both the forward +marginals and the reverse conditionals, given the data at any point in time. +Unlike traditional diffusion-based generative models relying on additive +Gaussian noise and reweighted evidence lower bounds (ELBOs), beta diffusion is +multiplicative and optimized with KL-divergence upper bounds (KLUBs) derived +from the convexity of the KL divergence. We demonstrate that the proposed KLUBs +are more effective for optimizing beta diffusion compared to negative ELBOs, +which can also be derived as the KLUBs of the same KL divergence with its two +arguments swapped. The loss function of beta diffusion, expressed in terms of +Bregman divergence, further supports the efficacy of KLUBs for optimization. +Experimental results on both synthetic data and natural images demonstrate the +unique capabilities of beta diffusion in generative modeling of range-bounded +data and validate the effectiveness of KLUBs in optimizing diffusion models, +thereby making them valuable additions to the family of diffusion-based +generative models and the optimization techniques used to train them. + +
+
+ comment: NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ Fast Controllable Diffusion Models for Undersampled MRI Reconstruction + + +
+ Supervised deep learning methods have shown promise in undersampled Magnetic +Resonance Imaging (MRI) reconstruction, but their requirement for paired data +limits their generalizability to the diverse MRI acquisition parameters. +Recently, unsupervised controllable generative diffusion models have been +applied to undersampled MRI reconstruction, without paired data or model +retraining for different MRI acquisitions. However, diffusion models are +generally slow in sampling and state-of-the-art acceleration techniques can +lead to sub-optimal results when directly applied to the controllable +generation process. This study introduces a new algorithm called +Predictor-Projector-Noisor (PPN), which enhances and accelerates controllable +generation of diffusion models for undersampled MRI reconstruction. Our results +demonstrate that PPN produces high-fidelity MR images that conform to +undersampled k-space measurements with significantly shorter reconstruction +time than other controllable sampling methods. In addition, the unsupervised +PPN accelerated diffusion models are adaptable to different MRI acquisition +parameters, making them more practical for clinical use than supervised +learning techniques. + +
+
+
+
+
+ + ♻ ☆ UAV-assisted Semantic Communication with Hybrid Action Reinforcement + Learning + + +
+ In this paper, we aim to explore the use of uplink semantic communications +with the assistance of a UAV in order to improve data collection efficiency for +metaverse users in remote areas. To reduce the time for uplink data collection +while balancing the trade-off between reconstruction quality and computational +energy cost, we propose a hybrid action reinforcement learning (RL) framework +to make decisions on semantic model scale, channel allocation, transmission +power, and UAV trajectory. The variables are classified into discrete and +continuous types, which are optimized by two different RL agents to generate the +combined action. Simulation results indicate that the proposed hybrid action +reinforcement learning framework can effectively improve the efficiency of +uplink semantic data collection under different parameter settings and +outperforms the benchmark scenarios. + +
+
+ comment: This paper appears in IEEE Global Communications Conference + (GLOBECOM) 2023 +
+
+
+
+
+ + ♻ ☆ Neurological Prognostication of Post-Cardiac-Arrest Coma Patients Using + EEG Data: A Dynamic Survival Analysis Framework with Competing Risks + + +
+ Patients resuscitated from cardiac arrest who enter a coma are at high risk +of death. Forecasting neurological outcomes of these patients (the task of +neurological prognostication) could help with treatment decisions. In this +paper, we propose, to the best of our knowledge, the first dynamic framework +for neurological prognostication of post-cardiac-arrest comatose patients using +EEG data: our framework makes predictions for a patient over time as more EEG +data become available, and different training patients' available EEG time +series could vary in length. Predictions are phrased in terms of either +time-to-event outcomes (time-to-awakening or time-to-death) or as the patient's +probability of awakening or of dying across multiple time horizons. Our +framework uses any dynamic survival analysis model that supports competing +risks in the form of estimating patient-level cumulative incidence functions. +We consider three competing risks as to what happens first to a patient: +awakening, being withdrawn from life-sustaining therapies (and thus +deterministically dying), or dying (by other causes). We demonstrate our +framework by benchmarking three existing dynamic survival analysis models that +support competing risks on a real dataset of 922 patients. Our main +experimental findings are that: (1) the classical Fine and Gray model which +only uses a patient's static features and summary statistics from the patient's +latest hour's worth of EEG data is highly competitive, achieving accuracy +scores as high as the recently developed Dynamic-DeepHit model that uses +substantially more of the patient's EEG data; and (2) in an ablation study, we +show that our choice of modeling three competing risks results in a model that +is at least as accurate while learning more information than simpler models +(using two competing risks or a standard survival analysis setup with no +competing risks). + +
+
+ comment: Machine Learning for Healthcare conference (MLHC 2023); this draft + includes minor corrections (in explaining how Dynamic-DeepHit works) +
+
+
+
+
+ + ♻ ☆ Linear Oscillation: A Novel Activation Function for Vision Transformer + + +
+ Activation functions are the linchpins of deep learning, profoundly +influencing both the representational capacity and training dynamics of neural +networks. They shape not only the nature of representations but also optimize +convergence rates and enhance generalization potential. Appreciating this +critical role, we present the Linear Oscillation (LoC) activation function, +defined as $f(x) = x \times \sin(\alpha x + \beta)$. Distinct from conventional +activation functions which primarily introduce non-linearity, LoC seamlessly +blends linear trajectories with oscillatory deviations. The nomenclature +"Linear Oscillation" is a nod to its unique attribute of infusing linear +activations with harmonious oscillations, capturing the essence of the +"Importance of Confusion". This concept of "controlled confusion" within +network activations is posited to foster more robust learning, particularly in +contexts that necessitate discerning subtle patterns. Our empirical studies +reveal that, when integrated into diverse neural architectures, the LoC +activation function consistently outperforms established counterparts like ReLU +and Sigmoid. The stellar performance exhibited by the avant-garde Vision +Transformer model using LoC further validates its efficacy. This study +illuminates the remarkable benefits of the LoC over other prominent activation +functions. It champions the notion that intermittently introducing deliberate +complexity or "confusion" during training can spur more profound and nuanced +learning. This accentuates the pivotal role of judiciously selected activation +functions in shaping the future of neural network training. + +
+
+ comment: Incomplete paper +
+
+
+
+
+ + ♻ ☆ G-NM: A Group of Numerical Time Series Prediction Models + + +
+ In this study, we focus on the development and implementation of a +comprehensive ensemble of numerical time series forecasting models, +collectively referred to as the Group of Numerical Time Series Prediction Model +(G-NM). This inclusive set comprises traditional models such as Autoregressive +Integrated Moving Average (ARIMA), Holt-Winters' method, and Support Vector +Regression (SVR), in addition to modern neural network models including +Recurrent Neural Network (RNN) and Long Short-Term Memory (LSTM). G-NM is +explicitly constructed to augment our predictive capabilities related to +patterns and trends inherent in complex natural phenomena. By utilizing time +series data relevant to these events, G-NM facilitates the prediction of such +phenomena over extended periods. The primary objective of this research is to +both advance our understanding of such occurrences and to significantly enhance +the accuracy of our forecasts. G-NM encapsulates both linear and non-linear +dependencies, seasonalities, and trends present in time series data. Each of +these models contributes distinct strengths, from ARIMA's resilience in +handling linear trends and seasonality, SVR's proficiency in capturing +non-linear patterns, to LSTM's adaptability in modeling various components of +time series data. Through the exploitation of the G-NM potential, we strive to +advance the state-of-the-art in large-scale time series forecasting models. We +anticipate that this research will represent a significant stepping stone in +our ongoing endeavor to comprehend and forecast the complex events that +constitute the natural world. + +
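+ As a rough illustration of the ensemble idea (not the actual G-NM combination scheme, which is not specified above), the sketch below averages an ARIMA forecast with a lag-feature SVR forecast; the model orders, lag count, and equal weighting are placeholder choices.

import numpy as np
from statsmodels.tsa.arima.model import ARIMA
from sklearn.svm import SVR

def ensemble_forecast(y, horizon=12, n_lags=6):
    y = np.asarray(y, dtype=float)
    # Component 1: ARIMA, strong on linear trend structure.
    arima_fc = np.asarray(ARIMA(y, order=(1, 1, 1)).fit().forecast(steps=horizon))
    # Component 2: SVR on lag features, aimed at non-linear patterns.
    X = np.array([y[i - n_lags:i] for i in range(n_lags, len(y))])
    svr = SVR(kernel="rbf").fit(X, y[n_lags:])
    history, svr_fc = list(y), []
    for _ in range(horizon):                      # recursive multi-step forecast
        nxt = float(svr.predict(np.array(history[-n_lags:]).reshape(1, -1))[0])
        svr_fc.append(nxt)
        history.append(nxt)
    return (arima_fc + np.asarray(svr_fc)) / 2.0  # naive equal-weight combination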
+
+ comment: Incomplete paper +
+
+
+
+
+ + ♻ ☆ Stable Adam Optimization for 16-bit Neural Networks Training + + +
+ In this research, we address critical concerns related to the numerical +instability observed in 16-bit computations of machine learning models. Such +instability, particularly when employing popular optimization algorithms like +Adam, often leads to unstable training of deep neural networks. This not only +disrupts the learning process but also poses significant challenges in +deploying dependable models in real-world applications. Our investigation +identifies the epsilon hyperparameter as the primary source of this +instability. A nuanced exploration reveals that subtle adjustments to epsilon +within 16-bit computations can enhance the numerical stability of Adam, +enabling more stable training of 16-bit neural networks. We propose a novel, +dependable approach that leverages updates from the Adam optimizer to bolster +the stability of the learning process. Our contributions provide deeper +insights into optimization challenges in low-precision computations and offer +solutions to ensure the stability of deep neural network training, paving the +way for their dependable use in various applications. + +
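+ A tiny reproduction of the failure mode and fix discussed above: in float16 the default Adam eps of 1e-8 underflows to zero (the smallest fp16 subnormal is about 6e-8), so the update denominator can collapse; raising eps keeps it representable. The value 1e-4 below is illustrative, not necessarily the one used in the paper.

import torch

w = torch.zeros(10, dtype=torch.float16, requires_grad=True)
opt = torch.optim.Adam([w], lr=1e-2, eps=1e-4)   # larger eps for fp16 stability

for _ in range(10):                               # toy objective: pull w toward 1
    loss = (w - 1.0).pow(2).sum()
    opt.zero_grad()
    loss.backward()
    opt.step()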
+
+ comment: Paper with incomplete contents +
+
+
+
+
+ + ♻ ☆ Continuous 16-bit Training: Accelerating 32-bit Pre-Trained Neural + Networks + + +
+ In the field of deep learning, the prevalence of models initially trained +with 32-bit precision is a testament to its robustness and accuracy. However, +the continuous evolution of these models often demands further training, which +can be resource-intensive. This study introduces a novel approach where we +continue the training of these pre-existing 32-bit models using 16-bit +precision. This technique not only caters to the need for efficiency in +computational resources but also significantly improves the speed of additional +training phases. By adopting 16-bit precision for ongoing training, we are able +to substantially decrease memory requirements and computational burden, thereby +accelerating the training process in a resource-limited setting. Our +experiments show that this method maintains the high standards of accuracy set +by the original 32-bit training while providing a much-needed boost in training +speed. This approach is especially pertinent in today's context, where most +models are initially trained in 32-bit and require periodic updates and +refinements. The findings from our research suggest that this strategy of +16-bit continuation training can be a key solution for sustainable and +efficient deep learning, offering a practical way to enhance pre-trained models +rapidly and in a resource-conscious manner. + +
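+ A hedged sketch of the recipe described above: take a model whose weights were trained in 32-bit, cast it to 16-bit, and continue training at the lower precision. The backbone, data, and optimizer below are placeholders (the paper does not prescribe them), and a CUDA device is assumed since fp16 training is impractical on CPU.

import torch
import torch.nn.functional as F
import torchvision

model = torchvision.models.resnet18(weights="IMAGENET1K_V1")   # stand-in for any 32-bit pre-trained model
model = model.half().cuda()                                    # continue training in 16-bit
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3, momentum=0.9)

images = torch.randn(8, 3, 224, 224, device="cuda").half()     # dummy fine-tuning batch
labels = torch.randint(0, 1000, (8,), device="cuda")
loss = F.cross_entropy(model(images), labels)
optimizer.zero_grad()
loss.backward()
optimizer.step()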
+
+ comment: Incomplete paper +
+
+
+
+
+ + ♻ ☆ A Policy Gradient Method for Confounded POMDPs + + +
+ In this paper, we propose a policy gradient method for confounded partially +observable Markov decision processes (POMDPs) with continuous state and +observation spaces in the offline setting. We first establish a novel +identification result to non-parametrically estimate any history-dependent +policy gradient under POMDPs using the offline data. The identification enables +us to solve a sequence of conditional moment restrictions and adopt the min-max +learning procedure with general function approximation for estimating the +policy gradient. We then provide a finite-sample non-asymptotic bound for +estimating the gradient uniformly over a pre-specified policy class in terms of +the sample size, length of horizon, concentratability coefficient and the +measure of ill-posedness in solving the conditional moment restrictions. +Lastly, by deploying the proposed gradient estimation in the gradient ascent +algorithm, we show the global convergence of the proposed algorithm in finding +the history-dependent optimal policy under some technical conditions. To the +best of our knowledge, this is the first work studying the policy gradient +method for POMDPs under the offline setting. + +
+
+ comment: 95 pages, 3 figures +
+
+
+
+
+ + ♻ ☆ RLAIF: Scaling Reinforcement Learning from Human Feedback with AI + Feedback + + +
+ Reinforcement learning from human feedback (RLHF) has proven effective in +aligning large language models (LLMs) with human preferences. However, +gathering high-quality human preference labels can be a time-consuming and +expensive endeavor. RL from AI Feedback (RLAIF), introduced by Bai et al., +offers a promising alternative that leverages a powerful off-the-shelf LLM to +generate preferences in lieu of human annotators. Across the tasks of +summarization, helpful dialogue generation, and harmless dialogue generation, +RLAIF achieves comparable or superior performance to RLHF, as rated by human +evaluators. Furthermore, RLAIF demonstrates the ability to outperform a +supervised fine-tuned baseline even when the LLM preference labeler is the same +size as the policy. In another experiment, directly prompting the LLM for +reward scores achieves superior performance to the canonical RLAIF setup, where +LLM preference labels are first distilled into a reward model. Finally, we +conduct extensive studies on techniques for generating aligned AI preferences. +Our results suggest that RLAIF can achieve human-level performance, offering a +potential solution to the scalability limitations of RLHF. + +
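+ The core AI-feedback labeling step described above can be sketched in a few lines; `llm` below is a hypothetical stand-in for whatever off-the-shelf model is queried, and the prompt wording is illustrative rather than taken from the paper.

def ai_preference(document: str, summary_a: str, summary_b: str, llm) -> int:
    """Return 0 if the LLM prefers summary A, else 1; usable as an RLHF-style preference label."""
    prompt = (
        "Given the following text, which summary is better? Answer with 'A' or 'B' only.\n\n"
        f"Text:\n{document}\n\nSummary A:\n{summary_a}\n\nSummary B:\n{summary_b}\n"
    )
    answer = llm(prompt).strip().upper()
    return 0 if answer.startswith("A") else 1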
+
+ comment: Added two more tasks and many more experiments and analyses (e.g. + same-size RLAIF, direct RLAIF, cost analysis) +
+
+
+
+
+ + ♻ ☆ Enhancing the Performance of Neural Networks Through Causal Discovery + and Integration of Domain Knowledge + + +
+ In this paper, we develop a generic methodology to encode the hierarchical +causality structure among observed variables into a neural network in order to +improve its predictive performance. The proposed methodology, called the +causality-informed neural network (CINN), leverages three coherent steps to +systematically map structural causal knowledge into the layer-to-layer design +of the neural network while strictly preserving the orientation of every causal +relationship. In the first step, CINN discovers causal relationships from +observational data via directed acyclic graph (DAG) learning, where causal +discovery is recast as a continuous optimization problem to avoid its +combinatorial nature. In the second step, the discovered hierarchical causality +structure among observed variables is systematically encoded into the neural +network through a dedicated architecture and a customized loss function. By +categorizing variables in the causal DAG as root, intermediate, and leaf nodes, +the hierarchical causal DAG is translated into CINN with a one-to-one +correspondence between nodes in the causal DAG and units in the CINN, while +maintaining the relative order among these nodes. Regarding the loss function, +both intermediate and leaf nodes in the DAG are treated as target outputs +during CINN training so as to drive co-learning of causal relationships among +the different types of nodes. As multiple loss components emerge in CINN, we +leverage the projection of conflicting gradients to mitigate gradient +interference among the multiple learning tasks. Computational experiments +across a broad spectrum of UCI data sets demonstrate substantial advantages of +CINN in predictive performance over other state-of-the-art methods. In +addition, an ablation study underscores the value of integrating structural and +quantitative causal knowledge in incrementally enhancing the neural network's +predictive performance. + 
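+ The gradient-projection step mentioned above can be illustrated with a generic PCGrad-style sketch: when two loss components produce conflicting gradients (negative inner product), one gradient is projected onto the normal plane of the other before they are summed. This is a minimal illustration, not the paper's exact procedure.

import torch

def project_conflicting(g1: torch.Tensor, g2: torch.Tensor) -> torch.Tensor:
    """Combine two flattened gradients, removing the conflicting component of g1 along g2."""
    dot = torch.dot(g1, g2)
    if dot < 0:
        g1 = g1 - (dot / g2.norm() ** 2) * g2
    return g1 + g2

# Example with random stand-in gradients for the shared parameters:
g_total = project_conflicting(torch.randn(100), torch.randn(100))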
+
+
+
+
+ + ♻ ☆ Large Language Models of Code Fail at Completing Code with Potential + Bugs NeurIPS 2023 + + +
+ Large language models of code (Code-LLMs) have recently brought tremendous +advances to code completion, a fundamental feature of programming assistance +and code intelligence. However, most existing works ignore the possible +presence of bugs in the code context for generation, which are inevitable in +software development. Therefore, we introduce and study the buggy-code +completion problem, inspired by the realistic scenario of real-time code +suggestion where the code context contains potential bugs -- anti-patterns that +can become bugs in the completed program. To systematically study the task, we +introduce two datasets: one with synthetic bugs derived from semantics-altering +operator changes (buggy-HumanEval) and one with realistic bugs derived from +user submissions to coding problems (buggy-FixEval). We find that the presence +of potential bugs significantly degrades the generation performance of the +high-performing Code-LLMs. For instance, the passing rates of CODEGEN-2B-MONO +on test cases of buggy-HumanEval drop more than 50% given a single potential +bug in the context. Finally, we investigate several post-hoc methods for +mitigating the adverse effect of potential bugs and find that there remains a +significant gap in post-mitigation performance. + +
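+ A toy illustration of the kind of "potential bug" studied above: a single semantics-altering operator change in the code context, after which a completion model is asked to finish the function. The example function is made up, not drawn from buggy-HumanEval or buggy-FixEval.

correct_context = "def is_adult(age):\n    if age >= 18:"
buggy_context = correct_context.replace(">=", ">")   # operator flip: a potential off-by-one bug
# Both contexts would be fed to a Code-LLM; completions of `buggy_context`
# typically still implement the intended behavior, so the completed program
# passes fewer test cases.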
+
+ comment: 27 pages, accepted to NeurIPS 2023 +
+
+
+
+
+ + ♻ ☆ A rule-general abductive learning by rough sets + + +
+ Real-world tasks usually involve a large amount of unlabeled data alongside a +smaller amount of labeled data; learning from both is known as semi-supervised +learning. Experts can label unlabeled data with logical rules, but doing so is +costly. Combining perception with reasoning works well for such semi-supervised +tasks when domain knowledge is available. However, acquiring domain knowledge +and correcting, reducing, and generating rules remain complex open problems. +Rough set theory is an important method for knowledge processing in information +systems. In this paper, we propose rule-general abductive learning by rough +sets (RS-ABL). By transforming the target concept and the sub-concepts of rules +into information tables, rough set theory is used to acquire domain knowledge +and to correct, reduce, and generate rules at a lower cost. The framework can +also generate more extensive negative rules to enhance the breadth of the +knowledge base. Compared with traditional semi-supervised learning methods, +RS-ABL achieves higher accuracy on semi-supervised tasks. + 
+
+
+
+
+ + ♻ ☆ Fast Deep Mixtures of Gaussian Process Experts + + +
+ Mixtures of experts have become an indispensable tool for flexible modelling +in a supervised learning context, allowing not only the mean function but the +entire density of the output to change with the inputs. Sparse Gaussian +processes (GP) have shown promise as a leading candidate for the experts in +such models, and in this article, we propose to design the gating network for +selecting the experts from such mixtures of sparse GPs using a deep neural +network (DNN). Furthermore, a fast one pass algorithm called +Cluster-Classify-Regress (CCR) is leveraged to approximate the maximum a +posteriori (MAP) estimator extremely quickly. This powerful combination of +model and algorithm together delivers a novel method which is flexible, robust, +and extremely efficient. In particular, the method is able to outperform +competing methods in terms of accuracy and uncertainty quantification. The cost +is competitive on low-dimensional and small data sets, but is significantly +lower for higher-dimensional and big data sets. Iteratively maximizing the +distribution of experts given allocations and allocations given experts does +not provide significant improvement, which indicates that the algorithm +achieves a good approximation to the local MAP estimator very fast. This +insight can be useful also in the context of other mixture of experts models. + +
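+ The one-pass Cluster-Classify-Regress idea can be sketched with simple scikit-learn stand-ins; the DNN gating network and sparse GP experts from the paper are replaced here by logistic regression and linear regressors purely for illustration.

import numpy as np
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression, LogisticRegression

def ccr_fit(X, y, k=3):
    labels = KMeans(n_clusters=k, n_init=10).fit_predict(np.column_stack([X, y]))  # Cluster
    gate = LogisticRegression(max_iter=1000).fit(X, labels)                        # Classify
    experts = [LinearRegression().fit(X[labels == j], y[labels == j]) for j in range(k)]  # Regress
    return gate, experts

def ccr_predict(gate, experts, X):
    z = gate.predict(X)                                                            # expert allocation
    return np.array([experts[j].predict(x.reshape(1, -1))[0] for j, x in zip(z, X)])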
+
+ comment: 22 pages, 28 figures, to be published in Machine Learning journal +
+
+
+
+
+ + ♻ ☆ MuseChat: A Conversational Music Recommendation System for Videos + + +
+ Music recommendation for videos attracts growing interest in multi-modal +research. However, existing systems focus primarily on content compatibility +and often ignore users' preferences. Their inability to interact with users for +further refinement or to provide explanations leads to a less satisfying +experience. We address these issues with MuseChat, a first-of-its-kind +dialogue-based recommendation system that personalizes music suggestions for +videos. Our system consists of two key functionalities with associated modules: +recommendation and reasoning. The recommendation module takes a video, together +with optional information such as previously suggested music and the user's +preferences, and retrieves a music track matching the context. The reasoning +module, built on a large language model (Vicuna-7B) and extended to multi-modal +inputs, provides a reasonable explanation for the recommended music. To +evaluate the effectiveness of MuseChat, we build a large-scale dataset, +conversational music recommendation for videos, that simulates a two-turn +interaction between a user and a recommender based on accurate music track +information. Experimental results show that MuseChat achieves significant +improvements over existing video-based music retrieval methods while offering +strong interpretability and interactivity. + 
+
+
+
+
+
+
+
+ + Multimedia 5 + +
+
+
+ + ☆ Large-scale Vision-Language Models Learn Super Images for Efficient and + High-Performance Partially Relevant Video Retrieval + + +
+ In this paper, we propose an efficient and high-performance method for +partially relevant video retrieval (PRVR), which aims to retrieve untrimmed +long videos that contain at least one moment relevant to the input text query. +In terms of both efficiency and performance, the overlooked bottleneck of +previous studies is the visual encoding of dense frames. This has led +researchers to choose lightweight visual backbones, yielding sub-optimal +retrieval performance due to the limited capacity of the learned visual +representations. However, simply replacing them with high-performance +large-scale vision-and-language models (VLMs) is undesirable due to their low +efficiency. To address these issues, instead of dense frames, we focus on super +images, which are created by rearranging the video frames in an $N \times N$ +grid layout. This reduces the number of visual encodings to $\frac{1}{N^2}$ and +compensates for the low efficiency of large-scale VLMs, allowing us to adopt +them as powerful encoders. Surprisingly, we discover that with a simple +query-image attention trick, VLMs generalize well to super images and +demonstrate promising zero-shot performance against SOTA methods while +remaining efficient. In addition, we propose a fine-tuning approach that +incorporates a few trainable modules into the VLM backbones. The experimental +results demonstrate that our approaches efficiently achieve the best +performance on ActivityNet Captions and TVR. + 
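+ The super-image construction itself is simple to sketch; padding an incomplete grid with black frames below is an assumption about how fewer than N*N frames are handled, and the frame-sampling and encoding steps are omitted.

import numpy as np

def make_super_image(frames: np.ndarray, n: int) -> np.ndarray:
    """frames: (T, H, W, 3) sampled video frames -> one (n*H, n*W, 3) super image."""
    t, h, w, c = frames.shape
    pad = n * n - t
    if pad > 0:
        frames = np.concatenate([frames, np.zeros((pad, h, w, c), frames.dtype)], axis=0)
    rows = [np.concatenate(list(frames[i * n:(i + 1) * n]), axis=1) for i in range(n)]
    return np.concatenate(rows, axis=0)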
+
+ comment: 13 pages, 11 figures +
+
+
+
+
+ + ☆ VIoTGPT: Learning to Schedule Vision Tools towards Intelligent Video + Internet of Things + + +
+ The Video Internet of Things (VIoT) has shown great potential in collecting an +unprecedented volume of video data. Learning to schedule perception models and +to analyze the collected videos intelligently will be a key enabler for VIoT. +In this paper, to address the challenges posed by the fine-grained and +interrelated vision-tool usage in VIoT, we build VIoTGPT, an LLM-based framework +that interacts with humans, queries knowledge videos, and invokes vision models +to accomplish complicated tasks. To support VIoTGPT and related future work, we +meticulously crafted a training dataset and established benchmarks involving 11 +representative vision models across three categories, based on semi-automatic +annotations. To guide the LLM to act as an intelligent agent for VIoT, we use +ReAct instruction tuning on the collected VIoT dataset to learn tool-usage +capability. Quantitative and qualitative experimental results and analyses +demonstrate the effectiveness of VIoTGPT. + 
+
+
+
+
+ + ☆ RTQ: Rethinking Video-language Understanding Based on Image-text Model ACM MM 2023 + + +
+ Recent advancements in video-language understanding have been established on +the foundation of image-text models, resulting in promising outcomes due to the +shared knowledge between images and videos. However, video-language +understanding presents unique challenges due to the inclusion of highly complex +semantic details, which result in information redundancy, temporal dependency, +and scene complexity. Current techniques have only partially tackled these +issues, and our quantitative analysis indicates that some of these methods are +complementary. In light of this, we propose a novel framework called RTQ +(Refine, Temporal model, and Query), which addresses these challenges +simultaneously. The approach involves refining redundant information within +frames, modeling temporal relations among frames, and querying task-specific +information from the videos. Remarkably, our model demonstrates outstanding +performance even in the absence of video-language pre-training, and the results +are comparable with or superior to those achieved by state-of-the-art +pre-training methods. + +
+
+ comment: Accepted by ACM MM 2023 as an oral presentation +
+
+
+
+
+ + ☆ Video Summarization: Towards Entity-Aware Captions + + +
+ Existing popular video captioning benchmarks and models deal with generic +captions devoid of specific person, place, or organization named entities. In +contrast, news videos present a challenging setting where captions require such +named entities for meaningful summarization. We therefore propose the task of +summarizing news videos directly into entity-aware captions. We also release a +large-scale dataset, VIEWS (VIdeo NEWS), to support research on this task. +Further, we propose a method that augments visual information from videos with +context retrieved from external world knowledge to generate entity-aware +captions. We demonstrate the effectiveness of our approach on three video +captioning models, and we show that it generalizes to an existing news image +captioning dataset. With these extensive experiments and insights, we believe +we establish a solid basis for future research on this challenging task. + 
+
+
+
+
+ + ♻ ☆ Rethinking Event-based Human Pose Estimation with 3D Event + Representations + + +
+ Human pose estimation is a fundamental and appealing task in computer vision. +Traditional frame-based cameras and videos are commonly applied, yet they +become less reliable in scenarios with high dynamic range or heavy motion blur. +In contrast, event cameras offer a robust solution for navigating these +challenging contexts. Predominant methodologies incorporate event cameras into +learning frameworks by accumulating events into event frames. However, such +methods tend to marginalize the intrinsic asynchronicity and high temporal +resolution of events. This disregard leads to a loss of essential temporal +information, crucial for discerning distinct actions. To address this issue and +to unlock the 3D potential of event information, we introduce two 3D event +representations: the Rasterized Event Point Cloud (RasEPC) and the Decoupled +Event Voxel (DEV). The RasEPC collates events within concise temporal slices at +identical positions, preserving 3D attributes with statistical cues and +markedly mitigating memory and computational demands. Meanwhile, the DEV +representation discretizes events into voxels and projects them across three +orthogonal planes, utilizing decoupled event attention to retrieve 3D cues from +the 2D planes. Furthermore, we develop and release EV-3DPW, a synthetic +event-based dataset crafted to facilitate training and quantitative analysis in +outdoor scenes. On the public real-world DHP19 dataset, our event point cloud +technique excels in real-time mobile predictions, while the decoupled event +voxel method achieves the highest accuracy. Experiments on EV-3DPW demonstrate +the robustness of our proposed 3D representation methods compared to +traditional RGB images and event-frame techniques under the same backbones. Our +code and dataset have been made publicly available at +https://github.com/MasterHow/EventPointPose. + 
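+ To make the Decoupled Event Voxel idea more concrete, the sketch below bins (x, y, t) events into a voxel grid and sums along each axis to obtain the three orthogonal projections; the grid sizes are illustrative and the decoupled event attention used for fusing the planes is omitted.

import numpy as np

def dev_projections(events: np.ndarray, H: int = 260, W: int = 346, T: int = 16):
    """events: (N, 3) array of (x, y, t) with t normalized to [0, 1)."""
    vox = np.zeros((T, H, W), dtype=np.float32)
    x = events[:, 0].astype(int)
    y = events[:, 1].astype(int)
    t = np.clip((events[:, 2] * T).astype(int), 0, T - 1)
    np.add.at(vox, (t, y, x), 1.0)         # accumulate event counts per voxel
    return vox.sum(axis=0), vox.sum(axis=1), vox.sum(axis=2)   # x-y, x-t, y-t planes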
+
+ comment: Extended version of arXiv:2206.04511. The code and dataset are + available at https://github.com/MasterHow/EventPointPose +
+
+
+
+
+
+
+ + + +
+
+ +
+
+ 
+
diff --git a/index.js b/index.js
new file mode 100644
index 00000000..69f5da7b
--- /dev/null
+++ b/index.js
@@ -0,0 +1,39 @@
+/* Expand/Collapse all paper entries with the TAB key */
+var expanded = false;
+document.onkeydown = function (e) {
+    if (e.keyCode === 9) {
+        expanded = !expanded;
+        document.querySelectorAll("details").forEach(detail => detail.open = expanded);
+        return false;
+    }
+};
+
+/* Switch Theme */
+const toggleSwitch = document.querySelector('.theme-switch input[type="checkbox"]');
+
+function switchTheme(e) {
+    if (e.target.checked) {
+        document.documentElement.setAttribute('data-theme', 'light');
+        document.getElementById("theme-icon").className = "ri-sun-line";
+        localStorage.setItem('theme', 'light'); // persist the choice across visits
+    } else {
+        document.documentElement.setAttribute('data-theme', 'dark');
+        document.getElementById("theme-icon").className = "ri-moon-line";
+        localStorage.setItem('theme', 'dark'); // persist the choice across visits
+    }
+}
+
+toggleSwitch.addEventListener('change', switchTheme, false);
+const currentTheme = localStorage.getItem('theme') ? localStorage.getItem('theme') : null;
+if (currentTheme) {
+    document.documentElement.setAttribute('data-theme', currentTheme);
+    if (currentTheme === 'light') {
+        toggleSwitch.checked = true;
+    }
+}
+
+const timestamp = document.getElementById("build-timestamp");
+const timestamp_local = new Date(timestamp.getAttribute("datetime")).toLocaleString();
+
+const badge = document.getElementById("build-timestamp-badge");
+// badge.src = `https://img.shields.io/github/workflow/status/mlnlp-world/myarxiv/Update?=${timestamp_local}&style=for-the-badge`